diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml index 0f50eb47607cd4..805413d79aae2c 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -7,6 +7,7 @@ body: - type: checkboxes id: checks attributes: + label: Pandas version checks options: - label: > I have checked that this issue has not already been reported. diff --git a/.github/ISSUE_TEMPLATE/documentation_improvement.yaml b/.github/ISSUE_TEMPLATE/documentation_improvement.yaml index 97897c3224f91c..8486d6e3eebdcb 100644 --- a/.github/ISSUE_TEMPLATE/documentation_improvement.yaml +++ b/.github/ISSUE_TEMPLATE/documentation_improvement.yaml @@ -6,6 +6,7 @@ labels: [Docs, Needs Triage] body: - type: checkboxes attributes: + label: Pandas version checks options: - label: > I have checked that the issue still exists on the latest versions of the docs diff --git a/.github/ISSUE_TEMPLATE/installation_issue.yaml b/.github/ISSUE_TEMPLATE/installation_issue.yaml index d5db0d1c83a41e..a80269ff0f12d6 100644 --- a/.github/ISSUE_TEMPLATE/installation_issue.yaml +++ b/.github/ISSUE_TEMPLATE/installation_issue.yaml @@ -7,6 +7,7 @@ body: - type: checkboxes id: checks attributes: + label: Installation check options: - label: > I have read the [installation guide](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-pandas). diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yaml b/.github/ISSUE_TEMPLATE/performance_issue.yaml index 2dcfc94f4a604f..9cde5b6dca385f 100644 --- a/.github/ISSUE_TEMPLATE/performance_issue.yaml +++ b/.github/ISSUE_TEMPLATE/performance_issue.yaml @@ -7,6 +7,7 @@ body: - type: checkboxes id: checks attributes: + label: Pandas version checks options: - label: > I have checked that this issue has not already been reported. diff --git a/.github/ISSUE_TEMPLATE/submit_question.yml b/.github/ISSUE_TEMPLATE/submit_question.yml index b227c9970f29ed..6f73041b0f527c 100644 --- a/.github/ISSUE_TEMPLATE/submit_question.yml +++ b/.github/ISSUE_TEMPLATE/submit_question.yml @@ -11,6 +11,7 @@ body: usage questions, we ask that all usage questions are first asked on StackOverflow. 
- type: checkboxes attributes: + label: Research options: - label: > I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e0e898600ba954..0bb82ed5d6816e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -78,6 +78,40 @@ jobs: run: pytest scripts if: always() + benchmarks: + name: Benchmarks + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-benchmarks + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Cache conda + uses: actions/cache@v2 + with: + path: ~/conda_pkgs_dir + key: ${{ runner.os }}-conda-${{ hashFiles('${{ env.ENV_FILE }}') }} + + - uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: pandas-dev + channel-priority: strict + environment-file: ${{ env.ENV_FILE }} + use-only-tar-bz2: true + + - name: Build Pandas + uses: ./.github/actions/build_pandas + - name: Running benchmarks run: | cd asv_bench diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index 19b96c866923fb..9839eacc046f9d 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -31,12 +31,12 @@ jobs: [actions-38-slow.yaml, "slow", "", "", "", "", ""], [actions-38-locale.yaml, "not slow and not network", "language-pack-zh-hans xsel", "zh_CN.utf8", "zh_CN.utf8", "", ""], [actions-39-slow.yaml, "slow", "", "", "", "", ""], + [actions-pypy-38.yaml, "not slow and not clipboard", "", "", "", "", "--max-worker-restart 0"], [actions-39-numpydev.yaml, "not slow and not network", "xsel", "", "", "deprecate", "-W error"], [actions-39.yaml, "not slow and not clipboard", "", "", "", "", ""] ] fail-fast: false env: - COVERAGE: true ENV_FILE: ci/deps/${{ matrix.settings[0] }} PATTERN: ${{ matrix.settings[1] }} EXTRA_APT: ${{ matrix.settings[2] }} @@ -45,6 +45,9 @@ jobs: PANDAS_TESTING_MODE: ${{ matrix.settings[5] }} TEST_ARGS: ${{ matrix.settings[6] }} PYTEST_TARGET: pandas + IS_PYPY: ${{ contains(matrix.settings[0], 'pypy') }} + # TODO: re-enable coverage on pypy, its slow + COVERAGE: ${{ !contains(matrix.settings[0], 'pypy') }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.settings[0] }} @@ -82,12 +85,29 @@ jobs: channel-priority: flexible environment-file: ${{ env.ENV_FILE }} use-only-tar-bz2: true + if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support + + - name: Setup PyPy + uses: actions/setup-python@v2.3.1 + with: + python-version: "pypy-3.8" + if: ${{ env.IS_PYPY == 'true' }} + + - name: Setup PyPy dependencies + shell: bash + run: | + # TODO: re-enable cov, its slowing the tests down though + # TODO: Unpin Cython, the new Cython 0.29.26 is causing compilation errors + pip install Cython==0.29.25 numpy python-dateutil pytz pytest>=6.0 pytest-xdist>=1.31.0 hypothesis>=5.5.3 + if: ${{ env.IS_PYPY == 'true' }} - name: Build Pandas uses: ./.github/actions/build_pandas - name: Test run: ci/run_tests.sh + # TODO: Don't continue on error for PyPy + continue-on-error: ${{ env.IS_PYPY == 'true' }} if: always() - name: Build Version diff --git a/.gitignore b/.gitignore index 2c337be60e94ec..87224f1d6060f4 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,8 @@ dist *.egg-info .eggs 
.pypirc +# type checkers +pandas/py.typed # tox testing tool .tox diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index bfb1be87054957..edd1132116f76d 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -144,7 +144,7 @@ def setup(self, op, shape): # should already be the case, but just to be sure df._consolidate_inplace() - # TODO: GH#33198 the setting here shoudlnt need two steps + # TODO: GH#33198 the setting here shouldn't need two steps arr1 = np.random.randn(n_rows, max(n_cols // 4, 3)).astype("f8") arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8") arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8") diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 39cc09d32981e6..0b443b29116a26 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -55,6 +55,26 @@ def time_frame(self, kind): self.df.to_csv(self.fname) +class ToCSVMultiIndexUnusedLevels(BaseIO): + + fname = "__test__.csv" + + def setup(self): + df = DataFrame({"a": np.random.randn(100_000), "b": 1, "c": 1}) + self.df = df.set_index(["a", "b"]) + self.df_unused_levels = self.df.iloc[:10_000] + self.df_single_index = df.set_index(["a"]).iloc[:10_000] + + def time_full_frame(self): + self.df.to_csv(self.fname) + + def time_sliced_frame(self): + self.df_unused_levels.to_csv(self.fname) + + def time_single_index_frame(self): + self.df_single_index.to_csv(self.fname) + + class ToCSVDatetime(BaseIO): fname = "__test__.csv" @@ -67,6 +87,21 @@ def time_frame_date_formatting(self): self.data.to_csv(self.fname, date_format="%Y%m%d") +class ToCSVDatetimeIndex(BaseIO): + + fname = "__test__.csv" + + def setup(self): + rng = date_range("2000", periods=100_000, freq="S") + self.data = DataFrame({"a": 1}, index=rng) + + def time_frame_date_formatting_index(self): + self.data.to_csv(self.fname, date_format="%Y-%m-%d %H:%M:%S") + + def time_frame_date_no_format_index(self): + self.data.to_csv(self.fname) + + class ToCSVDatetimeBig(BaseIO): fname = "__test__.csv" diff --git a/ci/deps/actions-38-db.yaml b/ci/deps/actions-38-db.yaml index 1a4e5d12f70dfb..c08c642049b410 100644 --- a/ci/deps/actions-38-db.yaml +++ b/ci/deps/actions-38-db.yaml @@ -12,7 +12,7 @@ dependencies: - pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 # pandas dependencies - - aiobotocore<2.0.0 + - aiobotocore<2.0.0 # GH#44311 pinned to fix docbuild - beautifulsoup4 - boto3 - botocore>=1.11 diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-38.yaml new file mode 100644 index 00000000000000..ad05d2ab2daccb --- /dev/null +++ b/ci/deps/actions-pypy-38.yaml @@ -0,0 +1,20 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + # TODO: Add the rest of the dependencies in here + # once the other plentiful failures/segfaults + # with base pandas has been dealt with + - python=3.8[build=*_pypy] # TODO: use this once pypy3.8 is available + + # tools + - cython>=0.29.24 + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.31 + - hypothesis>=5.5.3 + + # required + - numpy + - python-dateutil + - pytz diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 9fea696b6ea810..203f8fe293a066 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -5,12 +5,17 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') +# May help reproduce flaky CI builds if set in subsequent runs +echo PYTHONHASHSEED=$PYTHONHASHSEED + if [[ "not 
network" == *"$PATTERN"* ]]; then export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; fi -if [ "$COVERAGE" ]; then +if [[ "$COVERAGE" == "true" ]]; then COVERAGE="-s --cov=pandas --cov-report=xml --cov-append" +else + COVERAGE="" # We need to reset this for COVERAGE="false" case fi # If no X server is found, we use xvfb to emulate it diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 4cea030546635e..41fe88e02318a0 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -303,7 +303,7 @@ pandas strongly encourages the use of :pep:`484` style type hints. New developme Style guidelines ~~~~~~~~~~~~~~~~ -Types imports should follow the ``from typing import ...`` convention. So rather than +Type imports should follow the ``from typing import ...`` convention. Some types do not need to be imported since :pep:`585` some builtin constructs, such as ``list`` and ``tuple``, can directly be used for type annotations. So rather than .. code-block:: python @@ -315,21 +315,31 @@ You should write .. code-block:: python - from typing import List, Optional, Union + primes: list[int] = [] - primes: List[int] = [] +``Optional`` should be avoided in favor of the shorter ``| None``, so instead of -``Optional`` should be used where applicable, so instead of +.. code-block:: python + + from typing import Union + + maybe_primes: list[Union[int, None]] = [] + +or .. code-block:: python - maybe_primes: List[Union[int, None]] = [] + from typing import Optional + + maybe_primes: list[Optional[int]] = [] You should write .. code-block:: python - maybe_primes: List[Optional[int]] = [] + from __future__ import annotations # noqa: F404 + + maybe_primes: list[int | None] = [] In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like @@ -410,6 +420,26 @@ A recent version of ``numpy`` (>=1.21.0) is required for type validation. .. _contributing.ci: +Testing type hints in code using pandas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + * Pandas is not yet a py.typed library (:pep:`561`)! + The primary purpose of locally declaring pandas as a py.typed library is to test and + improve the pandas-builtin type annotations. + +Until pandas becomes a py.typed library, it is possible to easily experiment with the type +annotations shipped with pandas by creating an empty file named "py.typed" in the pandas +installation folder: + +.. code-block:: none + + python -c "import pandas; import pathlib; (pathlib.Path(pandas.__path__[0]) / 'py.typed').touch()" + +The existence of the py.typed file signals to type checkers that pandas is already a py.typed +library. This makes type checkers aware of the type annotations shipped with pandas. 
+ Testing with continuous integration ----------------------------------- diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index d701208792a4c3..6de237b70f08d1 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -180,7 +180,7 @@ As an example of fully-formed metadata: 'numpy_type': 'int64', 'metadata': None} ], - 'pandas_version': '0.20.0', + 'pandas_version': '1.4.0', 'creator': { 'library': 'pyarrow', 'version': '0.13.0' diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index ccf130d03418c3..2bb0659264eb0a 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -122,6 +122,7 @@ application to columns of a specific data type. DataFrameGroupBy.skew DataFrameGroupBy.take DataFrameGroupBy.tshift + DataFrameGroupBy.value_counts The following methods are available only for ``SeriesGroupBy`` objects. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cd7105d1259470..403599297a4927 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1903,6 +1903,7 @@ with optional parameters: ``index``; dict like {index -> {column -> value}} ``columns``; dict like {column -> {index -> value}} ``values``; just the values array + ``table``; adhering to the JSON `Table Schema`_ * ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. @@ -2477,7 +2478,6 @@ A few notes on the generated table schema: * For ``MultiIndex``, ``mi.names`` is used. If any level has no name, then ``level_<i>`` is used. - ``read_json`` also accepts ``orient='table'`` as an argument. This allows for the preservation of metadata such as dtypes and index names in a round-trippable manner. @@ -2519,8 +2519,18 @@ indicate missing values and the subsequent read cannot distinguish the intent. os.remove("test.json") +When using ``orient='table'`` along with user-defined ``ExtensionArray``, +the generated schema will contain an additional ``extDtype`` key in the respective +``fields`` element. This extra key is not standard but does enable JSON roundtrips +for extension types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``). + +The ``extDtype`` key carries the name of the extension. If you have properly registered +the ``ExtensionDtype``, pandas will use that name to perform a lookup into the registry +and re-convert the serialized data into your custom dtype. + .. _Table Schema: https://specs.frictionlessdata.io/table-schema/ + HTML ---- diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index fde9ff0450a129..3fd6fe67772bcf 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2424,7 +2424,7 @@ you can use the ``tz_convert`` method. For ``pytz`` time zones, it is incorrect to pass a time zone object directly into the ``datetime.datetime`` constructor - (e.g., ``datetime.datetime(2011, 1, 1, tz=pytz.timezone('US/Eastern'))``. + (e.g., ``datetime.datetime(2011, 1, 1, tzinfo=pytz.timezone('US/Eastern'))``). Instead, the datetime needs to be localized using the ``localize`` method on the ``pytz`` time zone object.
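To make the ``pytz`` note above concrete, here is a minimal sketch of the two patterns it contrasts (plain ``datetime``/``pytz`` usage, shown only for illustration):

.. code-block:: python

    import datetime

    import pytz

    tz = pytz.timezone("US/Eastern")

    # Incorrect: passing the pytz zone via tzinfo= attaches the zone's default
    # (local mean time) offset rather than the proper US/Eastern offset
    wrong = datetime.datetime(2011, 1, 1, tzinfo=tz)

    # Correct: localize a naive datetime with the pytz time zone object
    right = tz.localize(datetime.datetime(2011, 1, 1))

    # the two utcoffsets differ, which is why the constructor form is incorrect
    print(wrong.utcoffset(), right.utcoffset())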
diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst index 048cd978c44785..339bd7debf9456 100644 --- a/doc/source/whatsnew/v1.3.5.rst +++ b/doc/source/whatsnew/v1.3.5.rst @@ -1,6 +1,6 @@ .. _whatsnew_135: -What's new in 1.3.5 (November ??, 2021) +What's new in 1.3.5 (December 12, 2021) --------------------------------------- These are the changes in pandas 1.3.5. See :ref:`release` for a full changelog @@ -16,29 +16,13 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`Series.equals` when comparing floats with dtype object to None (:issue:`44190`) - Fixed regression in :func:`merge_asof` raising error when array was supplied as join key (:issue:`42844`) +- Fixed regression when resampling :class:`DataFrame` with :class:`DatetimeIndex` with empty groups and ``uint8``, ``uint16`` or ``uint32`` columns incorrectly raising ``RuntimeError`` (:issue:`43329`) - Fixed regression in creating a :class:`DataFrame` from a timezone-aware :class:`Timestamp` scalar near a Daylight Savings Time transition (:issue:`42505`) - Fixed performance regression in :func:`read_csv` (:issue:`44106`) - Fixed regression in :meth:`Series.duplicated` and :meth:`Series.drop_duplicates` when Series has :class:`Categorical` dtype with boolean categories (:issue:`44351`) - Fixed regression in :meth:`.GroupBy.sum` with ``timedelta64[ns]`` dtype containing ``NaT`` failing to treat that value as NA (:issue:`42659`) -- +- Fixed regression in :meth:`.RollingGroupby.cov` and :meth:`.RollingGroupby.corr` incorrectly returning superfluous groups in the result when ``other`` had the same shape as each group (:issue:`42915`) -.. --------------------------------------------------------------------------- - -.. _whatsnew_135.bug_fixes: - -Bug fixes -~~~~~~~~~ -- -- - -.. --------------------------------------------------------------------------- - -.. _whatsnew_135.other: - -Other -~~~~~ -- -- .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 579d9b153d800e..9001c1ea1e7d1c 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -217,9 +217,10 @@ Other enhancements - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression.
Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) +- Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`) - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`) - :meth:`read_excel` now accepts a ``decimal`` argument that allows the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) -- :meth:`.GroupBy.mean` now supports `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`) +- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now support `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`) - :meth:`Timestamp.isoformat` now handles the ``timespec`` argument from the base :class:`datetime` class (:issue:`26131`) - :meth:`NaT.to_numpy` ``dtype`` argument is now respected, so ``np.timedelta64`` can be returned (:issue:`44460`) - New option ``display.max_dir_items`` customizes the number of columns added to :meth:`Dataframe.__dir__` and suggested for tab completion (:issue:`37996`) @@ -229,6 +230,10 @@ Other enhancements - :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) - Implemented :meth:`IntervalArray.min`, :meth:`IntervalArray.max`, as a result of which ``min`` and ``max`` now work for :class:`IntervalIndex`, :class:`Series` and :class:`DataFrame` with ``IntervalDtype`` (:issue:`44746`) - :meth:`UInt64Index.map` now retains ``dtype`` where possible (:issue:`44609`) +- :meth:`read_json` can now parse unsigned long long integers (:issue:`26068`) +- :meth:`DataFrame.take` now raises a ``TypeError`` when passed a scalar for the indexer (:issue:`42875`) +- :meth:`is_list_like` now identifies duck-arrays as list-like unless ``.ndim == 0`` (:issue:`35131`) +- :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`).
- @@ -452,6 +457,7 @@ Other API changes - :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`) - Change in the position of the ``min_rows`` argument in :meth:`DataFrame.to_string` due to change in the docstring (:issue:`44304`) - Reduction operations for :class:`DataFrame` or :class:`Series` now raising a ``ValueError`` when ``None`` is passed for ``skipna`` (:issue:`44178`) +- :func:`read_csv` and :func:`read_html` no longer raising an error when one of the header rows consists only of ``Unnamed:`` columns (:issue:`13054`) - Changed the ``name`` attribute of several holidays in ``USFederalHolidayCalendar`` to match `official federal holiday names `_ @@ -527,7 +533,7 @@ Other Deprecations - Deprecated silent dropping of columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a dictionary (:issue:`43740`) - Deprecated silent dropping of columns that raised a ``TypeError``, ``DataError``, and some cases of ``ValueError`` in :meth:`Series.aggregate`, :meth:`DataFrame.aggregate`, :meth:`Series.groupby.aggregate`, and :meth:`DataFrame.groupby.aggregate` when used with a list (:issue:`43740`) - Deprecated casting behavior when setting timezone-aware value(s) into a timezone-aware :class:`Series` or :class:`DataFrame` column when the timezones do not match. Previously this cast to object dtype. In a future version, the values being inserted will be converted to the series or column's existing timezone (:issue:`37605`) -- Deprecated casting behavior when passing an item with mismatched-timezone to :meth:`DatetimeIndex.insert`, :meth:`DatetimeIndex.putmask`, :meth:`DatetimeIndex.where` :meth:`DatetimeIndex.fillna`, :meth:`Series.mask`, :meth:`Series.where`, :meth:`Series.fillna`, :meth:`Series.shift`, :meth:`Series.replace`, :meth:`Series.reindex` (and :class:`DataFrame` column analogues). In the past this has cast to object dtype. In a future version, these will cast the passed item to the index or series's timezone (:issue:`37605`) +- Deprecated casting behavior when passing an item with mismatched-timezone to :meth:`DatetimeIndex.insert`, :meth:`DatetimeIndex.putmask`, :meth:`DatetimeIndex.where` :meth:`DatetimeIndex.fillna`, :meth:`Series.mask`, :meth:`Series.where`, :meth:`Series.fillna`, :meth:`Series.shift`, :meth:`Series.replace`, :meth:`Series.reindex` (and :class:`DataFrame` column analogues). In the past this has cast to object dtype. 
In a future version, these will cast the passed item to the index or series's timezone (:issue:`37605`,:issue:`44940`) - Deprecated the 'errors' keyword argument in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, and meth:`DataFrame.mask`; in a future version the argument will be removed (:issue:`44294`) - Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`) - Deprecated :meth:`PeriodIndex.astype` to ``datetime64[ns]`` or ``DatetimeTZDtype``, use ``obj.to_timestamp(how).tz_localize(dtype.tz)`` instead (:issue:`44398`) @@ -535,8 +541,11 @@ Other Deprecations - Deprecated passing arguments as positional for :func:`read_fwf` other than ``filepath_or_buffer`` (:issue:`41485`): - Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`) - Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`) +- Deprecated parameter ``names`` in :meth:`Index.copy` (:issue:`44916`) - A deprecation warning is now shown for :meth:`DataFrame.to_latex` indicating the arguments signature may change and emulate more the arguments to :meth:`.Styler.to_latex` in future versions (:issue:`44411`) - Deprecated direct passing non boolean or non nan value to ``fill_value`` for :class:`SparseDType` when dtype is bool type (:pull:`44955`) +- Deprecated :meth:`Categorical.replace`, use :meth:`Series.replace` instead (:issue:`44929`) +- Deprecated :meth:`Index.__getitem__` with a bool key; use ``index.values[key]`` to get the old behavior (:issue:`44051`) - .. --------------------------------------------------------------------------- @@ -548,7 +557,7 @@ Performance improvements - Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`) - Performance improvement when converting non-string arrays to string arrays (:issue:`34483`) - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`) -- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`, :issue:`43142`, :issue:`43147`, :issue:`43307`, :issue:`43144`) +- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`, :issue:`43142`, :issue:`43147`, :issue:`43307`, :issue:`43144`, :issue:`44826`) - Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`) - Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`) - Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`, :issue:`43578`) @@ -588,6 +597,8 @@ Performance improvements - Performance improvement in :meth:`Series.to_frame` (:issue:`43558`) - Performance improvement in :meth:`Series.mad` (:issue:`43010`) - Performance improvement in :func:`merge` (:issue:`43332`) +- Performance improvement in :func:`to_csv` when index column is a datetime and is formatted (:issue:`39413`) +- Performance improvement in :func:`to_csv` when :class:`MultiIndex` contains a lot of unused levels (:issue:`37484`) - Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`) - Performance improvement in :func:`concat` (:issue:`43354`) - Performance improvement in :meth:`SparseArray.__getitem__` (:issue:`23122`) @@ -623,6 +634,7 @@ Datetimelike - Bug in adding a ``np.timedelta64`` object to a 
:class:`BusinessDay` or :class:`CustomBusinessDay` object incorrectly raising (:issue:`44532`) - Bug in :meth:`Index.insert` for inserting ``np.datetime64``, ``np.timedelta64`` or ``tuple`` into :class:`Index` with ``dtype='object'`` with negative loc adding ``None`` and replacing existing value (:issue:`44509`) - Bug in :meth:`Series.mode` with ``DatetimeTZDtype`` incorrectly returning timezone-naive and ``PeriodDtype`` incorrectly raising (:issue:`41927`) +- Bug in :class:`DateOffset` addition with :class:`Timestamp` where ``offset.nanoseconds`` would not be included in the result. (:issue:`43968`) - Timedelta @@ -646,6 +658,7 @@ Numeric - Bug in arithmetic operations involving :class:`RangeIndex` where the result would have the incorrect ``name`` (:issue:`43962`) - Bug in arithmetic operations involving :class:`Series` where the result could have the incorrect ``name`` when the operands have matching NA or matching tuple names (:issue:`44459`) - Bug in division with ``IntegerDtype`` or ``BooleanDtype`` array and NA scalar incorrectly raising (:issue:`44685`) +- Bug in multiplying a :class:`Series` with ``FloatingDtype`` by a timedelta-like scalar incorrectly raising (:issue:`44772`) - Conversion @@ -659,7 +672,7 @@ Conversion Strings ^^^^^^^ -- +- Fixed bug in checking for ``string[pyarrow]`` dtype incorrectly raising an ImportError when pyarrow is not installed (:issue:`44276`) - Interval @@ -698,8 +711,10 @@ Indexing - Bug in :meth:`DataFrame.loc.__getitem__` incorrectly raising ``KeyError`` when selecting a single column with a boolean key (:issue:`44322`). - Bug in setting :meth:`DataFrame.iloc` with a single ``ExtensionDtype`` column and setting 2D values e.g. ``df.iloc[:] = df.values`` incorrectly raising (:issue:`44514`) - Bug in indexing on columns with ``loc`` or ``iloc`` using a slice with a negative step with ``ExtensionDtype`` columns incorrectly raising (:issue:`44551`) +- Bug in :meth:`DataFrame.loc.__setitem__` changing dtype when indexer was completely ``False`` (:issue:`37550`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` not handling targets of ``dtype`` 'object' with NaNs correctly (:issue:`44482`) +- Fixed regression where a single column ``np.matrix`` was no longer coerced to a 1d ``np.ndarray`` when added to a :class:`DataFrame` (:issue:`42376`) - Missing @@ -708,6 +723,7 @@ Missing - Bug in :meth:`DataFrame.fillna` not replacing missing values when using a dict-like ``value`` and duplicate column names (:issue:`43476`) - Bug in constructing a :class:`DataFrame` with a dictionary ``np.datetime64`` as a value and ``dtype='timedelta64[ns]'``, or vice-versa, incorrectly casting instead of raising (:issue:`??`) - Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` with ``inplace=True`` not writing to the underlying array(s) in-place (:issue:`44749`) +- Bug in :meth:`Index.fillna` incorrectly returning an un-filled :class:`Index` when NA values are present and ``downcast`` argument is specified.
This now raises ``NotImplementedError`` instead; do not pass the ``downcast`` argument (:issue:`44873`) - MultiIndex @@ -715,6 +731,7 @@ MultiIndex - Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`) - Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`) - Bug in :meth:`MultiIndex.get_loc` raising ``TypeError`` instead of ``KeyError`` on nested tuple (:issue:`42440`) +- Bug in :meth:`MultiIndex.union` setting wrong ``sortorder`` causing errors in subsequent indexing operations with slices (:issue:`44752`) - Bug in :meth:`MultiIndex.putmask` where the other value was also a :class:`MultiIndex` (:issue:`43212`) - @@ -744,17 +761,24 @@ I/O - Bug in :func:`read_csv` raising ``ValueError`` when names was longer than header but equal to data rows for ``engine="python"`` (:issue:`38453`) - Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`) - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) +- Bug in :func:`read_csv` not raising a ``ValueError`` when ``\n`` was specified as ``delimiter`` or ``sep``, which conflicts with ``lineterminator`` (:issue:`43528`) - Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`) - Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`) - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from a nullable integer type (:issue:`44079`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly.
(:issue:`39465`) +- Bug in :func:`read_csv` where reading a mixed column of booleans and missing values to a float type results in the missing values becoming 1.0 rather than NaN (:issue:`42808`, :issue:`34120`) - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`) +- Bug in :func:`read_csv` not setting name of :class:`MultiIndex` columns correctly when ``index_col`` is not the first column (:issue:`38549`) +- Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`) +- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) +- Period ^^^^^^ - Bug in adding a :class:`Period` object to a ``np.timedelta64`` object incorrectly raising ``TypeError`` (:issue:`44182`) - Bug in :meth:`PeriodIndex.to_timestamp` when the index has ``freq="B"`` inferring ``freq="D"`` for its result instead of ``freq="B"`` (:issue:`44105`) - Bug in :class:`Period` constructor incorrectly allowing ``np.timedelta64("NaT")`` (:issue:`44507`) +- Bug in :meth:`PeriodIndex.to_timestamp` giving incorrect values for indexes with non-contiguous data (:issue:`44100`) - Plotting @@ -780,6 +804,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.mean` failing with ``complex`` dtype (:issue:`43701`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly for the first row when ``center=True`` and index is decreasing (:issue:`43927`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` for centered datetimelike windows with uneven nanosecond (:issue:`43997`) +- Bug in :meth:`GroupBy.mean` raising ``KeyError`` when column was selected at least twice (:issue:`44924`) - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`3944`) - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` when using a :class:`pandas.api.indexers.BaseIndexer` subclass that returned unequal start and end arrays would segfault instead of raising a ``ValueError`` (:issue:`44470`) @@ -798,6 +823,7 @@ Reshaping - Bug in :func:`crosstab` would fail when inputs are lists or tuples (:issue:`44076`) - Bug in :meth:`DataFrame.append` failing to retain ``index.name`` when appending a list of :class:`Series` objects (:issue:`44109`) - Fixed metadata propagation in :meth:`Dataframe.apply` method, consequently fixing the same issue for :meth:`Dataframe.transform`, :meth:`Dataframe.nunique` and :meth:`Dataframe.mode` (:issue:`28283`) +- Bug in :func:`concat` casting levels of :class:`MultiIndex` to float if they only consist of missing values (:issue:`44900`) - Bug in :meth:`DataFrame.stack` with ``ExtensionDtype`` columns incorrectly raising (:issue:`43561`) - Bug in :meth:`Series.unstack` with object doing unwanted type inference on resulting columns (:issue:`44595`) - Bug in :class:`MultiIndex` failing join operations with overlapping ``IntervalIndex`` levels (:issue:`44096`) @@ -810,6 +836,7 @@ Sparse - Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`) - Bug in :class:`SparseArray` comparison methods with an array-like operand of mismatched length raising ``AssertionError`` or unclear ``ValueError`` depending on the input (:issue:`43863`) - Bug in
:class:`SparseArray` unary methods as well as :meth:`SparseArray.isna` doesn't recalculate indexes (:pull:`44955`) +- Bug in :class:`SparseArray` arithmetic methods ``floordiv`` and ``mod`` behaviors when dividing by zero not matching the non-sparse :class:`Series` behavior (:issue:`38172`) - ExtensionArray @@ -823,7 +850,7 @@ ExtensionArray - Bug in :func:`array` incorrectly raising when passed a ``ndarray`` with ``float16`` dtype (:issue:`44715`) - Bug in calling ``np.sqrt`` on :class:`BooleanArray` returning a malformed :class:`FloatingArray` (:issue:`44715`) - Bug in :meth:`Series.where` with ``ExtensionDtype`` when ``other`` is a NA scalar incompatible with the series dtype (e.g. ``NaT`` with a numeric dtype) incorrectly casting to a compatible NA value (:issue:`44697`) -- +- Fixed bug in :meth:`Series.replace` with ``FloatDtype``, ``string[python]``, or ``string[pyarrow]`` dtype not being preserved when possible (:issue:`33484`) Styler ^^^^^^ @@ -849,6 +876,7 @@ Other - Bug in :meth:`DataFrame.shift` with ``axis=1`` and ``ExtensionDtype`` columns incorrectly raising when an incompatible ``fill_value`` is passed (:issue:`44564`) - Bug in :meth:`DataFrame.diff` when passing a NumPy integer object instead of an ``int`` object (:issue:`44572`) - Bug in :meth:`Series.replace` raising ``ValueError`` when using ``regex=True`` with a :class:`Series` containing ``np.nan`` values (:issue:`43344`) +- Bug in :meth:`DataFrame.to_records` where an incorrect ``n`` was used when missing names were replaced by ``level_n`` (:issue:`44818`) .. ***DO NOT USE THIS SECTION*** diff --git a/environment.yml b/environment.yml index b4a8b977359cb8..30d05ab7700ff4 100644 --- a/environment.yml +++ b/environment.yml @@ -105,7 +105,7 @@ dependencies: - pytables>=3.6.1 # pandas.read_hdf, DataFrame.to_hdf - s3fs>=0.4.0 # file IO when using 's3://...' path - - aiobotocore<2.0.0 + - aiobotocore<2.0.0 # GH#44311 pinned to fix docbuild - fsspec>=0.7.4 # for generic remote file operations - gcsfs>=0.6.0 # file IO when using 'gcs://...' path - sqlalchemy # pandas.read_sql, DataFrame.to_sql diff --git a/pandas/__init__.py b/pandas/__init__.py index 9505d0481ee197..6ee0cf5ae07d5f 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -23,13 +23,15 @@ try: from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib -except ImportError as e: # pragma: no cover - module = e.name +except ImportError as err: # pragma: no cover + module = err.name raise ImportError( f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " "'python setup.py build_ext --force' to build the C extensions first." - ) from e + ) from err +else: + del _tslib, _lib, _hashtable from pandas._config import ( get_option, diff --git a/pandas/_config/config.py b/pandas/_config/config.py index b22a6840644ecd..d8b1829840a4d4 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -333,7 +333,7 @@ def __doc__(self): Prints the description for one or more registered options. -Call with not arguments to get a listing for all registered options. +Call with no arguments to get a listing for all registered options. 
Available options: diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 9d5922f8a50bd5..aba635e19995a2 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -516,9 +516,9 @@ def intervals_to_interval_bounds(ndarray intervals, bint validate_closed=True): Returns ------- - tuple of tuples - left : (ndarray, object, array) - right : (ndarray, object, array) + tuple of + left : ndarray + right : ndarray closed: str """ cdef: diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi index 3a22aa439b7be7..a5e91e2ce83eb7 100644 --- a/pandas/_libs/join.pyi +++ b/pandas/_libs/join.pyi @@ -55,7 +55,7 @@ def asof_join_backward_on_X_by_Y( left_by_values: np.ndarray, # by_t[:] right_by_values: np.ndarray, # by_t[:] allow_exact_matches: bool = ..., - tolerance=..., + tolerance: np.number | int | float | None = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def asof_join_forward_on_X_by_Y( left_values: np.ndarray, # asof_t[:] @@ -63,7 +63,7 @@ def asof_join_forward_on_X_by_Y( left_by_values: np.ndarray, # by_t[:] right_by_values: np.ndarray, # by_t[:] allow_exact_matches: bool = ..., - tolerance=..., + tolerance: np.number | int | float | None = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def asof_join_nearest_on_X_by_Y( left_values: np.ndarray, # asof_t[:] @@ -71,23 +71,23 @@ def asof_join_nearest_on_X_by_Y( left_by_values: np.ndarray, # by_t[:] right_by_values: np.ndarray, # by_t[:] allow_exact_matches: bool = ..., - tolerance=..., + tolerance: np.number | int | float | None = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def asof_join_backward( left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] allow_exact_matches: bool = ..., - tolerance=..., + tolerance: np.number | int | float | None = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def asof_join_forward( left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] allow_exact_matches: bool = ..., - tolerance=..., + tolerance: np.number | int | float | None = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def asof_join_nearest( left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] allow_exact_matches: bool = ..., - tolerance=..., + tolerance: np.number | int | float | None = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f527882a9dc9de..0814a3a1354f01 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1098,13 +1098,20 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1: + # first, performance short-cuts for the most common cases + if util.is_array(obj): + # exclude zero-dimensional numpy arrays, effectively scalars + return not cnp.PyArray_IsZeroDim(obj) + elif isinstance(obj, list): + return True + # then the generic implementation return ( # equiv: `isinstance(obj, abc.Iterable)` getattr(obj, "__iter__", None) is not None and not isinstance(obj, type) # we do not count strings/unicode/bytes as list-like and not isinstance(obj, (str, bytes)) - # exclude zero-dimensional numpy arrays, effectively scalars - and not cnp.PyArray_IsZeroDim(obj) + # exclude zero-dimensional duck-arrays, effectively scalars + and not (hasattr(obj, "ndim") and obj.ndim == 0) # exclude sets if allow_sets is False and not (allow_sets is False and isinstance(obj, abc.Set)) ) diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi new file mode 100644 index 00000000000000..1177e829061900 --- /dev/null +++ b/pandas/_libs/missing.pyi @@ -0,0 +1,15 @@ +import numpy as np +from numpy import typing as npt + +class NAType: ... + +NA: NAType + +def is_matching_na( + left: object, right: object, nan_matches_none: bool = ... +) -> bool: ... +def isposinf_scalar(val: object) -> bool: ... +def isneginf_scalar(val: object) -> bool: ... +def checknull(val: object, inf_as_na: bool = ...) -> bool: ... +def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ... +def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi index 11ae3b852e97ab..74a6ad87cd279a 100644 --- a/pandas/_libs/ops.pyi +++ b/pandas/_libs/ops.pyi @@ -1,6 +1,7 @@ from typing import ( Any, Callable, + Iterable, Literal, overload, ) @@ -35,15 +36,15 @@ def vec_binop( @overload def maybe_convert_bool( arr: npt.NDArray[np.object_], - true_values=..., - false_values=..., + true_values: Iterable = ..., + false_values: Iterable = ..., convert_to_masked_nullable: Literal[False] = ..., ) -> tuple[np.ndarray, None]: ... @overload def maybe_convert_bool( arr: npt.NDArray[np.object_], - true_values=..., - false_values=..., + true_values: Iterable = ..., + false_values: Iterable = ..., *, convert_to_masked_nullable: Literal[True], ) -> tuple[np.ndarray, np.ndarray]: ... 
diff --git a/pandas/_libs/ops_dispatch.pyx b/pandas/_libs/ops_dispatch.pyx index c34504732ac32d..2b2a411e6635f2 100644 --- a/pandas/_libs/ops_dispatch.pyx +++ b/pandas/_libs/ops_dispatch.pyx @@ -34,7 +34,7 @@ UFUNC_ALIASES = { "true_divide": "truediv", "power": "pow", "remainder": "mod", - "divide": "div", + "divide": "truediv", "equal": "eq", "not_equal": "ne", "less": "lt", diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c76bfab51aacd8..4f80936359263d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -558,18 +558,11 @@ cdef class TextReader: pass def __dealloc__(self): - self.close() + _close(self) parser_del(self.parser) - def close(self) -> None: - # also preemptively free all allocated memory - parser_free(self.parser) - if self.true_set: - kh_destroy_str_starts(self.true_set) - self.true_set = NULL - if self.false_set: - kh_destroy_str_starts(self.false_set) - self.false_set = NULL + def close(self): + _close(self) def _set_quoting(self, quote_char: str | bytes | None, quoting: int): if not isinstance(quoting, int): @@ -1093,8 +1086,27 @@ cdef class TextReader: break # we had a fallback parse on the dtype, so now try to cast - # only allow safe casts, eg. with a nan you cannot safely cast to int if col_res is not None and col_dtype is not None: + # If col_res is bool, it might actually be a bool array mixed with NaNs + # (see _try_bool_flex()). Usually this would be taken care of using + # _maybe_upcast(), but if col_dtype is a floating type we should just + # take care of that cast here. + if col_res.dtype == np.bool_ and is_float_dtype(col_dtype): + mask = col_res.view(np.uint8) == na_values[np.uint8] + col_res = col_res.astype(col_dtype) + np.putmask(col_res, mask, np.nan) + return col_res, na_count + + # NaNs are already cast to True here, so can not use astype + if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype): + if na_count > 0: + raise ValueError( + f"cannot safely convert passed user dtype of " + f"{col_dtype} for {np.bool_} dtyped data in " + f"column {i} due to NA values" + ) + + # only allow safe casts, eg. 
with a nan you cannot safely cast to int try: col_res = col_res.astype(col_dtype, casting='safe') except TypeError: @@ -1292,6 +1304,21 @@ cdef class TextReader: return None +# Factor out code common to TextReader.__dealloc__ and TextReader.close +# It cannot be a class method, since calling self.close() in __dealloc__ +# causes a class attribute lookup and violates best practices +# https://cython.readthedocs.io/en/latest/src/userguide/special_methods.html#finalization-method-dealloc +cdef _close(TextReader reader): + # also preemptively free all allocated memory + parser_free(reader.parser) + if reader.true_set: + kh_destroy_str_starts(reader.true_set) + reader.true_set = NULL + if reader.false_set: + kh_destroy_str_starts(reader.false_set) + reader.false_set = NULL + + cdef: object _true_values = [b'True', b'TRUE', b'true'] object _false_values = [b'False', b'FALSE', b'false'] diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in index c6e65f8b961871..e6a2c7b1b050a0 100644 --- a/pandas/_libs/sparse_op_helper.pxi.in +++ b/pandas/_libs/sparse_op_helper.pxi.in @@ -42,6 +42,11 @@ cdef inline sparse_t __mod__(sparse_t a, sparse_t b): cdef inline sparse_t __floordiv__(sparse_t a, sparse_t b): if b == 0: if sparse_t is float64_t: + # Match non-sparse Series behavior implemented in mask_zero_div_zero + if a > 0: + return INF + elif a < 0: + return -INF return NaN else: return 0 diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 757cabdbbc730e..5b5995a671b2c3 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -297,6 +297,7 @@ typedef struct __JSONObjectDecoder { JSOBJ (*endArray)(void *prv, JSOBJ obj); JSOBJ (*newInt)(void *prv, JSINT32 value); JSOBJ (*newLong)(void *prv, JSINT64 value); + JSOBJ (*newUnsignedLong)(void *prv, JSUINT64 value); JSOBJ (*newDouble)(void *prv, double value); void (*releaseObject)(void *prv, JSOBJ obj, void *decoder); JSPFN_MALLOC malloc; diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 81327fd9efb06a..fee552672b8b6a 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -116,8 +116,8 @@ JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { int intNeg = 1; - int mantSize = 0; JSUINT64 intValue; + JSUINT64 prevIntValue; int chr; int decimalCount = 0; double frcValue = 0.0; @@ -134,10 +134,10 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { } else if (*(offset) == '-') { offset++; intNeg = -1; + overflowLimit = LLONG_MIN; if (*(offset) == 'I') { goto DECODE_INF; } - overflowLimit = LLONG_MIN; } // Scan integer part @@ -157,19 +157,18 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { case '7': case '8': case '9': { - // FIXME: Check for arithmetic overflow here - // PERF: Don't do 64-bit arithmetic here unless we know we have - // to - intValue = intValue * 10ULL + (JSLONG)(chr - 48); - - if (intValue > overflowLimit) { - return SetError(ds, -1, overflowLimit == LLONG_MAX - ?
"Value is too big" - : "Value is too small"); + // PERF: Don't do 64-bit arithmetic here unless we have to + prevIntValue = intValue; + intValue = intValue * 10ULL + (JSLONG) (chr - 48); + + if (intNeg == 1 && prevIntValue > intValue) { + return SetError(ds, -1, "Value is too big!"); + } else if (intNeg == -1 && intValue > overflowLimit) { + return SetError(ds, -1, overflowLimit == LLONG_MAX ? + "Value is too big!" : "Value is too small"); } offset++; - mantSize++; break; } case '.': { @@ -196,11 +195,12 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { ds->lastType = JT_INT; ds->start = offset; - if ((intValue >> 31)) { + if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) + return ds->dec->newUnsignedLong(ds->prv, intValue); + else if ((intValue >> 31)) return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); - } else { + else return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); - } DECODE_FRACTION: diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index 3db10237b26887..14683f4c28cbe8 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -479,6 +479,10 @@ JSOBJ Object_newLong(void *prv, JSINT64 value) { return PyLong_FromLongLong(value); } +JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) { + return PyLong_FromUnsignedLongLong(value); +} + JSOBJ Object_newDouble(void *prv, double value) { return PyFloat_FromDouble(value); } @@ -508,7 +512,8 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { Object_newTrue, Object_newFalse, Object_newNull, Object_newPosInf, Object_newNegInf, Object_newObject, Object_endObject, Object_newArray, Object_endArray, - Object_newInteger, Object_newLong, Object_newDouble, + Object_newInteger, Object_newLong, Object_newUnsignedLong, + Object_newDouble, Object_releaseObject, PyObject_Malloc, PyObject_Free, PyObject_Realloc}; diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index 8c510b05de4ce4..8e47993e9d85f6 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -18,33 +18,33 @@ class PeriodDtypeBase: def resolution(self) -> Resolution: ... class FreqGroup(Enum): - FR_ANN: int = ... - FR_QTR: int = ... - FR_MTH: int = ... - FR_WK: int = ... - FR_BUS: int = ... - FR_DAY: int = ... - FR_HR: int = ... - FR_MIN: int = ... - FR_SEC: int = ... - FR_MS: int = ... - FR_US: int = ... - FR_NS: int = ... - FR_UND: int = ... + FR_ANN: int + FR_QTR: int + FR_MTH: int + FR_WK: int + FR_BUS: int + FR_DAY: int + FR_HR: int + FR_MIN: int + FR_SEC: int + FR_MS: int + FR_US: int + FR_NS: int + FR_UND: int @staticmethod def get_freq_group(code: int) -> FreqGroup: ... class Resolution(Enum): - RESO_NS: int = ... - RESO_US: int = ... - RESO_MS: int = ... - RESO_SEC: int = ... - RESO_MIN: int = ... - RESO_HR: int = ... - RESO_DAY: int = ... - RESO_MTH: int = ... - RESO_QTR: int = ... - RESO_YR: int = ... + RESO_NS: int + RESO_US: int + RESO_MS: int + RESO_SEC: int + RESO_MIN: int + RESO_HR: int + RESO_DAY: int + RESO_MTH: int + RESO_QTR: int + RESO_YR: int def __lt__(self, other: Resolution) -> bool: ... def __ge__(self, other: Resolution) -> bool: ... @property diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi index cbf91f2bcaf762..415b4329310c02 100644 --- a/pandas/_libs/tslibs/fields.pyi +++ b/pandas/_libs/tslibs/fields.pyi @@ -9,7 +9,7 @@ def month_position_check(fields, weekdays) -> str | None: ... 
def get_date_name_field( dtindex: npt.NDArray[np.int64], # const int64_t[:] field: str, - locale=..., + locale: str | None = ..., ) -> npt.NDArray[np.object_]: ... def get_start_end_field( dtindex: npt.NDArray[np.int64], # const int64_t[:] @@ -31,7 +31,7 @@ def isleapyear_arr( def build_isocalendar_sarray( dtindex: npt.NDArray[np.int64], # const int64_t[:] ) -> np.ndarray: ... -def get_locale_names(name_type: str, locale: object = ...): ... +def get_locale_names(name_type: str, locale: str | None = ...): ... class RoundTo: @property diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index 6a5555cfff0303..8b409935b8fb8a 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -1,6 +1,7 @@ from datetime import ( datetime, timedelta, + tzinfo as _tzinfo, ) from typing import Any @@ -12,12 +13,14 @@ NaT: NaTType iNaT: int nat_strings: set[str] +def is_null_datetimelike(val: object, inat_is_null: bool = ...) -> bool: ... + class NaTType(datetime): value: np.int64 def asm8(self) -> np.datetime64: ... def to_datetime64(self) -> np.datetime64: ... def to_numpy( - self, dtype=..., copy: bool = ... + self, dtype: np.dtype | str | None = ..., copy: bool = ... ) -> np.datetime64 | np.timedelta64: ... @property def is_leap_year(self) -> bool: ... @@ -69,7 +72,20 @@ class NaTType(datetime): def ceil(self) -> NaTType: ... def tz_convert(self) -> NaTType: ... def tz_localize(self) -> NaTType: ... - def replace(self, *args, **kwargs) -> NaTType: ... + # error: Signature of "replace" incompatible with supertype "datetime" + def replace( # type: ignore[override] + self, + year: int | None = ..., + month: int | None = ..., + day: int | None = ..., + hour: int | None = ..., + minute: int | None = ..., + second: int | None = ..., + microsecond: int | None = ..., + nanosecond: int | None = ..., + tzinfo: _tzinfo | None = ..., + fold: int | None = ..., + ) -> NaTType: ... # error: Return type "float" of "year" incompatible with return # type "int" in supertype "date" @property diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi new file mode 100644 index 00000000000000..db0c277b73bd5f --- /dev/null +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -0,0 +1 @@ +class OutOfBoundsDatetime(ValueError): ... diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index f293557a51ac25..6df4abc160b0bc 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -186,8 +186,9 @@ def apply_wraps(func): if self.normalize: result = result.normalize() - # nanosecond may be deleted depending on offset process - if not self.normalize and nano != 0: + # If the offset object does not have a nanoseconds component, + # the result's nanosecond component may be lost. 
+ if not self.normalize and nano != 0 and not hasattr(self, "nanoseconds"): if result.nanosecond != nano: if result.tz is not None: # convert to UTC @@ -333,7 +334,7 @@ cdef _determine_offset(kwds): # sub-daily offset - use timedelta (tz-aware) offset = timedelta(**kwds_no_nanos) else: - offset = timedelta(1) + offset = timedelta(0) return offset, use_relativedelta @@ -1068,12 +1069,17 @@ cdef class RelativeDeltaOffset(BaseOffset): # perform calculation in UTC other = other.replace(tzinfo=None) + if hasattr(self, "nanoseconds"): + td_nano = Timedelta(nanoseconds=self.nanoseconds) + else: + td_nano = Timedelta(0) + if self.n > 0: for i in range(self.n): - other = other + self._offset + other = other + self._offset + td_nano else: for i in range(-self.n): - other = other - self._offset + other = other - self._offset - td_nano if tzinfo is not None and self._use_relativedelta: # bring tz back from UTC calculation @@ -3573,7 +3579,7 @@ cpdef to_offset(freq): Parameters ---------- - freq : str, tuple, datetime.timedelta, DateOffset or None + freq : str, datetime.timedelta, BaseOffset or None Returns ------- @@ -3586,7 +3592,7 @@ cpdef to_offset(freq): See Also -------- - DateOffset : Standard kind of date increment used for a date range. + BaseOffset : Standard kind of date increment used for a date range. Examples -------- diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 4f7505fd7e792e..2f60df0ad888ea 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -59,22 +59,22 @@ class Period: def __new__( # type: ignore[misc] cls, value=..., - freq=..., - ordinal=..., - year=..., - month=..., - quarter=..., - day=..., - hour=..., - minute=..., - second=..., + freq: int | str | None = ..., + ordinal: int | None = ..., + year: int | None = ..., + month: int | None = ..., + quarter: int | None = ..., + day: int | None = ..., + hour: int | None = ..., + minute: int | None = ..., + second: int | None = ..., ) -> Period | NaTType: ... @classmethod def _maybe_convert_freq(cls, freq) -> BaseOffset: ... @classmethod def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod - def now(cls, freq=...) -> Period: ... + def now(cls, freq: BaseOffset = ...) -> Period: ... def strftime(self, fmt: str) -> str: ... def to_timestamp( self, @@ -82,7 +82,7 @@ class Period: how: str = ..., tz: Timezone | None = ..., ) -> Timestamp: ... - def asfreq(self, freq, how=...) -> Period: ... + def asfreq(self, freq: str, how: str = ...) -> Period: ... @property def freqstr(self) -> str: ... 
     @property
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index 67696f9740ea19..1df1c9a947e8d1 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1088,6 +1088,7 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end):
     """
     cdef:
         Py_ssize_t n = len(arr)
+        Py_ssize_t increment = arr.strides[0] // 8
         ndarray[int64_t] result = np.empty(n, dtype=np.int64)

     _period_asfreq(
@@ -1097,6 +1098,7 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end):
         freq1,
         freq2,
         end,
+        increment,
     )
     return result

@@ -1110,6 +1112,7 @@ cdef void _period_asfreq(
     int freq1,
     int freq2,
     bint end,
+    Py_ssize_t increment=1,
 ):
     """See period_asfreq.__doc__"""
     cdef:
@@ -1127,7 +1130,7 @@ cdef void _period_asfreq(
     get_asfreq_info(freq1, freq2, end, &af_info)

     for i in range(length):
-        val = ordinals[i]
+        val = ordinals[i * increment]
         if val != NPY_NAT:
             val = func(val, &af_info)
         out[i] = val
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index 7c0131cf28c9ac..d8369f0cc90f95 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -14,7 +14,7 @@ from pandas._libs.tslibs import (
 )
 from pandas._typing import npt

-_S = TypeVar("_S")
+_S = TypeVar("_S", bound=timedelta)

 def ints_to_pytimedelta(
     arr: npt.NDArray[np.int64],  # const int64_t[:]
@@ -36,7 +36,10 @@ class Timedelta(timedelta):

     # error: "__new__" must return a class instance (got "Union[Timedelta, NaTType]")
     def __new__(  # type: ignore[misc]
-        cls: Type[_S], value=..., unit=..., **kwargs
+        cls: Type[_S],
+        value=...,
+        unit: str = ...,
+        **kwargs: int | float | np.integer | np.floating,
     ) -> _S | NaTType: ...
     @property
     def days(self) -> int: ...
@@ -50,9 +53,9 @@ class Timedelta(timedelta):
     @property
     def asm8(self) -> np.timedelta64: ...
     # TODO: round/floor/ceil could return NaT?
-    def round(self: _S, freq) -> _S: ...
-    def floor(self: _S, freq) -> _S: ...
-    def ceil(self: _S, freq) -> _S: ...
+    def round(self: _S, freq: str) -> _S: ...
+    def floor(self: _S, freq: str) -> _S: ...
+    def ceil(self: _S, freq: str) -> _S: ...
     @property
     def resolution_string(self) -> str: ...
     def __add__(self, other: timedelta) -> timedelta: ...
diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi
index a89d0aecfc26c6..ecddd83322bbf8 100644
--- a/pandas/_libs/tslibs/timestamps.pyi
+++ b/pandas/_libs/tslibs/timestamps.pyi
@@ -8,7 +8,6 @@ from datetime import (
 )
 from time import struct_time
 from typing import (
     ClassVar,
-    Type,
     TypeVar,
     overload,
 )
@@ -17,15 +16,14 @@ import numpy as np

 from pandas._libs.tslibs import (
     BaseOffset,
-    NaT,
     NaTType,
     Period,
     Timedelta,
 )

-_S = TypeVar("_S")
+_DatetimeT = TypeVar("_DatetimeT", bound=datetime)

-def integer_op_not_supported(obj) -> None: ...
+def integer_op_not_supported(obj: object) -> TypeError: ...

 class Timestamp(datetime):
     min: ClassVar[Timestamp]
@@ -36,7 +34,7 @@ class Timestamp(datetime):

     # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]")
     def __new__(  # type: ignore[misc]
-        cls: Type[_S],
+        cls: type[_DatetimeT],
         ts_input: int
         | np.integer
         | float
@@ -44,9 +42,9 @@ class Timestamp(datetime):
         | _date
         | datetime
         | np.datetime64 = ...,
-        freq=...,
+        freq: int | None | str | BaseOffset = ...,
         tz: str | _tzinfo | None | int = ...,
-        unit=...,
+        unit: str | int | None = ...,
         year: int | None = ...,
         month: int | None = ...,
         day: int | None = ...,
@@ -58,7 +56,7 @@ class Timestamp(datetime):
         tzinfo: _tzinfo | None = ...,
         *,
         fold: int | None = ...,
-    ) -> _S | NaTType: ...
+    ) -> _DatetimeT | NaTType: ...
     def _set_freq(self, freq: BaseOffset | None) -> None: ...
     @property
     def year(self) -> int: ...
@@ -81,24 +79,30 @@ class Timestamp(datetime):
     @property
     def fold(self) -> int: ...
     @classmethod
-    def fromtimestamp(cls: Type[_S], t: float, tz: _tzinfo | None = ...) -> _S: ...
+    def fromtimestamp(
+        cls: type[_DatetimeT], t: float, tz: _tzinfo | None = ...
+    ) -> _DatetimeT: ...
     @classmethod
-    def utcfromtimestamp(cls: Type[_S], t: float) -> _S: ...
+    def utcfromtimestamp(cls: type[_DatetimeT], t: float) -> _DatetimeT: ...
     @classmethod
-    def today(cls: Type[_S]) -> _S: ...
+    def today(cls: type[_DatetimeT], tz: _tzinfo | str | None = ...) -> _DatetimeT: ...
     @classmethod
-    def fromordinal(cls: Type[_S], n: int) -> _S: ...
+    def fromordinal(
+        cls: type[_DatetimeT],
+        ordinal: int,
+        freq: str | BaseOffset | None = ...,
+        tz: _tzinfo | str | None = ...,
+    ) -> _DatetimeT: ...
     @classmethod
-    def now(cls: Type[_S], tz: _tzinfo | str | None = ...) -> _S: ...
+    def now(cls: type[_DatetimeT], tz: _tzinfo | str | None = ...) -> _DatetimeT: ...
     @classmethod
-    def utcnow(cls: Type[_S]) -> _S: ...
+    def utcnow(cls: type[_DatetimeT]) -> _DatetimeT: ...
+    # error: Signature of "combine" incompatible with supertype "datetime"
     @classmethod
-    def combine(
-        cls, date: _date, time: _time, tzinfo: _tzinfo | None = ...
-    ) -> datetime: ...
+    def combine(cls, date: _date, time: _time) -> datetime: ...  # type: ignore[override]
     @classmethod
-    def fromisoformat(cls: Type[_S], date_string: str) -> _S: ...
-    def strftime(self, fmt: str) -> str: ...
+    def fromisoformat(cls: type[_DatetimeT], date_string: str) -> _DatetimeT: ...
+    def strftime(self, format: str) -> str: ...
     def __format__(self, fmt: str) -> str: ...
     def toordinal(self) -> int: ...
     def timetuple(self) -> struct_time: ...
@@ -117,10 +121,9 @@ class Timestamp(datetime):
         second: int = ...,
         microsecond: int = ...,
         tzinfo: _tzinfo | None = ...,
-        *,
         fold: int = ...,
     ) -> datetime: ...
-    def astimezone(self: _S, tz: _tzinfo | None = ...) -> _S: ...
+    def astimezone(self: _DatetimeT, tz: _tzinfo | None = ...) -> _DatetimeT: ...
     def ctime(self) -> str: ...
     def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ...
     @classmethod
@@ -132,12 +135,18 @@ class Timestamp(datetime):
     def __lt__(self, other: datetime) -> bool: ...  # type: ignore
     def __ge__(self, other: datetime) -> bool: ...  # type: ignore
     def __gt__(self, other: datetime) -> bool: ...  # type: ignore
-    def __add__(self: _S, other: timedelta) -> _S: ...
-    def __radd__(self: _S, other: timedelta) -> _S: ...
+    # error: Signature of "__add__" incompatible with supertype "date"/"datetime"
+    @overload  # type: ignore[override]
+    def __add__(self, other: np.ndarray) -> np.ndarray: ...
+    @overload
+    # TODO: other can also be Tick (but it cannot be resolved)
+    def __add__(self: _DatetimeT, other: timedelta | np.timedelta64) -> _DatetimeT: ...
+    def __radd__(self: _DatetimeT, other: timedelta) -> _DatetimeT: ...
     @overload  # type: ignore
     def __sub__(self, other: datetime) -> timedelta: ...
     @overload
-    def __sub__(self, other: timedelta) -> datetime: ...
+    # TODO: other can also be Tick (but it cannot be resolved)
+    def __sub__(self, other: timedelta | np.timedelta64) -> datetime: ...
     def __hash__(self) -> int: ...
     def weekday(self) -> int: ...
     def isoweekday(self) -> int: ...
@@ -158,23 +167,41 @@ class Timestamp(datetime):
     def is_year_end(self) -> bool: ...
     def to_pydatetime(self, warn: bool = ...) -> datetime: ...
     def to_datetime64(self) -> np.datetime64: ...
-    def to_period(self, freq) -> Period: ...
+    def to_period(self, freq: BaseOffset | str | None = ...) -> Period: ...
     def to_julian_date(self) -> np.float64: ...
     @property
     def asm8(self) -> np.datetime64: ...
-    def tz_convert(self: _S, tz) -> _S: ...
+    def tz_convert(self: _DatetimeT, tz: _tzinfo | str | None) -> _DatetimeT: ...
     # TODO: could return NaT?
     def tz_localize(
-        self: _S, tz, ambiguous: str = ..., nonexistent: str = ...
-    ) -> _S: ...
-    def normalize(self: _S) -> _S: ...
+        self: _DatetimeT,
+        tz: _tzinfo | str | None,
+        ambiguous: str = ...,
+        nonexistent: str = ...,
+    ) -> _DatetimeT: ...
+    def normalize(self: _DatetimeT) -> _DatetimeT: ...
     # TODO: round/floor/ceil could return NaT?
     def round(
-        self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ...
-    ) -> _S: ...
+        self: _DatetimeT, freq: str, ambiguous: bool | str = ..., nonexistent: str = ...
+    ) -> _DatetimeT: ...
     def floor(
-        self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ...
-    ) -> _S: ...
+        self: _DatetimeT, freq: str, ambiguous: bool | str = ..., nonexistent: str = ...
+    ) -> _DatetimeT: ...
     def ceil(
-        self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ...
-    ) -> _S: ...
+        self: _DatetimeT, freq: str, ambiguous: bool | str = ..., nonexistent: str = ...
+    ) -> _DatetimeT: ...
+    def day_name(self, locale: str | None = ...) -> str: ...
+    def month_name(self, locale: str | None = ...) -> str: ...
+    @property
+    def day_of_week(self) -> int: ...
+    @property
+    def day_of_month(self) -> int: ...
+    @property
+    def day_of_year(self) -> int: ...
+    @property
+    def quarter(self) -> int: ...
+    @property
+    def week(self) -> int: ...
+    def to_numpy(
+        self, dtype: np.dtype | None = ..., copy: bool = ...
+    ) -> np.datetime64: ...
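For orientation only, and not part of the patch itself: a minimal sketch of the runtime behaviour behind the Timestamp annotations above, assuming a pandas build that carries these stubs' runtime counterparts. The printed values are illustrative.

import numpy as np
import pandas as pd

ts = pd.Timestamp("2021-12-31 23:59:59.999999999", tz="UTC")
print(ts.day_name())                # "Friday" -- a str, matching the day_name stub
print(ts.round("H"))                # 2022-01-01 00:00:00+00:00, still a Timestamp
print(ts.to_numpy())                # numpy.datetime64, as annotated for to_numpy
print(ts + np.timedelta64(1, "h"))  # the timedelta64 overload of __add__ keeps the Timestamp type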
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index f491b5aeedadc2..1c26793876e5a8 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -307,7 +307,6 @@ cdef class _Timestamp(ABCTimestamp): elif not isinstance(self, _Timestamp): # cython semantics, args have been switched and this is __radd__ return other.__add__(self) - return NotImplemented def __sub__(self, other): diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 62481544222525..66503ac81b4d17 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -28,13 +28,9 @@ from pandas._typing import Dtype from pandas.core.dtypes.common import ( - is_datetime64_dtype, - is_datetime64tz_dtype, is_float_dtype, is_integer_dtype, - is_period_dtype, is_sequence, - is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype, ) @@ -93,7 +89,10 @@ assert_timedelta_array_equal, raise_assert_detail, ) -from pandas._testing.compat import get_dtype # noqa:F401 +from pandas._testing.compat import ( # noqa:F401 + get_dtype, + get_obj, +) from pandas._testing.contexts import ( # noqa:F401 RNGContext, decompress_file, @@ -111,14 +110,11 @@ ) from pandas.core.arrays import ( BaseMaskedArray, - DatetimeArray, ExtensionArray, PandasArray, - PeriodArray, - TimedeltaArray, - period_array, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.construction import extract_array if TYPE_CHECKING: from pandas import ( @@ -160,6 +156,17 @@ + BYTES_DTYPES ) +NARROW_NP_DTYPES = [ + np.float16, + np.float32, + np.int8, + np.int16, + np.int32, + np.uint8, + np.uint16, + np.uint32, +] + NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")] NP_NAT_OBJECTS = [ cls("NaT", unit) @@ -256,13 +263,6 @@ def box_expected(expected, box_cls, transpose=True): # single-row special cases in datetime arithmetic expected = expected.T expected = pd.concat([expected] * 2, ignore_index=True) - elif box_cls is PeriodArray: - # the PeriodArray constructor is not as flexible as period_array - expected = period_array(expected) - elif box_cls is DatetimeArray: - expected = DatetimeArray(expected) - elif box_cls is TimedeltaArray: - expected = TimedeltaArray(expected) elif box_cls is np.ndarray or box_cls is np.array: expected = np.array(expected) elif box_cls is to_array: @@ -273,17 +273,16 @@ def box_expected(expected, box_cls, transpose=True): def to_array(obj): + """ + Similar to pd.array, but does not cast numpy dtypes to nullable dtypes. 
+ """ # temporary implementation until we get pd.array in place dtype = getattr(obj, "dtype", None) - if is_period_dtype(dtype): - return period_array(obj) - elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): - return DatetimeArray._from_sequence(obj) - elif is_timedelta64_dtype(dtype): - return TimedeltaArray._from_sequence(obj) - else: - return np.array(obj) + if dtype is None: + return np.asarray(obj) + + return extract_array(obj, extract_numpy=True) # ----------------------------------------------------------------------------- diff --git a/pandas/_testing/_hypothesis.py b/pandas/_testing/_hypothesis.py index 0e506f5e878b4e..5256a303de34e7 100644 --- a/pandas/_testing/_hypothesis.py +++ b/pandas/_testing/_hypothesis.py @@ -44,6 +44,10 @@ min_size=3, ) +OPTIONAL_ONE_OF_ALL = st.one_of( + OPTIONAL_DICTS, OPTIONAL_FLOATS, OPTIONAL_INTS, OPTIONAL_LISTS, OPTIONAL_TEXT +) + if is_platform_windows(): DATETIME_NO_TZ = st.datetimes(min_value=datetime(1900, 1, 1)) else: diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index f66614bd02a3f0..b78cfd3fb39fb2 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -147,12 +147,16 @@ def _assert_caught_no_extra_warnings( for actual_warning in caught_warnings: if _is_unexpected_warning(actual_warning, expected_warning): - unclosed = "unclosed transport bool: # compression keywords and compression CompressionDict = Dict[str, Any] -CompressionOptions = Optional[Union[str, CompressionDict]] +CompressionOptions = Optional[ + Union[Literal["infer", "gzip", "bz2", "zip", "xz"], CompressionDict] +] # types in DataFrameFormatter diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index de6b4c9fc6e4ac..f9b16419917f2d 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -13,8 +13,6 @@ from pandas._typing import F from pandas.compat.numpy import ( is_numpy_dev, - np_array_datetime64_compat, - np_datetime64_compat, np_version_under1p19, np_version_under1p20, ) @@ -130,8 +128,6 @@ def get_lzma_file(): __all__ = [ "is_numpy_dev", - "np_array_datetime64_compat", - "np_datetime64_compat", "np_version_under1p19", "np_version_under1p20", "pa_version_under1p01", diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 2792a756bf20c2..97af4dee3c0dee 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -1,7 +1,4 @@ """ support numpy compatibility across versions """ - -import re - import numpy as np from pandas.util.version import Version @@ -29,44 +26,6 @@ ) -_tz_regex = re.compile("[+-]0000$") - - -def _tz_replacer(tstring): - if isinstance(tstring, str): - if tstring.endswith("Z"): - tstring = tstring[:-1] - elif _tz_regex.search(tstring): - tstring = tstring[:-5] - return tstring - - -def np_datetime64_compat(tstring: str, unit: str = "ns"): - """ - provide compat for construction of strings to numpy datetime64's with - tz-changes in 1.11 that make '2015-01-01 09:00:00Z' show a deprecation - warning, when need to pass '2015-01-01 09:00:00' - """ - tstring = _tz_replacer(tstring) - return np.datetime64(tstring, unit) - - -def np_array_datetime64_compat(arr, dtype="M8[ns]"): - """ - provide compat for construction of an array of strings to a - np.array(..., dtype=np.datetime64(..)) - tz-changes in 1.11 that make '2015-01-01 09:00:00Z' show a deprecation - warning, when need to pass '2015-01-01 09:00:00' - """ - # is_list_like; can't import as it would be circular - if hasattr(arr, "__iter__") and not 
isinstance(arr, (str, bytes)): - arr = [_tz_replacer(s) for s in arr] - else: - arr = _tz_replacer(arr) - - return np.array(arr, dtype=dtype) - - __all__ = [ "np", "_np_version", diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index ca539eefd3aeec..61d3b2ef079ac7 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -35,9 +35,6 @@ def load_reduce(self): args = stack.pop() func = stack[-1] - if len(args) and type(args[0]) is type: - n = args[0].__name__ # noqa - try: stack[-1] = func(*args) return diff --git a/pandas/conftest.py b/pandas/conftest.py index eb9a952250f369..8c870bc98b5ff4 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -559,6 +559,21 @@ def index_flat(request): index_flat2 = index_flat +@pytest.fixture( + params=[ + key + for key in indices_dict + if not isinstance(indices_dict[key], MultiIndex) and indices_dict[key].is_unique + ] +) +def index_flat_unique(request): + """ + index_flat with uniqueness requirement. + """ + key = request.param + return indices_dict[key].copy() + + @pytest.fixture( params=[ key @@ -599,11 +614,6 @@ def index_with_missing(request): # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- -@pytest.fixture -def empty_series(): - return Series([], index=[], dtype=np.float64) - - @pytest.fixture def string_series(): """ @@ -672,31 +682,12 @@ def series_with_multilevel_index(): return ser -_narrow_dtypes = [ - np.float16, - np.float32, - np.int8, - np.int16, - np.int32, - np.uint8, - np.uint16, - np.uint32, -] _narrow_series = { f"{dtype.__name__}-series": tm.makeFloatSeries(name="a").astype(dtype) - for dtype in _narrow_dtypes + for dtype in tm.NARROW_NP_DTYPES } -@pytest.fixture(params=_narrow_series.keys()) -def narrow_series(request): - """ - Fixture for Series with low precision data types - """ - # copy to avoid mutation, e.g. setting .name - return _narrow_series[request.param].copy() - - _index_or_series_objs = {**indices_dict, **_series, **_narrow_series} @@ -712,11 +703,6 @@ def index_or_series_obj(request): # ---------------------------------------------------------------- # DataFrames # ---------------------------------------------------------------- -@pytest.fixture -def empty_frame(): - return DataFrame() - - @pytest.fixture def int_frame(): """ diff --git a/pandas/core/api.py b/pandas/core/api.py index a03293ce131449..cf082d2013d3be 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -1,4 +1,4 @@ -# flake8: noqa +# flake8: noqa:F401 from pandas._libs import ( NaT, diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 4d1fb8f33e5ad3..e26bb9fb6ebadb 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -80,7 +80,8 @@ def _check_comparison_types( f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" ) - if not regex: + if not regex or not should_use_regex(regex, b): + # TODO: should use missing.mask_missing? 
op = lambda x: operator.eq(x, b) else: op = np.vectorize( diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index c496099e3a8d23..9a646ddc6ca7e7 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -326,13 +326,16 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) reconstruct_kwargs = {} def reconstruct(result): + if ufunc.nout > 1: + # np.modf, np.frexp, np.divmod + return tuple(_reconstruct(x) for x in result) + + return _reconstruct(result) + + def _reconstruct(result): if lib.is_scalar(result): return result - if isinstance(result, tuple): - # np.modf, np.frexp, np.divmod - return tuple(reconstruct(x) for x in result) - if result.ndim != self.ndim: if method == "outer": if self.ndim == 2: @@ -367,10 +370,12 @@ def reconstruct(result): return result if "out" in kwargs: + # e.g. test_multiindex_get_loc result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs) return reconstruct(result) if method == "reduce": + # e.g. test.series.test_ufunc.test_reduce result = dispatch_reduction_ufunc(self, ufunc, method, *inputs, **kwargs) if result is not NotImplemented: return result diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 09cdb985ddb2e7..d501af6212ce3f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -23,7 +23,6 @@ from pandas.core.dtypes.common import ( is_bool_dtype, - is_float, is_float_dtype, is_integer_dtype, is_list_like, @@ -532,35 +531,5 @@ def _arith_method(self, other, op): return self._maybe_mask_result(result, mask, other, op_name) - def _maybe_mask_result(self, result, mask, other, op_name: str): - """ - Parameters - ---------- - result : array-like - mask : array-like bool - other : scalar or array-like - op_name : str - """ - # if we have a float operand we are by-definition - # a float result - # or our op is a divide - if (is_float_dtype(other) or is_float(other)) or ( - op_name in ["rtruediv", "truediv"] - ): - from pandas.core.arrays import FloatingArray - - return FloatingArray(result, mask, copy=False) - - elif is_bool_dtype(result): - return BooleanArray(result, mask, copy=False) - - elif is_integer_dtype(result): - from pandas.core.arrays import IntegerArray - - return IntegerArray(result, mask, copy=False) - else: - result[mask] = np.nan - return result - def __abs__(self): return self.copy() diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 67dc6ade252547..f9d066f1e694d3 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -422,7 +422,14 @@ def __init__( # We remove null values here, then below will re-insert # them, grep "full_codes" arr_list = [values[idx] for idx in np.where(~null_mask)[0]] - arr = sanitize_array(arr_list, None) + + # GH#44900 Do not cast to float if we have only missing values + if arr_list or arr.dtype == "object": + sanitize_dtype = None + else: + sanitize_dtype = arr.dtype + + arr = sanitize_array(arr_list, None, dtype=sanitize_dtype) values = arr if dtype.categories is None: @@ -2377,7 +2384,7 @@ def describe(self): return result - def isin(self, values) -> np.ndarray: + def isin(self, values) -> npt.NDArray[np.bool_]: """ Check whether `values` are contained in Categorical. 
@@ -2394,7 +2401,7 @@ def isin(self, values) -> np.ndarray: Returns ------- - isin : numpy.ndarray (bool dtype) + np.ndarray[bool] Raises ------ @@ -2457,6 +2464,16 @@ def replace(self, to_replace, value, inplace: bool = False): [3, 2, 3, 3] Categories (2, int64): [2, 3] """ + # GH#44929 deprecation + warn( + "Categorical.replace is deprecated and will be removed in a future " + "version. Use Series.replace directly instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._replace(to_replace=to_replace, value=value, inplace=inplace) + + def _replace(self, *, to_replace, value, inplace: bool = False): inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1b9759c451765f..6fcba997736072 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -291,7 +291,7 @@ def asi8(self) -> npt.NDArray[np.int64]: # ---------------------------------------------------------------- # Rendering Methods - def _format_native_types(self, na_rep="NaT", date_format=None): + def _format_native_types(self, *, na_rep="NaT", date_format=None): """ Helper method for astype when converting to strings. @@ -338,7 +338,6 @@ def __getitem__( result = cast( "Union[DatetimeLikeArrayT, DTScalarOrNaT]", super().__getitem__(key) ) - result = super().__getitem__(key) if lib.is_scalar(result): return result else: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7bd3403abd5cc7..b3a1a4d3423552 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -670,7 +670,7 @@ def astype(self, dtype, copy: bool = True): @dtl.ravel_compat def _format_native_types( - self, na_rep="NaT", date_format=None, **kwargs + self, *, na_rep="NaT", date_format=None, **kwargs ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_datetime64_from_values diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 396ed7eb4abebc..4c868747fa9300 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -14,7 +14,6 @@ DtypeObj, npt, ) -from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import astype_nansafe @@ -338,43 +337,6 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: def _values_for_argsort(self) -> np.ndarray: return self._data - def sum(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs): - nv.validate_sum((), kwargs) - return super()._reduce("sum", skipna=skipna, min_count=min_count, axis=axis) - - def prod(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs): - nv.validate_prod((), kwargs) - return super()._reduce("prod", skipna=skipna, min_count=min_count, axis=axis) - - def min(self, *, skipna=True, axis: int | None = 0, **kwargs): - nv.validate_min((), kwargs) - return super()._reduce("min", skipna=skipna, axis=axis) - - def max(self, *, skipna=True, axis: int | None = 0, **kwargs): - nv.validate_max((), kwargs) - return super()._reduce("max", skipna=skipna, axis=axis) - - def _maybe_mask_result(self, result, mask, other, op_name: str): - """ - Parameters - ---------- - result : array-like - mask : array-like bool - other : scalar or array-like - op_name : str - """ - # TODO are there cases we don't end up with float? 
- # if we have a float operand we are by-definition - # a float result - # or our op is a divide - # if (is_float_dtype(other) or is_float(other)) or ( - # op_name in ["rtruediv", "truediv"] - # ): - # result[mask] = np.nan - # return result - - return type(self)(result, mask, copy=False) - _dtype_docstring = """ An ExtensionDtype for {dtype} data. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0e82ef731bb634..3587575503d33a 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -5,7 +5,6 @@ import numpy as np from pandas._libs import ( - iNaT, lib, missing as libmissing, ) @@ -16,7 +15,6 @@ DtypeObj, npt, ) -from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly from pandas.core.dtypes.base import ( @@ -26,7 +24,6 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, - is_float, is_float_dtype, is_integer_dtype, is_object_dtype, @@ -411,49 +408,6 @@ def _values_for_argsort(self) -> np.ndarray: data[self._mask] = data.min() - 1 return data - def sum(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs): - nv.validate_sum((), kwargs) - return super()._reduce("sum", skipna=skipna, min_count=min_count, axis=axis) - - def prod(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs): - nv.validate_prod((), kwargs) - return super()._reduce("prod", skipna=skipna, min_count=min_count, axis=axis) - - def min(self, *, skipna=True, axis: int | None = 0, **kwargs): - nv.validate_min((), kwargs) - return super()._reduce("min", skipna=skipna, axis=axis) - - def max(self, *, skipna=True, axis: int | None = 0, **kwargs): - nv.validate_max((), kwargs) - return super()._reduce("max", skipna=skipna, axis=axis) - - def _maybe_mask_result(self, result, mask, other, op_name: str): - """ - Parameters - ---------- - result : array-like - mask : array-like bool - other : scalar or array-like - op_name : str - """ - # if we have a float operand we are by-definition - # a float result - # or our op is a divide - if (is_float_dtype(other) or is_float(other)) or ( - op_name in ["rtruediv", "truediv"] - ): - from pandas.core.arrays import FloatingArray - - return FloatingArray(result, mask, copy=False) - - if result.dtype == "timedelta64[ns]": - from pandas.core.arrays import TimedeltaArray - - result[mask] = iNaT - return TimedeltaArray._simple_new(result) - - return type(self)(result, mask, copy=False) - _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. 
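The sum/prod/min/max and _maybe_mask_result definitions removed here from IntegerArray (and just above from FloatingArray) are re-homed on the shared masked base class later in this diff. A small sketch, assuming a build of this branch, of the user-facing behaviour this consolidation keeps intact:

import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")
print(arr.sum())              # 3 -- missing values are skipped by default
print(arr.min(skipna=False))  # <NA> when missing values are not skipped
print((arr / 2).dtype)        # Float64 -- true division is masked into a FloatingArray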
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2eaa7d3c1fffa6..ea6673fdaf0cf4 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -685,6 +685,13 @@ def _cmp_method(self, other, op): other = pd_array(other) elif not isinstance(other, Interval): # non-interval scalar -> no matches + if other is NA: + # GH#31882 + from pandas.core.arrays import BooleanArray + + arr = np.empty(self.shape, dtype=bool) + mask = np.ones(self.shape, dtype=bool) + return BooleanArray(arr, mask) return invalid_comparison(self, other, op) # determine the dtype of the elements we want to compare @@ -743,7 +750,8 @@ def _cmp_method(self, other, op): if obj is NA: # comparison with np.nan returns NA # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092 - result[i] = op is operator.ne + result = result.astype(object) + result[i] = NA else: raise return result @@ -800,10 +808,12 @@ def min(self, *, axis: int | None = None, skipna: bool = True): if mask.any(): if not skipna: return self._na_value - return self[~mask].min() + obj = self[~mask] + else: + obj = self - indexer = self.argsort()[0] - return self[indexer] + indexer = obj.argsort()[0] + return obj[indexer] def max(self, *, axis: int | None = None, skipna: bool = True): nv.validate_minmax_axis(axis, self.ndim) @@ -815,10 +825,12 @@ def max(self, *, axis: int | None = None, skipna: bool = True): if mask.any(): if not skipna: return self._na_value - return self[~mask].max() + obj = self[~mask] + else: + obj = self - indexer = self.argsort()[-1] - return self[indexer] + indexer = obj.argsort()[-1] + return obj[indexer] def fillna( self: IntervalArrayT, value=None, method=None, limit=None diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index a882fe5d2da217..b9500924159afd 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -12,6 +12,7 @@ import numpy as np from pandas._libs import ( + iNaT, lib, missing as libmissing, ) @@ -39,9 +40,11 @@ is_bool, is_bool_dtype, is_dtype_equal, + is_float, is_float_dtype, is_integer_dtype, is_list_like, + is_numeric_dtype, is_object_dtype, is_scalar, is_string_dtype, @@ -476,7 +479,8 @@ def reconstruct(x): return x result = getattr(ufunc, method)(*inputs2, **kwargs) - if isinstance(result, tuple): + if ufunc.nout > 1: + # e.g. np.divmod return tuple(reconstruct(x) for x in result) else: return reconstruct(result) @@ -543,6 +547,48 @@ def _cmp_method(self, other, op) -> BooleanArray: return BooleanArray(result, mask, copy=False) + def _maybe_mask_result(self, result, mask, other, op_name: str): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + if ( + (is_float_dtype(other) or is_float(other)) + or (op_name in ["rtruediv", "truediv"]) + or (is_float_dtype(self.dtype) and is_numeric_dtype(result.dtype)) + ): + from pandas.core.arrays import FloatingArray + + return FloatingArray(result, mask, copy=False) + + elif is_bool_dtype(result): + from pandas.core.arrays import BooleanArray + + return BooleanArray(result, mask, copy=False) + + elif result.dtype == "timedelta64[ns]": + # e.g. 
test_numeric_arr_mul_tdscalar_numexpr_path + from pandas.core.arrays import TimedeltaArray + + result[mask] = iNaT + return TimedeltaArray._simple_new(result) + + elif is_integer_dtype(result): + from pandas.core.arrays import IntegerArray + + return IntegerArray(result, mask, copy=False) + + else: + result[mask] = np.nan + return result + def isna(self) -> np.ndarray: return self._mask.copy() @@ -738,13 +784,13 @@ def _quantile( return out def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - if name in {"any", "all"}: + if name in {"any", "all", "min", "max", "sum", "prod"}: return getattr(self, name)(skipna=skipna, **kwargs) data = self._data mask = self._mask - if name in {"sum", "prod", "min", "max", "mean"}: + if name in {"mean"}: op = getattr(masked_reductions, name) result = op(data, mask, skipna=skipna, **kwargs) return result @@ -754,6 +800,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): if self._hasna: data = self.to_numpy("float64", na_value=np.nan) + # median, var, std, skew, kurt, idxmin, idxmax op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) @@ -762,6 +809,70 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return result + def _wrap_reduction_result(self, name: str, result, skipna, **kwargs): + if isinstance(result, np.ndarray): + axis = kwargs["axis"] + if skipna: + # we only retain mask for all-NA rows/columns + mask = self._mask.all(axis=axis) + else: + mask = self._mask.any(axis=axis) + + return self._maybe_mask_result(result, mask, other=None, op_name=name) + return result + + def sum(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs): + nv.validate_sum((), kwargs) + + # TODO: do this in validate_sum? + if "out" in kwargs: + # np.sum; test_floating_array_numpy_sum + if kwargs["out"] is not None: + raise NotImplementedError + kwargs.pop("out") + + result = masked_reductions.sum( + self._data, + self._mask, + skipna=skipna, + min_count=min_count, + axis=axis, + ) + return self._wrap_reduction_result( + "sum", result, skipna=skipna, axis=axis, **kwargs + ) + + def prod(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs): + nv.validate_prod((), kwargs) + result = masked_reductions.prod( + self._data, + self._mask, + skipna=skipna, + min_count=min_count, + axis=axis, + ) + return self._wrap_reduction_result( + "prod", result, skipna=skipna, axis=axis, **kwargs + ) + + def min(self, *, skipna=True, axis: int | None = 0, **kwargs): + nv.validate_min((), kwargs) + return masked_reductions.min( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ) + + def max(self, *, skipna=True, axis: int | None = 0, **kwargs): + nv.validate_max((), kwargs) + return masked_reductions.max( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ) + def any(self, *, skipna: bool = True, **kwargs): """ Return whether any element is truthy. diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index b17d40b35903b3..bdd7f8c0d3c8c3 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -14,7 +14,6 @@ missing as libmissing, ) from pandas.compat.numpy import function as nv -from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import ( is_float, @@ -80,9 +79,6 @@ class NumericArray(BaseMaskedArray): Base class for IntegerArray and FloatingArray. 
""" - def _maybe_mask_result(self, result, mask, other, op_name: str): - raise AbstractMethodError(self) - def _arith_method(self, other, op): op_name = op.__name__ omask = None @@ -157,18 +153,6 @@ def _arith_method(self, other, op): _HANDLED_TYPES = (np.ndarray, numbers.Number) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - result = super()._reduce(name, skipna=skipna, **kwargs) - if isinstance(result, np.ndarray): - axis = kwargs["axis"] - if skipna: - # we only retain mask for all-NA rows/columns - mask = self._mask.all(axis=axis) - else: - mask = self._mask.any(axis=axis) - return type(self)(result, mask=mask) - return result - def __neg__(self): return type(self)(-self._data, self._mask.copy()) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 0afe204b35c682..579d77369d27c3 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -18,6 +18,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ( + arraylike, nanops, ops, ) @@ -137,22 +138,19 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # The primary modification is not boxing scalar return values # in PandasArray, since pandas' ExtensionArrays are 1-d. out = kwargs.get("out", ()) - for x in inputs + out: - # Only support operations with instances of _HANDLED_TYPES. - # Use PandasArray instead of type(self) for isinstance to - # allow subclasses that don't override __array_ufunc__ to - # handle PandasArray objects. - if not isinstance(x, self._HANDLED_TYPES + (PandasArray,)): - return NotImplemented - - if ufunc not in [np.logical_or, np.bitwise_or, np.bitwise_xor]: - # For binary ops, use our custom dunder methods - # We haven't implemented logical dunder funcs, so exclude these - # to avoid RecursionError - result = ops.maybe_dispatch_ufunc_to_dunder_op( + + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if method == "reduce": + result = arraylike.dispatch_reduction_ufunc( self, ufunc, method, *inputs, **kwargs ) if result is not NotImplemented: + # e.g. tests.series.test_ufunc.TestNumpyReductions return result # Defer to the implementation of the ufunc on unwrapped values. @@ -163,23 +161,22 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): ) result = getattr(ufunc, method)(*inputs, **kwargs) - if type(result) is tuple and len(result): - # multiple return values - if not lib.is_scalar(result[0]): - # re-box array-like results - return tuple(type(self)(x) for x in result) - else: - # but not scalar reductions - return result + if ufunc.nout > 1: + # multiple return values; re-box array-like results + return tuple(type(self)(x) for x in result) elif method == "at": # no return value return None - else: - # one return value - if not lib.is_scalar(result): - # re-box array-like results, but not scalar reductions - result = type(self)(result) + elif method == "reduce": + if isinstance(result, np.ndarray): + # e.g. test_np_reduce_2d + return type(self)(result) + + # e.g. 
test_np_max_nested_tuples return result + else: + # one return value; re-box array-like results + return type(self)(result) # ------------------------------------------------------------------------ # Pandas ExtensionArray Interface diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 01018c7263f322..6112ccccb89ffb 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -632,7 +632,7 @@ def _formatter(self, boxed: bool = False): @dtl.ravel_compat def _format_native_types( - self, na_rep="NaT", date_format=None, **kwargs + self, *, na_rep="NaT", date_format=None, **kwargs ) -> np.ndarray: """ actually format my specific types diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4cad08b23d6452..538d4e7e4a7aa8 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -220,6 +220,16 @@ def _sparse_array_op( left_sp_values = left.sp_values right_sp_values = right.sp_values + if ( + name in ["floordiv", "mod"] + and (right == 0).any() + and left.dtype.kind in ["i", "u"] + ): + # Match the non-Sparse Series behavior + opname = f"sparse_{name}_float64" + left_sp_values = left_sp_values.astype("float64") + right_sp_values = right_sp_values.astype("float64") + sparse_op = getattr(splib, opname) with np.errstate(all="ignore"): @@ -232,6 +242,15 @@ def _sparse_array_op( right.fill_value, ) + if name == "divmod": + # result is a 2-tuple + # error: Incompatible return value type (got "Tuple[SparseArray, + # SparseArray]", expected "SparseArray") + return ( # type: ignore[return-value] + _wrap_result(name, result[0], index, fill[0], dtype=result_dtype), + _wrap_result(name, result[1], index, fill[1], dtype=result_dtype), + ) + if result_dtype is None: result_dtype = result.dtype @@ -1234,30 +1253,8 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True): else: return self.copy() dtype = self.dtype.update_dtype(dtype) - # error: Item "ExtensionDtype" of "Union[ExtensionDtype, str, dtype[Any], - # Type[str], Type[float], Type[int], Type[complex], Type[bool], Type[object], - # None]" has no attribute "_subtype_with_str" - # error: Item "str" of "Union[ExtensionDtype, str, dtype[Any], Type[str], - # Type[float], Type[int], Type[complex], Type[bool], Type[object], None]" has no - # attribute "_subtype_with_str" - # error: Item "dtype[Any]" of "Union[ExtensionDtype, str, dtype[Any], Type[str], - # Type[float], Type[int], Type[complex], Type[bool], Type[object], None]" has no - # attribute "_subtype_with_str" - # error: Item "ABCMeta" of "Union[ExtensionDtype, str, dtype[Any], Type[str], - # Type[float], Type[int], Type[complex], Type[bool], Type[object], None]" has no - # attribute "_subtype_with_str" - # error: Item "type" of "Union[ExtensionDtype, str, dtype[Any], Type[str], - # Type[float], Type[int], Type[complex], Type[bool], Type[object], None]" has no - # attribute "_subtype_with_str" - # error: Item "None" of "Union[ExtensionDtype, str, dtype[Any], Type[str], - # Type[float], Type[int], Type[complex], Type[bool], Type[object], None]" has no - # attribute "_subtype_with_str" - subtype = pandas_dtype(dtype._subtype_with_str) # type: ignore[union-attr] - # TODO copy=False is broken for astype_nansafe with int -> float, so cannot - # passthrough copy keyword: https://github.com/pandas-dev/pandas/issues/34456 - sp_values = astype_nansafe(self.sp_values, subtype, copy=True) - if sp_values is self.sp_values and copy: - sp_values = sp_values.copy() + subtype = 
pandas_dtype(dtype._subtype_with_str) + sp_values = astype_nansafe(self.sp_values, subtype, copy=copy) # error: Argument 1 to "_simple_new" of "SparseArray" has incompatible type # "ExtensionArray"; expected "ndarray" @@ -1501,49 +1498,50 @@ def mean(self, axis=0, *args, **kwargs): nsparse = self.sp_index.ngaps return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) - def max(self, axis: int = 0, *args, **kwargs) -> Scalar: + def max(self, *, axis: int | None = None, skipna: bool = True): """ - Max of non-NA/null values + Max of array values, ignoring NA values if specified. Parameters ---------- axis : int, default 0 Not Used. NumPy compatibility. - *args, **kwargs - Not Used. NumPy compatibility. + skipna : bool, default True + Whether to ignore NA values. Returns ------- scalar """ - nv.validate_max(args, kwargs) - return self._min_max("max") + nv.validate_minmax_axis(axis, self.ndim) + return self._min_max("max", skipna=skipna) - def min(self, axis: int = 0, *args, **kwargs) -> Scalar: + def min(self, *, axis: int | None = None, skipna: bool = True): """ - Min of non-NA/null values + Min of array values, ignoring NA values if specified. Parameters ---------- axis : int, default 0 Not Used. NumPy compatibility. - *args, **kwargs - Not Used. NumPy compatibility. + skipna : bool, default True + Whether to ignore NA values. Returns ------- scalar """ - nv.validate_min(args, kwargs) - return self._min_max("min") + nv.validate_minmax_axis(axis, self.ndim) + return self._min_max("min", skipna=skipna) - def _min_max(self, kind: Literal["min", "max"]) -> Scalar: + def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar: """ Min/max of non-NA/null values Parameters ---------- kind : {"min", "max"} + skipna : bool Returns ------- @@ -1551,6 +1549,7 @@ def _min_max(self, kind: Literal["min", "max"]) -> Scalar: """ valid_vals = self._valid_sp_values has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0 + if len(valid_vals) > 0: sp_min_max = getattr(valid_vals, kind)() @@ -1558,12 +1557,17 @@ def _min_max(self, kind: Literal["min", "max"]) -> Scalar: if has_nonnull_fill_vals: func = max if kind == "max" else min return func(sp_min_max, self.fill_value) - else: + elif skipna: return sp_min_max + elif self.sp_index.ngaps == 0: + # No NAs present + return sp_min_max + else: + return na_value_for_dtype(self.dtype.subtype, compat=False) elif has_nonnull_fill_vals: return self.fill_value else: - return na_value_for_dtype(self.dtype.subtype) + return na_value_for_dtype(self.dtype.subtype, compat=False) # ------------------------------------------------------------------------ # Ufuncs @@ -1590,7 +1594,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): sp_values = getattr(ufunc, method)(self.sp_values, **kwargs) fill_value = getattr(ufunc, method)(self.fill_value, **kwargs) - if isinstance(sp_values, tuple): + if ufunc.nout > 1: # multiple outputs. e.g. modf arrays = tuple( self._simple_new( @@ -1599,7 +1603,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): for sp_value, fv in zip(sp_values, fill_value) ) return arrays - elif is_scalar(sp_values): + elif method == "reduce": # e.g. 
reductions return sp_values @@ -1613,7 +1617,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): out = out[0] return out - if type(result) is tuple: + if ufunc.nout > 1: return tuple(type(self)(x) for x in result) elif method == "at": # no return value @@ -1649,7 +1653,6 @@ def _arith_method(self, other, op): else: other = np.asarray(other) with np.errstate(all="ignore"): - # TODO: look into _wrap_result if len(self) != len(other): raise AssertionError( f"length mismatch: {len(self)} vs. {len(other)}" diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index b242e13229519c..508be1eb664386 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -305,7 +305,7 @@ def is_dtype(cls, dtype: object) -> bool: return True return isinstance(dtype, np.dtype) or dtype == "Sparse" - def update_dtype(self, dtype): + def update_dtype(self, dtype) -> SparseDtype: """ Convert the SparseDtype to a new dtype. diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index df71501d55b203..c6987d9a11e4c2 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -512,7 +512,9 @@ def _cmp_method(self, other, op): # ------------------------------------------------------------------------ # String methods interface - _str_na_value = StringDtype.na_value + # error: Incompatible types in assignment (expression has type "NAType", + # base class "PandasArray" defined the type as "float") + _str_na_value = StringDtype.na_value # type: ignore[assignment] def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0a6bc97237ddd8..53fc38a9731100 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -367,7 +367,7 @@ def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, ArrowStringArray): result = pc_func(self._data, other._data) - elif isinstance(other, np.ndarray): + elif isinstance(other, (np.ndarray, list)): result = pc_func(self._data, other) elif is_scalar(other): try: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 8fe330d0d41ddb..4e58ebc518bb4b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -426,7 +426,7 @@ def _formatter(self, boxed: bool = False): @dtl.ravel_compat def _format_native_types( - self, na_rep="NaT", date_format=None, **kwargs + self, *, na_rep="NaT", date_format=None, **kwargs ) -> np.ndarray: from pandas.io.formats.format import get_format_timedelta64 diff --git a/pandas/core/base.py b/pandas/core/base.py index 9040414a8f35fa..45a9b92d94b62f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -235,7 +235,7 @@ def __getitem__(self, key): raise IndexError(f"Column(s) {self._selection} already selected") if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)): - if len(self.obj.columns.intersection(key)) != len(key): + if len(self.obj.columns.intersection(key)) != len(set(key)): bad_keys = list(set(key).difference(self.obj.columns)) raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") return self._gotitem(list(key), ndim=2) diff --git a/pandas/core/common.py b/pandas/core/common.py index 590296c4b12f5e..2ebdfccc88f4e4 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -18,8 +18,10 @@ Any, Callable, Collection, + Hashable, Iterable, Iterator, + Sequence, 
cast, overload, ) @@ -604,3 +606,22 @@ def is_builtin_func(arg): otherwise return the arg """ return _builtin_table.get(arg, arg) + + +def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]: + """ + If a name is missing then replace it by level_n, where n is the count + + .. versionadded:: 1.4.0 + + Parameters + ---------- + names : list-like + list of column names or None values. + + Returns + ------- + list + list of column names with the None values replaced. + """ + return [f"level_{i}" if name is None else name for i, name in enumerate(names)] diff --git a/pandas/core/computation/api.py b/pandas/core/computation/api.py index 31e8a4873b0ad4..bd3be5b3f8c422 100644 --- a/pandas/core/computation/api.py +++ b/pandas/core/computation/api.py @@ -1,3 +1,2 @@ -# flake8: noqa - +__all__ = ["eval"] from pandas.core.computation.eval import eval diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 426cd8fd81f284..d4fbe226a3ae28 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -133,18 +133,13 @@ def __init__( # shallow copy here because we don't want to replace what's in # scope when we align terms (alignment accesses the underlying # numpy array of pandas objects) - - # error: Incompatible types in assignment (expression has type - # "ChainMap[str, Any]", variable has type "DeepChainMap[str, Any]") - self.scope = self.scope.new_child( # type: ignore[assignment] - (global_dict or frame.f_globals).copy() - ) + scope_global = self.scope.new_child((global_dict or frame.f_globals).copy()) + self.scope = DeepChainMap(scope_global) if not isinstance(local_dict, Scope): - # error: Incompatible types in assignment (expression has type - # "ChainMap[str, Any]", variable has type "DeepChainMap[str, Any]") - self.scope = self.scope.new_child( # type: ignore[assignment] + scope_local = self.scope.new_child( (local_dict or frame.f_locals).copy() ) + self.scope = DeepChainMap(scope_local) finally: del frame @@ -257,9 +252,7 @@ def _get_vars(self, stack, scopes: list[str]) -> None: for scope, (frame, _, _, _, _, _) in variables: try: d = getattr(frame, "f_" + scope) - # error: Incompatible types in assignment (expression has type - # "ChainMap[str, Any]", variable has type "DeepChainMap[str, Any]") - self.scope = self.scope.new_child(d) # type: ignore[assignment] + self.scope = DeepChainMap(self.scope.new_child(d)) finally: # won't remove it, but DECREF it # in Py3 this probably isn't necessary since frame won't be diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 8261162afe1871..cf8cd070ec5627 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -528,6 +528,8 @@ def sanitize_array( # GH#846 if isinstance(data, np.ndarray): + if isinstance(data, np.matrix): + data = data.A if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index 051affd0af1f9a..bb6bfda1838028 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -1,4 +1,4 @@ -# flake8: noqa +# flake8: noqa:F401 from pandas.core.dtypes.common import ( is_array_like, diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 79ea7731466d46..b70ea9f816aef4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -248,7 +248,7 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi if isinstance(dtype, str): if dtype 
== "infer": - inferred_type = lib.infer_dtype(ensure_object(result), skipna=False) + inferred_type = lib.infer_dtype(result, skipna=False) if inferred_type == "boolean": dtype = "bool" elif inferred_type == "integer": @@ -912,6 +912,10 @@ def maybe_upcast( # We get a copy in all cases _except_ (values.dtype == new_dtype and not copy) upcast_values = values.astype(new_dtype, copy=copy) + # error: Incompatible return value type (got "Tuple[ndarray[Any, dtype[Any]], + # Union[Union[str, int, float, bool] Union[Period, Timestamp, Timedelta, Any]]]", + # expected "Tuple[NumpyArrayT, Union[Union[str, int, float, bool], Union[Period, + # Timestamp, Timedelta, Any]]]") return upcast_values, fill_value # type: ignore[return-value] diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 72cc28c1dd66d4..1b3c217f1293b1 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -613,7 +613,7 @@ def is_dtype_equal(source, target) -> bool: src = get_dtype(source) if isinstance(src, ExtensionDtype): return src == target - except (TypeError, AttributeError): + except (TypeError, AttributeError, ImportError): return False elif isinstance(source, str): return is_dtype_equal(target, source) @@ -622,7 +622,7 @@ def is_dtype_equal(source, target) -> bool: source = get_dtype(source) target = get_dtype(target) return source == target - except (TypeError, AttributeError): + except (TypeError, AttributeError, ImportError): # invalid comparison # object == category will hit this @@ -1318,7 +1318,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: except (TypeError, ValueError): return False - if isinstance(arr_or_dtype, CategoricalDtype): + if isinstance(dtype, CategoricalDtype): arr_or_dtype = arr_or_dtype.categories # now we use the special definition for Index @@ -1329,7 +1329,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: # so its object, we need to infer to # guess this return arr_or_dtype.is_object() and arr_or_dtype.inferred_type == "boolean" - elif is_extension_array_dtype(arr_or_dtype): + elif isinstance(dtype, ExtensionDtype): return getattr(dtype, "_is_boolean", False) return issubclass(dtype.type, np.bool_) @@ -1408,11 +1408,12 @@ def is_1d_only_ea_obj(obj: Any) -> bool: from pandas.core.arrays import ( DatetimeArray, ExtensionArray, + PeriodArray, TimedeltaArray, ) return isinstance(obj, ExtensionArray) and not isinstance( - obj, (DatetimeArray, TimedeltaArray) + obj, (DatetimeArray, TimedeltaArray, PeriodArray) ) @@ -1424,7 +1425,9 @@ def is_1d_only_ea_dtype(dtype: DtypeObj | None) -> bool: # here too. # NB: need to check DatetimeTZDtype and not is_datetime64tz_dtype # to exclude ArrowTimestampUSDtype - return isinstance(dtype, ExtensionDtype) and not isinstance(dtype, DatetimeTZDtype) + return isinstance(dtype, ExtensionDtype) and not isinstance( + dtype, (DatetimeTZDtype, PeriodDtype) + ) def is_extension_array_dtype(arr_or_dtype) -> bool: diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 71da0a4b20b410..e74d73b84e94b2 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -876,15 +876,15 @@ def freq(self): @classmethod def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset: - if isinstance(freq, str): + if isinstance(freq, str): # note: freq is already of type str! 
if freq.startswith("period[") or freq.startswith("Period["): m = cls._match.search(freq) if m is not None: freq = m.group("freq") - freq = to_offset(freq) - if freq is not None: - return freq + freq_offset = to_offset(freq) + if freq_offset is not None: + return freq_offset raise ValueError("could not construct PeriodDtype") diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 656441b6a51366..4e3306e84c1a14 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -241,7 +241,10 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): if inf_as_na and is_categorical_dtype(dtype): result = libmissing.isnaobj(values.to_numpy(), inf_as_na=inf_as_na) else: - result = values.isna() + # error: Incompatible types in assignment (expression has type + # "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has + # type "ndarray[Any, dtype[bool_]]") + result = values.isna() # type: ignore[assignment] elif is_string_or_object_np_dtype(values.dtype): result = _isna_string_dtype(values, inf_as_na=inf_as_na) elif needs_i8_conversion(dtype): @@ -479,9 +482,17 @@ def _array_equivalent_datetimelike(left, right): return np.array_equal(left.view("i8"), right.view("i8")) -def _array_equivalent_object(left, right, strict_nan): +def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool): if not strict_nan: # isna considers NaN and None to be equivalent. + + if left.flags["F_CONTIGUOUS"] and right.flags["F_CONTIGUOUS"]: + # we can improve performance by doing a copy-free ravel + # e.g. in frame_methods.Equals.time_frame_nonunique_equal + # if we transposed the frames + left = left.ravel("K") + right = right.ravel("K") + return lib.array_equivalent_object( ensure_object(left.ravel()), ensure_object(right.ravel()) ) @@ -501,10 +512,7 @@ def _array_equivalent_object(left, right, strict_nan): if np.any(np.asarray(left_value != right_value)): return False except TypeError as err: - if "Cannot compare tz-naive" in str(err): - # tzawareness compat failure, see GH#28507 - return False - elif "boolean value of NA is ambiguous" in str(err): + if "boolean value of NA is ambiguous" in str(err): return False raise return True diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 01f817300a01af..3cd787748738e5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2370,11 +2370,7 @@ def to_records( index_names = list(self.index.names) if isinstance(self.index, MultiIndex): - count = 0 - for i, n in enumerate(index_names): - if n is None: - index_names[i] = f"level_{count}" - count += 1 + index_names = com.fill_missing_names(index_names) elif index_names[0] is None: index_names = ["index"] @@ -5796,10 +5792,7 @@ class max type if not drop: to_insert: Iterable[tuple[Any, Any | None]] if isinstance(self.index, MultiIndex): - names = [ - (n if n is not None else f"level_{i}") - for i, n in enumerate(self.index.names) - ] + names = com.fill_missing_names(self.index.names) to_insert = zip(self.index.levels, self.index.codes) else: default = "index" if "index" not in self else "level_0" @@ -5998,14 +5991,15 @@ def dropna( raise KeyError(np.array(subset)[check].tolist()) agg_obj = self.take(indices, axis=agg_axis) - count = agg_obj.count(axis=agg_axis) - if thresh is not None: + count = agg_obj.count(axis=agg_axis) mask = count >= thresh elif how == "any": - mask = count == len(agg_obj._get_axis(agg_axis)) + # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]' + mask = 
notna(agg_obj).all(axis=agg_axis, bool_only=False) elif how == "all": - mask = count > 0 + # faster equivalent to 'agg_obj.count(agg_axis) > 0' + mask = notna(agg_obj).any(axis=agg_axis, bool_only=False) else: if how is not None: raise ValueError(f"invalid how option: {how}") @@ -6790,7 +6784,8 @@ def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFra 'columns' for column-wise.""" ), examples=dedent( - """Examples + """\ + Examples -------- >>> df = pd.DataFrame( ... {"Grade": ["A", "B", "A", "C"]}, @@ -10034,6 +10029,34 @@ def _get_data() -> DataFrame: result = self._constructor_sliced(result, index=labels) return result + def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: + """ + Special case for _reduce to try to avoid a potentially-expensive transpose. + + Apply the reduction block-wise along axis=1 and then reduce the resulting + 1D arrays. + """ + if name == "all": + result = np.ones(len(self), dtype=bool) + ufunc = np.logical_and + elif name == "any": + result = np.zeros(len(self), dtype=bool) + # error: Incompatible types in assignment + # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'], + # Literal[20], Literal[False]]", variable has type + # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20], + # Literal[True]]") + ufunc = np.logical_or # type: ignore[assignment] + else: + raise NotImplementedError(name) + + for arr in self._mgr.arrays: + middle = func(arr, axis=0, skipna=skipna) + result = ufunc(result, middle) + + res_ser = self._constructor_sliced(result, index=self.index) + return res_ser + def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ Count number of distinct elements in specified axis. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 57f151feeae80e..e66086faf53af8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -118,6 +118,7 @@ nanops, ) import pandas.core.algorithms as algos +from pandas.core.array_algos.replace import should_use_regex from pandas.core.arrays import ExtensionArray from pandas.core.base import PandasObject import pandas.core.common as com @@ -2567,7 +2568,7 @@ def to_json( "primaryKey": [ "index" ], - "pandas_version": "0.20.0" + "pandas_version": "1.4.0" }}, "data": [ {{ @@ -6688,9 +6689,17 @@ def replace( return self._replace_columnwise(mapping, inplace, regex) elif not is_list_like(value): # NA -> 0 - new_data = self._mgr.replace( - to_replace=to_replace, value=value, inplace=inplace, regex=regex - ) + regex = should_use_regex(regex, to_replace) + if regex: + new_data = self._mgr.replace_regex( + to_replace=to_replace, + value=value, + inplace=inplace, + ) + else: + new_data = self._mgr.replace( + to_replace=to_replace, value=value, inplace=inplace + ) else: raise TypeError( f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}' @@ -10349,6 +10358,21 @@ def _logical_func( ) return res._logical_func(name, func, skipna=skipna, **kwargs) + if ( + self.ndim > 1 + and axis == 1 + and len(self._mgr.arrays) > 1 + # TODO(EA2D): special-case not needed + and all(x.ndim == 2 for x in self._mgr.arrays) + and bool_only is not None + and not kwargs + ): + # Fastpath avoiding potentially expensive transpose + obj = self + if bool_only: + obj = self._get_bool_data() + return obj._reduce_axis1(name, func, skipna=skipna) + return self._reduce( func, name=name, diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 986aaa07a913c6..48faa1fc467594 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ 
-143,6 +143,7 @@ class OutputKey: "take", "transform", "sample", + "value_counts", ] ) # Valid values of `name` for `groupby.transform(name)` diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4535010b29c3a1..9b341845c7170f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -17,6 +17,7 @@ Iterable, Mapping, NamedTuple, + Sequence, TypeVar, Union, cast, @@ -76,6 +77,7 @@ _transform_template, warn_dropping_nuisance_columns_deprecated, ) +from pandas.core.groupby.grouper import get_grouper from pandas.core.indexes.api import ( Index, MultiIndex, @@ -1569,6 +1571,193 @@ def func(df): boxplot = boxplot_frame_groupby + def value_counts( + self, + subset: Sequence[Hashable] | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrame | Series: + """ + Return a Series or DataFrame containing counts of unique rows. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don’t include counts of rows that contain NA values. + + Returns + ------- + Series or DataFrame + Series if the groupby as_index is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. + + Notes + ----- + - If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will have an + additional column with the value_counts. The column is labelled 'count' or + 'proportion', depending on the ``normalize`` parameter. + + By default, rows that contain any NA values are omitted from + the result. + + By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... 
}) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ + if self.axis == 1: + raise NotImplementedError( + "DataFrameGroupBy.value_counts only handles axis=0" + ) + + with self._group_selection_context(): + df = self.obj + + in_axis_names = { + grouping.name for grouping in self.grouper.groupings if grouping.in_axis + } + if isinstance(self._selected_obj, Series): + name = self._selected_obj.name + keys = [] if name in in_axis_names else [self._selected_obj] + else: + keys = [ + # Can't use .values because the column label needs to be preserved + self._selected_obj.iloc[:, idx] + for idx, name in enumerate(self._selected_obj.columns) + if name not in in_axis_names + ] + + if subset is not None: + clashing = set(subset) & set(in_axis_names) + if clashing: + raise ValueError( + f"Keys {clashing} in subset cannot be in " + "the groupby column keys" + ) + + groupings = list(self.grouper.groupings) + for key in keys: + grouper, _, _ = get_grouper( + df, + key=key, + axis=self.axis, + sort=self.sort, + dropna=dropna, + ) + groupings += list(grouper.groupings) + + # Take the size of the overall columns + gb = df.groupby( + groupings, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ) + result = cast(Series, gb.size()) + + if normalize: + # Normalize the results by dividing by the original group sizes. + # We are guaranteed to have the first N levels be the + # user-requested grouping. 
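# Editorial worked example of this step, using the docstring data above: the
# 'male' group has four rows, so with normalize=True the count of 2 for
# ('low', 'FR') becomes 2 / 4 = 0.50, and the counts of 1 for ('low', 'US')
# and ('medium', 'FR') become 1 / 4 = 0.25 each, matching the normalize=True
# output shown in the Examples section.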
+ levels = list(range(len(self.grouper.groupings), result.index.nlevels)) + indexed_group_size = result.groupby( + result.index.droplevel(levels), + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ).transform("sum") + + result /= indexed_group_size + + if sort: + # Sort the values and then resort by the main grouping + index_level = range(len(self.grouper.groupings)) + result = result.sort_values(ascending=ascending).sort_index( + level=index_level, sort_remaining=False + ) + + if not self.as_index: + # Convert to frame + result = result.reset_index(name="proportion" if normalize else "count") + return result.__finalize__(self.obj, method="value_counts") + def _wrap_transform_general_frame( obj: DataFrame, group: DataFrame, res: DataFrame | Series diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2876ec1cb5a0d1..a1866e3bdc9f62 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1272,6 +1272,7 @@ def _numba_agg_general( func: Callable, engine_kwargs: dict[str, bool] | None, numba_cache_key_str: str, + *aggregator_args, ): """ Perform groupby with a standard numerical aggregation function (e.g. mean) @@ -1291,7 +1292,7 @@ def _numba_agg_general( aggregator = executor.generate_shared_aggregator( func, engine_kwargs, numba_cache_key_str ) - result = aggregator(sorted_data, starts, ends, 0) + result = aggregator(sorted_data, starts, ends, 0, *aggregator_args) cache_key = (func, numba_cache_key_str) if cache_key not in NUMBA_FUNC_CACHE: @@ -1989,7 +1990,12 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): @final @Substitution(name="groupby") @Appender(_common_see_also) - def std(self, ddof: int = 1): + def std( + self, + ddof: int = 1, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): """ Compute standard deviation of groups, excluding missing values. @@ -2000,23 +2006,52 @@ def std(self, ddof: int = 1): ddof : int, default 1 Degrees of freedom. + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + Returns ------- Series or DataFrame Standard deviation of values within each group. 
""" - return self._get_cythonized_result( - libgroupby.group_var, - needs_counts=True, - cython_dtype=np.dtype(np.float64), - post_processing=lambda vals, inference: np.sqrt(vals), - ddof=ddof, - ) + if maybe_use_numba(engine): + from pandas.core._numba.kernels import sliding_var + + return np.sqrt( + self._numba_agg_general(sliding_var, engine_kwargs, "groupby_std", ddof) + ) + else: + return self._get_cythonized_result( + libgroupby.group_var, + needs_counts=True, + cython_dtype=np.dtype(np.float64), + post_processing=lambda vals, inference: np.sqrt(vals), + ddof=ddof, + ) @final @Substitution(name="groupby") @Appender(_common_see_also) - def var(self, ddof: int = 1): + def var( + self, + ddof: int = 1, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): """ Compute variance of groups, excluding missing values. @@ -2027,20 +2062,46 @@ def var(self, ddof: int = 1): ddof : int, default 1 Degrees of freedom. + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + Returns ------- Series or DataFrame Variance of values within each group. """ - if ddof == 1: - numeric_only = self._resolve_numeric_only(lib.no_default) - return self._cython_agg_general( - "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only + if maybe_use_numba(engine): + from pandas.core._numba.kernels import sliding_var + + return self._numba_agg_general( + sliding_var, engine_kwargs, "groupby_var", ddof ) else: - func = lambda x: x.var(ddof=ddof) - with self._group_selection_context(): - return self._python_agg_general(func) + if ddof == 1: + numeric_only = self._resolve_numeric_only(lib.no_default) + return self._cython_agg_general( + "var", + alt=lambda x: Series(x).var(ddof=ddof), + numeric_only=numeric_only, + ) + else: + func = lambda x: x.var(ddof=ddof) + with self._group_selection_context(): + return self._python_agg_general(func) @final @Substitution(name="groupby") @@ -2102,22 +2163,35 @@ def size(self) -> DataFrame | Series: @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum( - self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + self, + numeric_only: bool | lib.NoDefault = lib.no_default, + min_count: int = 0, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, ): - numeric_only = self._resolve_numeric_only(numeric_only) + if maybe_use_numba(engine): + from pandas.core._numba.kernels import sliding_sum - # If we are grouping on categoricals we want unobserved categories to - # return zero, rather than the default of NaN which the reindexing in - # _agg_general() returns. 
GH #31422 - with com.temp_setattr(self, "observed", True): - result = self._agg_general( - numeric_only=numeric_only, - min_count=min_count, - alias="add", - npfunc=np.sum, + return self._numba_agg_general( + sliding_sum, + engine_kwargs, + "groupby_sum", ) + else: + numeric_only = self._resolve_numeric_only(numeric_only) - return self._reindex_output(result, fill_value=0) + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _agg_general() returns. GH #31422 + with com.temp_setattr(self, "observed", True): + result = self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="add", + npfunc=np.sum, + ) + + return self._reindex_output(result, fill_value=0) @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index a05f8e581d12fe..1e6515084d3b71 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -800,7 +800,7 @@ def get_grouper( # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) - any_groupers = any(isinstance(g, Grouper) for g in keys) + any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7915e107afae63..5e7882b8b4a1ef 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -500,9 +500,10 @@ def _call_cython_op( elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): - # e.g. uint8 -> uint64, int16 -> int64 - dtype_str = dtype.kind + "8" - values = values.astype(dtype_str, copy=False) + # GH#43329 If the dtype is explicitly of type uint64 the type is not + # changed to prevent overflow. + if dtype != np.uint64: + values = values.astype(np.int64, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9ce7c943214b3a..263a046f591217 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1066,6 +1066,8 @@ def take( ): if kwargs: nv.validate_take((), kwargs) + if is_scalar(indices): + raise TypeError("Expected indices to be array-like") indices = ensure_platform_int(indices) allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) @@ -1176,6 +1178,9 @@ def copy( names : list-like, optional Kept for compatibility with MultiIndex. Should not be used. + .. deprecated:: 1.4.0 + use ``name`` instead. + Returns ------- Index @@ -1186,6 +1191,14 @@ def copy( In most cases, there should be no functional difference from using ``deep``, but if ``deep`` is passed it will attempt to deepcopy. """ + if names is not None: + warnings.warn( + "parameter names is deprecated and will be removed in a future " + "version. 
Use the name parameter instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = self._validate_names(name=name, names=names, deep=deep)[0] if deep: new_data = self._data.copy() @@ -1385,7 +1398,7 @@ def to_native_types(self, slicer=None, **kwargs) -> np.ndarray: values = values[slicer] return values._format_native_types(**kwargs) - def _format_native_types(self, na_rep="", quoting=None, **kwargs): + def _format_native_types(self, *, na_rep="", quoting=None, **kwargs): """ Actually format specific types of the index. """ @@ -2721,13 +2734,18 @@ def fillna(self, value=None, downcast=None): DataFrame.fillna : Fill NaN values of a DataFrame. Series.fillna : Fill NaN Values of a Series. """ + value = self._require_scalar(value) if self.hasnans: result = self.putmask(self._isnan, value) if downcast is None: # no need to care metadata other than name - # because it can't have freq if + # because it can't have freq if it has NaTs return Index._with_infer(result, name=self.name) + raise NotImplementedError( + f"{type(self).__name__}.fillna does not support 'downcast' " + "argument values other than 'None'." + ) return self._view() def dropna(self: _IndexT, how: str_t = "any") -> _IndexT: @@ -4922,7 +4940,8 @@ def __getitem__(self, key): """ getitem = self._data.__getitem__ - if is_scalar(key): + if is_integer(key) or is_float(key): + # GH#44051 exclude bool, which would return a 2d ndarray key = com.cast_scalar_indexer(key, warn_float=True) return getitem(key) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f26a24c38b19fb..9ef43c740d6024 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -377,20 +377,6 @@ def __contains__(self, key: Any) -> bool: return contains(self, key, container=self._engine) - @doc(Index.fillna) - def fillna(self, value, downcast=None): - value = self._require_scalar(value) - try: - cat = self._data.fillna(value) - except (ValueError, TypeError): - # invalid fill_value - if not self.hasnans: - # nothing to fill, we can get away without casting - return self.copy() - return self.astype(object).fillna(value, downcast=downcast) - - return type(self)._simple_new(cat, name=self.name) - # TODO(2.0): remove reindex once non-unique deprecation is enforced def reindex( self, target, method=None, level=None, limit=None, tolerance=None diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 104bce0369d379..731efdc3b17f0d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -92,8 +92,6 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): freq: BaseOffset | None freqstr: str | None _resolution_obj: Resolution - _bool_ops: list[str] = [] - _field_ops: list[str] = [] # error: "Callable[[Any], Any]" has no attribute "fget" hasnans = cast( diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6d355f13750691..a378fd95b9c033 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -812,7 +812,7 @@ def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: # matches base class except for whitespace padding return header + list(self._format_native_types(na_rep=na_rep)) - def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): + def _format_native_types(self, *, na_rep="NaN", quoting=None, **kwargs): # GH 28210: use base method but with different default na_rep return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) 
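A minimal usage sketch (not taken from the patch) of the ``Index.copy`` deprecation added above: passing ``names`` now emits a ``FutureWarning``, and ``name`` is the supported spelling.

>>> import pandas as pd
>>> idx = pd.Index([1, 2, 3], name="a")
>>> idx.copy(name="b")       # supported going forward
>>> idx.copy(names=["b"])    # now warns: use ``name`` instead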
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 53d584f801b0fc..86d7e20a551a10 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -736,11 +736,9 @@ def dtypes(self) -> Series: """ from pandas import Series + names = com.fill_missing_names([level.name for level in self.levels]) return Series( - { - f"level_{idx}" if level.name is None else level.name: level.dtype - for idx, level in enumerate(self.levels) - } + {names[idx]: level.dtype for idx, level in enumerate(self.levels)} ) def __len__(self) -> int: @@ -1283,7 +1281,7 @@ def _formatter_func(self, tup): formatter_funcs = [level._formatter_func for level in self.levels] return tuple(func(val) for func, val in zip(formatter_funcs, tup)) - def _format_native_types(self, na_rep="nan", **kwargs): + def _format_native_types(self, *, na_rep="nan", **kwargs): new_levels = [] new_codes = [] @@ -3593,7 +3591,7 @@ def _union(self, other, sort) -> MultiIndex: rvals = other._values.astype(object, copy=False) result = lib.fast_unique_multiple([self._values, rvals], sort=sort) - return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) + return MultiIndex.from_arrays(zip(*result), sortorder=None, names=result_names) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return is_object_dtype(dtype) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 477d4bfc3290bd..bb25813e9742b5 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -343,7 +343,7 @@ def _is_all_dates(self) -> bool: return False def _format_native_types( - self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs + self, *, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs ): from pandas.io.formats.format import FloatArrayFormatter diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index e3e1589d91e095..8dcd379a4eb9b3 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -230,6 +230,10 @@ def __new__( if data is None and ordinal is None: # range-based. + if not fields: + # test_pickle_compat_construction + raise cls._scalar_data_error(None) + data, freq2 = PeriodArray._generate_range(None, None, None, freq, fields) # PeriodArray._generate range does validation that fields is # empty when really using the range-based constructor. diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index fdb1ee754a7e61..887c8da6305dd1 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1041,7 +1041,11 @@ def _arith_method(self, other, op): rstop = op(left.stop, right) res_name = ops.get_op_result_name(self, other) - result = type(self)(rstart, rstop, rstep, name=res_name) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + # The constructor validation can lead to a DeprecationWarning + # from numpy, e.g. 
with RangeIndex + np.datetime64("now") + result = type(self)(rstart, rstop, rstep, name=res_name) # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index fc2204724aceb0..f043a8cee308cc 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2058,6 +2058,8 @@ def ravel(i): # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): + if is_empty_indexer(indexer[0], ser._values): + return ser._values.copy() ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values # single indexer diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index fe0b36a8ef4d1f..537ae8f2a43209 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -15,6 +15,7 @@ from pandas.core.dtypes.common import ( is_datetime64tz_dtype, + is_period_dtype, pandas_dtype, ) @@ -62,8 +63,9 @@ def make_block( placement = BlockPlacement(placement) ndim = maybe_infer_ndim(values, placement, ndim) - if is_datetime64tz_dtype(values.dtype): + if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype): # GH#41168 ensure we can pass 1D dt64tz values + # More generally, any EA dtype that isn't is_1d_only_ea_dtype values = extract_array(values, extract_numpy=True) values = ensure_block_shape(values, ndim) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index fcd5cd0979252c..09f16a2ddab671 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -413,11 +413,17 @@ def _convert(arr): return self.apply(_convert) - def replace(self: T, value, **kwargs) -> T: + def replace_regex(self: T, **kwargs) -> T: + return self.apply_with_block("_replace_regex", **kwargs) + + def replace(self: T, to_replace, value, inplace: bool) -> T: + inplace = validate_bool_kwarg(inplace, "inplace") assert np.ndim(value) == 0, value # TODO "replace" is right now implemented on the blocks, we should move # it to general array algos so it can be reused here - return self.apply_with_block("replace", value=value, **kwargs) + return self.apply_with_block( + "replace", value=value, to_replace=to_replace, inplace=inplace + ) def replace_list( self: T, @@ -430,7 +436,7 @@ def replace_list( inplace = validate_bool_kwarg(inplace, "inplace") return self.apply_with_block( - "_replace_list", + "replace_list", src_list=src_list, dest_list=dest_list, inplace=inplace, @@ -527,8 +533,8 @@ def copy_func(ax): if deep: new_arrays = [arr.copy() for arr in self.arrays] else: - new_arrays = self.arrays - return type(self)(new_arrays, new_axes) + new_arrays = list(self.arrays) + return type(self)(new_arrays, new_axes, verify_integrity=False) def reindex_indexer( self: T, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 587b9593e58fc4..abbebcefc7a87e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -58,6 +58,7 @@ CategoricalDtype, ExtensionDtype, PandasDtype, + PeriodDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -639,32 +640,26 @@ def replace( to_replace, value, inplace: bool = False, - regex: bool = False, + # mask may be pre-computed if we're called from replace_list + mask: npt.NDArray[np.bool_] | None = None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new - blocks here this is just a 
call to putmask. regex is not used here. - It is used in ObjectBlocks. It is here for API compatibility. + blocks here this is just a call to putmask. """ - inplace = validate_bool_kwarg(inplace, "inplace") # Note: the checks we do in NDFrame.replace ensure we never get # here with listlike to_replace or value, as those cases - # go through _replace_list + # go through replace_list values = self.values if isinstance(values, Categorical): # TODO: avoid special-casing blk = self if inplace else self.copy() - blk.values.replace(to_replace, value, inplace=True) + blk.values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] - regex = should_use_regex(regex, to_replace) - - if regex: - return self._replace_regex(to_replace, value, inplace=inplace) - if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. @@ -672,7 +667,8 @@ def replace( # replace_list instead of replace. return [self] if inplace else [self.copy()] - mask = missing.mask_missing(values, to_replace) + if mask is None: + mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. @@ -690,13 +686,13 @@ def replace( to_replace=to_replace, value=value, inplace=True, - regex=regex, + mask=mask, ) else: # split so that we only upcast where necessary return self.split_and_operate( - type(self).replace, to_replace, value, inplace=True, regex=regex + type(self).replace, to_replace, value, inplace=True ) @final @@ -742,7 +738,7 @@ def _replace_regex( return [block] @final - def _replace_list( + def replace_list( self, src_list: Iterable[Any], dest_list: Sequence[Any], @@ -750,16 +746,10 @@ def _replace_list( regex: bool = False, ) -> list[Block]: """ - See BlockManager._replace_list docstring. + See BlockManager.replace_list docstring. 
""" values = self.values - # TODO: dont special-case Categorical - if isinstance(values, Categorical) and len(algos.unique(dest_list)) == 1: - # We likely got here by tiling value inside NDFrame.replace, - # so un-tile here - return self.replace(src_list, dest_list[0], inplace, regex) - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -848,25 +838,18 @@ def _replace_coerce( ------- List[Block] """ - if mask.any(): - if not regex: - nb = self.coerce_to_target_dtype(value) - if nb is self and not inplace: - nb = nb.copy() - putmask_inplace(nb.values, mask, value) - return [nb] - else: - regex = should_use_regex(regex, to_replace) - if regex: - return self._replace_regex( - to_replace, - value, - inplace=inplace, - convert=False, - mask=mask, - ) - return self.replace(to_replace, value, inplace=inplace, regex=False) - return [self] + if should_use_regex(regex, to_replace): + return self._replace_regex( + to_replace, + value, + inplace=inplace, + convert=False, + mask=mask, + ) + else: + return self.replace( + to_replace=to_replace, value=value, inplace=inplace, mask=mask + ) # --------------------------------------------------------------------- @@ -1252,7 +1235,7 @@ def _unstack( unstacker, fill_value, new_placement: npt.NDArray[np.intp], - allow_fill: bool, + needs_masking: npt.NDArray[np.bool_], ): """ Return a list of unstacked blocks of self @@ -1264,6 +1247,7 @@ def _unstack( Only used in ExtensionBlock._unstack new_placement : np.ndarray[np.intp] allow_fill : bool + needs_masking : np.ndarray[bool] Returns ------- @@ -1456,7 +1440,8 @@ def iget(self, col): def set_inplace(self, locs, values) -> None: # NB: This is a misnomer, is supposed to be inplace but is not, # see GH#33457 - assert locs.tolist() == [0] + # When an ndarray, we should have locs.tolist() == [0] + # When a BlockPlacement we should have list(locs) == [0] self.values = values try: # TODO(GH33457) this can be removed @@ -1673,7 +1658,7 @@ def _unstack( unstacker, fill_value, new_placement: npt.NDArray[np.intp], - allow_fill: bool, + needs_masking: npt.NDArray[np.bool_], ): # ExtensionArray-safe unstack. # We override ObjectBlock._unstack, which unstacks directly on the @@ -1692,14 +1677,20 @@ def _unstack( new_values = new_values.T[mask] new_placement = new_placement[mask] + # needs_masking[i] calculated once in BlockManager.unstack tells + # us if there are any -1s in the relevant indices. When False, + # that allows us to go through a faster path in 'take', among + # other things avoiding e.g. Categorical._validate_scalar. blocks = [ # TODO: could cast to object depending on fill_value? type(self)( - self.values.take(indices, allow_fill=allow_fill, fill_value=fill_value), + self.values.take( + indices, allow_fill=needs_masking[i], fill_value=fill_value + ), BlockPlacement(place), ndim=2, ) - for indices, place in zip(new_values, new_placement) + for i, (indices, place) in enumerate(zip(new_values, new_placement)) ] return blocks, mask @@ -1720,6 +1711,12 @@ class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock values: NDArrayBackedExtensionArray + # error: Signature of "is_extension" incompatible with supertype "Block" + @cache_readonly + def is_extension(self) -> bool: # type: ignore[override] + # i.e. 
datetime64tz, PeriodDtype + return not isinstance(self.dtype, np.dtype) + @property def is_view(self) -> bool: """return a boolean if I am possibly a view""" @@ -1748,6 +1745,9 @@ def where(self, other, cond) -> list[Block]: try: res_values = arr.T._where(cond, other).T except (ValueError, TypeError): + if isinstance(self.dtype, PeriodDtype): + # TODO: don't special-case + raise blk = self.coerce_to_target_dtype(other) nbs = blk.where(other, cond) return self._maybe_downcast(nbs, "infer") @@ -1941,6 +1941,8 @@ def get_block_type(dtype: DtypeObj): cls = CategoricalBlock elif vtype is Timestamp: cls = DatetimeTZBlock + elif isinstance(dtype, PeriodDtype): + cls = NDArrayBackedExtensionBlock elif isinstance(dtype, ExtensionDtype): # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index f14f3c4a38430e..782842d167570a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -77,10 +77,16 @@ def _concatenate_array_managers( # reindex all arrays mgrs = [] for mgr, indexers in mgrs_indexers: + axis1_made_copy = False for ax, indexer in indexers.items(): mgr = mgr.reindex_indexer( axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True ) + if ax == 1 and indexer is not None: + axis1_made_copy = True + if copy and concat_axis == 0 and not axis1_made_copy: + # for concat_axis 1 we will always get a copy through concat_arrays + mgr = mgr.copy() mgrs.append(mgr) if concat_axis == 1: @@ -94,8 +100,6 @@ def _concatenate_array_managers( # concatting along the columns -> combine reindexed arrays in a single manager assert concat_axis == 0 arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) - if copy: - arrays = [x.copy() for x in arrays] new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) return new_mgr diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c55305e2d69b9b..532309dfc40b30 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -176,7 +176,7 @@ def rec_array_to_mgr( # essentially process a record array then fill it fdata = ma.getdata(data) if index is None: - index = _get_names_from_index(fdata) + index = default_index(len(fdata)) else: index = ensure_index(index) @@ -318,7 +318,7 @@ def ndarray_to_mgr( return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): - # i.e. Datetime64TZ + # i.e. 
Datetime64TZ, PeriodDtype values = extract_array(values, extract_numpy=True) if copy: values = values.copy() diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d69709bf9d06c3..5ebc0292f24b45 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -432,12 +432,18 @@ def convert( timedelta=timedelta, ) - def replace(self: T, to_replace, value, inplace: bool, regex: bool) -> T: - assert np.ndim(value) == 0, value + def replace(self: T, to_replace, value, inplace: bool) -> T: + inplace = validate_bool_kwarg(inplace, "inplace") + # NDFrame.replace ensures the not-is_list_likes here + assert not is_list_like(to_replace) + assert not is_list_like(value) return self.apply( - "replace", to_replace=to_replace, value=value, inplace=inplace, regex=regex + "replace", to_replace=to_replace, value=value, inplace=inplace ) + def replace_regex(self, **kwargs): + return self.apply("_replace_regex", **kwargs) + def replace_list( self: T, src_list: list[Any], @@ -449,7 +455,7 @@ def replace_list( inplace = validate_bool_kwarg(inplace, "inplace") bm = self.apply( - "_replace_list", + "replace_list", src_list=src_list, dest_list=dest_list, inplace=inplace, @@ -1066,22 +1072,12 @@ def iset( # Note: we exclude DTA/TDA here value_is_extension_type = is_1d_only_ea_dtype(value.dtype) - - # categorical/sparse/datetimetz - if value_is_extension_type: - - def value_getitem(placement): - return value - - else: + if not value_is_extension_type: if value.ndim == 2: value = value.T else: value = ensure_block_shape(value, ndim=2) - def value_getitem(placement): - return value[placement.indexer] - if value.shape[1:] != self.shape[1:]: raise AssertionError( "Shape of new values must be compatible with manager shape" @@ -1092,11 +1088,37 @@ def value_getitem(placement): # In this case, get_blkno_placements will yield only one tuple, # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) + # Check if we can use _iset_single fastpath + blkno = self.blknos[loc] + blk = self.blocks[blkno] + if len(blk._mgr_locs) == 1: # TODO: fastest way to check this? + return self._iset_single( + # error: Argument 1 to "_iset_single" of "BlockManager" has + # incompatible type "Union[int, slice, ndarray[Any, Any]]"; + # expected "int" + loc, # type:ignore[arg-type] + value, + inplace=inplace, + blkno=blkno, + blk=blk, + ) + # error: Incompatible types in assignment (expression has type # "List[Union[int, slice, ndarray]]", variable has type "Union[int, # slice, ndarray]") loc = [loc] # type: ignore[assignment] + # categorical/sparse/datetimetz + if value_is_extension_type: + + def value_getitem(placement): + return value + + else: + + def value_getitem(placement): + return value[placement.indexer] + # Accessing public blknos ensures the public versions are initialized blknos = self.blknos[loc] blklocs = self.blklocs[loc].copy() @@ -1172,6 +1194,29 @@ def value_getitem(placement): # Newly created block's dtype may already be present. self._known_consolidated = False + def _iset_single( + self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block + ) -> None: + """ + Fastpath for iset when we are only setting a single position and + the Block currently in that position is itself single-column. + + In this case we can swap out the entire Block and blklocs and blknos + are unaffected. 
+ """ + # Caller is responsible for verifying value.shape + + if inplace and blk.should_store(value): + iloc = self.blklocs[loc] + blk.set_inplace(slice(iloc, iloc + 1), value) + return + + nb = new_block_2d(value, placement=blk._mgr_locs) + old_blocks = self.blocks + new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] + self.blocks = new_blocks + return + def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: """ Insert item at selected position. @@ -1197,8 +1242,13 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: bp = BlockPlacement(slice(loc, loc + 1)) block = new_block_2d(values=value, placement=bp) - self._insert_update_mgr_locs(loc) - self._insert_update_blklocs_and_blknos(loc) + if not len(self.blocks): + # Fastpath + self._blklocs = np.array([0], dtype=np.intp) + self._blknos = np.array([0], dtype=np.intp) + else: + self._insert_update_mgr_locs(loc) + self._insert_update_blklocs_and_blknos(loc) self.axes[0] = new_axis self.blocks += (block,) @@ -1425,7 +1475,14 @@ def unstack(self, unstacker, fill_value) -> BlockManager: new_columns = unstacker.get_new_columns(self.items) new_index = unstacker.new_index - allow_fill = not unstacker.mask.all() + allow_fill = not unstacker.mask_all + if allow_fill: + # calculating the full mask once and passing it to Block._unstack is + # faster than letting calculating it in each repeated call + new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape) + needs_masking = new_mask2D.any(axis=0) + else: + needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool) new_blocks: list[Block] = [] columns_mask: list[np.ndarray] = [] @@ -1445,7 +1502,7 @@ def unstack(self, unstacker, fill_value) -> BlockManager: unstacker, fill_value, new_placement=new_placement, - allow_fill=allow_fill, + needs_masking=needs_masking, ) new_blocks.extend(blocks) diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index d21c80b81b582c..57bacba0d4bee8 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -12,8 +12,8 @@ def kleene_or( - left: bool | np.ndarray, - right: bool | np.ndarray, + left: bool | np.ndarray | libmissing.NAType, + right: bool | np.ndarray | libmissing.NAType, left_mask: np.ndarray | None, right_mask: np.ndarray | None, ): @@ -37,12 +37,13 @@ def kleene_or( The result of the logical or, and the new mask. """ # To reduce the number of cases, we ensure that `left` & `left_mask` - # always come from an array, not a scalar. This is safe, since because + # always come from an array, not a scalar. This is safe, since # A | B == B | A if left_mask is None: return kleene_or(right, left, right_mask, left_mask) - assert isinstance(left, np.ndarray) + if not isinstance(left, np.ndarray): + raise TypeError("Either `left` or `right` need to be a np.ndarray.") raise_for_nan(right, method="or") @@ -73,8 +74,8 @@ def kleene_or( def kleene_xor( - left: bool | np.ndarray, - right: bool | np.ndarray, + left: bool | np.ndarray | libmissing.NAType, + right: bool | np.ndarray | libmissing.NAType, left_mask: np.ndarray | None, right_mask: np.ndarray | None, ): @@ -99,16 +100,20 @@ def kleene_xor( result, mask: ndarray[bool] The result of the logical xor, and the new mask. """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. 
This is safe, since + # A ^ B == B ^ A if left_mask is None: return kleene_xor(right, left, right_mask, left_mask) + if not isinstance(left, np.ndarray): + raise TypeError("Either `left` or `right` need to be a np.ndarray.") + raise_for_nan(right, method="xor") if right is libmissing.NA: result = np.zeros_like(left) else: - # error: Incompatible types in assignment (expression has type - # "Union[bool, Any]", variable has type "ndarray") - result = left ^ right # type: ignore[assignment] + result = left ^ right if right_mask is None: if right is libmissing.NA: @@ -146,12 +151,13 @@ def kleene_and( The result of the logical xor, and the new mask. """ # To reduce the number of cases, we ensure that `left` & `left_mask` - # always come from an array, not a scalar. This is safe, since because - # A | B == B | A + # always come from an array, not a scalar. This is safe, since + # A & B == B & A if left_mask is None: return kleene_and(right, left, right_mask, left_mask) - assert isinstance(left, np.ndarray) + if not isinstance(left, np.ndarray): + raise TypeError("Either `left` or `right` need to be a np.ndarray.") raise_for_nan(right, method="and") if right is libmissing.NA: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f132dd88d51475..e00defcfcffd10 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2012,30 +2012,30 @@ def _adjust_dates_anchored( if closed == "right": if foffset > 0: # roll back - fresult = first.value - foffset + fresult_int = first.value - foffset else: - fresult = first.value - freq.nanos + fresult_int = first.value - freq.nanos if loffset > 0: # roll forward - lresult = last.value + (freq.nanos - loffset) + lresult_int = last.value + (freq.nanos - loffset) else: # already the end of the road - lresult = last.value + lresult_int = last.value else: # closed == 'left' if foffset > 0: - fresult = first.value - foffset + fresult_int = first.value - foffset else: # start of the road - fresult = first.value + fresult_int = first.value if loffset > 0: # roll forward - lresult = last.value + (freq.nanos - loffset) + lresult_int = last.value + (freq.nanos - loffset) else: - lresult = last.value + freq.nanos - fresult = Timestamp(fresult) - lresult = Timestamp(lresult) + lresult_int = last.value + freq.nanos + fresult = Timestamp(fresult_int) + lresult = Timestamp(lresult_int) if first_tzinfo is not None: fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo) if last_tzinfo is not None: diff --git a/pandas/core/reshape/api.py b/pandas/core/reshape/api.py index 58d741c2c69889..bffdadb96c9723 100644 --- a/pandas/core/reshape/api.py +++ b/pandas/core/reshape/api.py @@ -1,4 +1,4 @@ -# flake8: noqa +# flake8: noqa:F401 from pandas.core.reshape.concat import concat from pandas.core.reshape.melt import ( diff --git a/pandas/core/series.py b/pandas/core/series.py index ab6550a48bc311..77bc816fd52a11 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3897,7 +3897,8 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: Whether to copy underlying data.""" ), examples=dedent( - """Examples + """\ + Examples -------- >>> s = pd.Series( ... 
["A", "B", "A", "C"], diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index b5f3af5af8e38b..d5abd1606edecb 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1212,8 +1212,8 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True): """ if regex and re.compile(pat).groups: warnings.warn( - "This pattern has match groups. To actually get the " - "groups, use str.extract.", + "This pattern is interpreted as a regular expression, and has " + "match groups. To actually get the groups, use str.extract.", UserWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 2ce5c0cbea2720..6b0380a292f07e 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -193,7 +193,7 @@ def rep(x, r): return result def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None ): if not case: flags |= re.IGNORECASE @@ -208,7 +208,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar = None, + na: Scalar | None = None, ): if not case: flags |= re.IGNORECASE diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index fc3390ee6db036..defae3392bfcec 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -747,8 +747,11 @@ def _apply_pairwise( target = self._create_data(target) result = super()._apply_pairwise(target, other, pairwise, func) # 1) Determine the levels + codes of the groupby levels - if other is not None: - # When we have other, we must reindex (expand) the result + if other is not None and not all( + len(group) == len(other) for group in self._grouper.indices.values() + ): + # GH 42915 + # len(other) != len(any group), so must reindex (expand) the result # from flex_binary_moment to a "transform"-like result # per groupby combination old_result_len = len(result) @@ -770,10 +773,9 @@ def _apply_pairwise( codes, levels = factorize(labels) groupby_codes.append(codes) groupby_levels.append(levels) - else: - # When we evaluate the pairwise=True result, repeat the groupby - # labels by the number of columns in the original object + # pairwise=True or len(other) == len(each group), so repeat + # the groupby labels by the number of columns in the original object groupby_codes = self._grouper.codes # error: Incompatible types in assignment (expression has type # "List[Index]", variable has type "List[Union[ndarray, Index]]") diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 8fae2d1d1179d4..cbe94673a81224 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -1,12 +1,10 @@ -# flake8: noqa - """ Expose public exceptions & warnings """ -from pandas._config.config import OptionError +from pandas._config.config import OptionError # noqa:F401 -from pandas._libs.tslibs import ( +from pandas._libs.tslibs import ( # noqa:F401 OutOfBoundsDatetime, OutOfBoundsTimedelta, ) diff --git a/pandas/io/common.py b/pandas/io/common.py index 844304396a23f5..e12a7348b0075c 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -6,7 +6,6 @@ from collections import abc import dataclasses import gzip -import io from io import ( BufferedIOBase, BytesIO, @@ -18,7 +17,6 @@ import mmap import os from pathlib import Path -import tempfile from typing import ( IO, Any, @@ -104,7 +102,7 @@ def close(self) -> None: avoid 
closing the potentially user-created buffer. """ if self.is_wrapped: - assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper)) + assert isinstance(self.handle, TextIOWrapper) self.handle.flush() self.handle.detach() self.created_handles.remove(self.handle) @@ -779,20 +777,17 @@ def get_handle( # Convert BytesIO or file objects passed with an encoding is_wrapped = False if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase): - handle = BytesIOWrapper( + # not added to handles as it does not open/buffer resources + handle = _BytesIOWrapper( handle, encoding=ioargs.encoding, ) - handles.append(handle) - # the (text) handle is always provided by the caller - # since get_handle would have opened it in binary mode - is_wrapped = True elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)): handle = TextIOWrapper( # error: Argument 1 to "TextIOWrapper" has incompatible type # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]"; # expected "IO[bytes]" - handle, # type: ignore[arg-type] + _IOWrapper(handle), # type: ignore[arg-type] encoding=ioargs.encoding, errors=errors, newline="", @@ -935,7 +930,7 @@ def __init__( self.decode = decode self.attributes = {} - for attribute in ("seekable", "readable", "writeable"): + for attribute in ("seekable", "readable"): if not hasattr(f, attribute): continue self.attributes[attribute] = getattr(f, attribute)() @@ -976,11 +971,40 @@ def __next__(self) -> str: return newline.lstrip("\n") -# Wrapper that wraps a StringIO buffer and reads bytes from it -# Created for compat with pyarrow read_csv -class BytesIOWrapper(io.BytesIO): - buffer: StringIO | TextIOBase | None +class _IOWrapper: + # TextIOWrapper is overly strict: it request that the buffer has seekable, readable, + # and writable. If we have a read-only buffer, we shouldn't need writable and vice + # versa. Some buffers, are seek/read/writ-able but they do not have the "-able" + # methods, e.g., tempfile.SpooledTemporaryFile. + # If a buffer does not have the above "-able" methods, we simple assume they are + # seek/read/writ-able. 
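# For example (illustrative, not from the patch itself), tempfile.SpooledTemporaryFile
# supports seeking and reading but, on the Python versions supported here, does not
# define the corresponding "-able" methods; wrapping it in _IOWrapper lets
# TextIOWrapper accept it again, and the GH43439 special case for it in
# _is_binary_mode is dropped later in this patch.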
+ def __init__(self, buffer: BaseBuffer): + self.buffer = buffer + + def __getattr__(self, name: str): + return getattr(self.buffer, name) + + def readable(self) -> bool: + if hasattr(self.buffer, "readable"): + # error: "BaseBuffer" has no attribute "readable" + return self.buffer.readable() # type: ignore[attr-defined] + return True + + def seekable(self) -> bool: + if hasattr(self.buffer, "seekable"): + return self.buffer.seekable() + return True + + def writable(self) -> bool: + if hasattr(self.buffer, "writable"): + # error: "BaseBuffer" has no attribute "writable" + return self.buffer.writable() # type: ignore[attr-defined] + return True + +class _BytesIOWrapper: + # Wrapper that wraps a StringIO buffer and reads bytes from it + # Created for compat with pyarrow read_csv def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"): self.buffer = buffer self.encoding = encoding @@ -1006,15 +1030,6 @@ def read(self, n: int | None = -1) -> bytes: self.overflow = combined_bytestring[n:] return to_return - def detach(self): - # Slightly modified from Python's TextIOWrapper detach method - if self.buffer is None: - raise ValueError("buffer is already detached") - self.flush() - buffer = self.buffer - self.buffer = None - return buffer - def _maybe_memory_map( handle: str | BaseBuffer, @@ -1040,26 +1055,20 @@ def _maybe_memory_map( handle = open(handle, mode) handles.append(handle) + # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], + # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" try: - # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], - # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" wrapped = cast( BaseBuffer, _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] ) - # error: "BaseBuffer" has no attribute "close" - handle.close() # type: ignore[attr-defined] - handles.remove(handle) - handles.append(wrapped) - handle = wrapped - except Exception: - # we catch any errors that may have occurred - # because that is consistent with the lower-level - # functionality of the C engine (pd.read_csv), so - # leave the file handler as is then - memory_map = False + finally: + for handle in reversed(handles): + # error: "BaseBuffer" has no attribute "close" + handle.close() # type: ignore[attr-defined] + handles.append(wrapped) - return handle, memory_map, handles + return wrapped, memory_map, handles def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool: @@ -1088,8 +1097,6 @@ def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool: codecs.StreamWriter, codecs.StreamReader, codecs.StreamReaderWriter, - # cannot be wrapped in TextIOWrapper GH43439 - tempfile.SpooledTemporaryFile, ) if issubclass(type(handle), text_classes): return False diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 18228a93b52851..d2cc77af8eee55 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -26,6 +26,7 @@ StorageOptions, WriteBuffer, ) +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -175,7 +176,7 @@ def _number_format(self) -> dict[str, Any]: "decimal": self.decimal, } - @property + @cache_readonly def data_index(self) -> Index: data_index = self.obj.index if ( @@ -185,6 +186,8 @@ def data_index(self) -> Index: data_index = Index( [x.strftime(self.date_format) if notna(x) else "" for x in data_index] ) + elif isinstance(data_index, ABCMultiIndex): + data_index = 
data_index.remove_unused_levels() return data_index @property diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 62f542de3437fc..21d89f18d49591 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -68,8 +68,6 @@ loads = json.loads dumps = json.dumps -TABLE_SCHEMA_VERSION = "0.20.0" - # interface to/from def to_json( @@ -565,7 +563,7 @@ def read_json( {{"name":"col 1","type":"string"}},\ {{"name":"col 2","type":"string"}}],\ "primaryKey":["index"],\ -"pandas_version":"0.20.0"}},\ +"pandas_version":"1.4.0"}},\ "data":[\ {{"index":"row 1","col 1":"a","col 2":"b"}},\ {{"index":"row 2","col 1":"c","col 2":"d"}}]\ diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 75fd950cd6076d..cb2d426f6b81bd 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -18,11 +18,13 @@ JSONSerializable, ) +from pandas.core.dtypes.base import _registry as registry from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype, @@ -40,6 +42,8 @@ loads = json.loads +TABLE_SCHEMA_VERSION = "1.4.0" + def as_json_table_type(x: DtypeObj) -> str: """ @@ -83,6 +87,8 @@ def as_json_table_type(x: DtypeObj) -> str: return "duration" elif is_categorical_dtype(x): return "any" + elif is_extension_array_dtype(x): + return "any" elif is_string_dtype(x): return "string" else: @@ -103,11 +109,7 @@ def set_default_names(data): data = data.copy() if data.index.nlevels > 1: - names = [ - name if name is not None else f"level_{i}" - for i, name in enumerate(data.index.names) - ] - data.index.names = names + data.index.names = com.fill_missing_names(data.index.names) else: data.index.name = data.index.name or "index" return data @@ -134,6 +136,8 @@ def convert_pandas_type_to_json_field(arr): field["freq"] = dtype.freq.freqstr elif is_datetime64tz_dtype(dtype): field["tz"] = dtype.tz.zone + elif is_extension_array_dtype(dtype): + field["extDtype"] = dtype.name return field @@ -199,6 +203,8 @@ def convert_json_field_to_pandas_type(field): return CategoricalDtype( categories=field["constraints"]["enum"], ordered=field["ordered"] ) + elif "extDtype" in field: + return registry.find(field["extDtype"]) else: return "object" @@ -257,7 +263,7 @@ def build_table_schema( {'name': 'B', 'type': 'string'}, \ {'name': 'C', 'type': 'datetime'}], \ 'primaryKey': ['idx'], \ -'pandas_version': '0.20.0'} +'pandas_version': '1.4.0'} """ if index is True: data = set_default_names(data) @@ -291,7 +297,7 @@ def build_table_schema( schema["primaryKey"] = primary_key if version: - schema["pandas_version"] = "0.20.0" + schema["pandas_version"] = TABLE_SCHEMA_VERSION return schema diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 98d1315c6212c4..96f7f9b1738b8d 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -130,7 +130,11 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: frame.index.names = [None] * len(frame.index.names) if self.kwds.get("dtype") is not None: - frame = frame.astype(self.kwds.get("dtype")) + try: + frame = frame.astype(self.kwds.get("dtype")) + except TypeError as e: + # GH#44901 reraise to keep api consistent + raise ValueError(e) return frame def read(self) -> DataFrame: diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5d03529654b0d5..b7693832818804 
100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -42,7 +42,6 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( ensure_object, - ensure_str, is_bool_dtype, is_categorical_dtype, is_dict_like, @@ -391,22 +390,16 @@ def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) columns = list(zip(*(extract(r) for r in header))) - names = ic + columns - - # If we find unnamed columns all in a single - # level, then our header was too long. - for n in range(len(columns[0])): - if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): - header = ",".join([str(x) for x in self.header]) - raise ParserError( - f"Passed header=[{header}] are too many rows " - "for this multi_index of columns" - ) + names = columns.copy() + for single_ic in sorted(ic): + names.insert(single_ic, single_ic) # Clean the column names (if we have an index_col). if len(ic): col_names = [ - r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols) else None + r[ic[0]] + if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols) + else None for r in header ] else: diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 82f8ee553df8eb..47bc7ff95669b8 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -19,6 +19,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import ( ArrayLike, + CompressionOptions, DtypeArg, FilePath, ReadCsvBuffer, @@ -618,7 +619,7 @@ def read_csv( iterator=False, chunksize=None, # Quoting, Compression, and File Format - compression="infer", + compression: CompressionOptions = "infer", thousands=None, decimal: str = ".", lineterminator=None, @@ -716,7 +717,7 @@ def read_table( iterator=False, chunksize=None, # Quoting, Compression, and File Format - compression="infer", + compression: CompressionOptions = "infer", thousands=None, decimal: str = ".", lineterminator=None, @@ -1459,6 +1460,13 @@ def _refine_defaults_read( "delim_whitespace=True; you can only specify one." ) + if delimiter == "\n": + raise ValueError( + r"Specified \n as separator or delimiter. This forces the python engine " + "which does not accept a line terminator. 
Hence it is not allowed to use " + "the line terminator as separator.", + ) + if delimiter is lib.no_default: # assign default separator value kwds["delimiter"] = delim_default diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 18b2ff3837a15b..997a6bfc67dbcd 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3377,9 +3377,7 @@ def validate_multiindex( validate that we can store the multi-index; reset and return the new object """ - levels = [ - l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names) - ] + levels = com.fill_missing_names(obj.index.names) try: reset_obj = obj.reset_index() except ValueError as err: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 26869a660f4b40..548bd617a285fd 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -44,6 +44,7 @@ Series, ) from pandas.core.base import PandasObject +import pandas.core.common as com from pandas.core.tools.datetimes import to_datetime from pandas.util.version import Version @@ -1010,10 +1011,7 @@ def _index_name(self, index, index_label): ): return ["index"] else: - return [ - l if l is not None else f"level_{i}" - for i, l in enumerate(self.frame.index.names) - ] + return com.fill_missing_names(self.frame.index.names) # for reading: index=(list of) string to specify column to set as index elif isinstance(index, str): diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 3c3b4afa2c57d6..a54546a37f2845 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -105,8 +105,8 @@ def __init__( names, encoding, stylesheet, - compression, - storage_options, + compression: CompressionOptions, + storage_options: StorageOptions, ) -> None: self.path_or_buffer = path_or_buffer self.xpath = xpath @@ -570,8 +570,8 @@ def _transform_doc(self) -> bytes: def get_data_from_filepath( filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str], encoding, - compression, - storage_options, + compression: CompressionOptions, + storage_options: StorageOptions, ) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: """ Extract raw XML data. @@ -666,8 +666,8 @@ def _parse( encoding, parser, stylesheet, - compression, - storage_options, + compression: CompressionOptions, + storage_options: StorageOptions, **kwargs, ) -> DataFrame: """ diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ec20bc49c8a4b9..8beacf6828a6b4 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -188,12 +188,9 @@ class TestPDApi(Base): # private modules in pandas namespace private_modules = [ "_config", - "_hashtable", - "_lib", "_libs", "_is_numpy_dev", "_testing", - "_tslib", "_typing", "_version", ] diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 55cbfaf76d5a78..01b447aa855a3c 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -107,15 +107,15 @@ def numeric_idx(request): @pytest.fixture( params=[ - pd.Timedelta("5m4s").to_pytimedelta(), - pd.Timedelta("5m4s"), - pd.Timedelta("5m4s").to_timedelta64(), + pd.Timedelta("10m7s").to_pytimedelta(), + pd.Timedelta("10m7s"), + pd.Timedelta("10m7s").to_timedelta64(), ], ids=lambda x: type(x).__name__, ) def scalar_td(request): """ - Several variants of Timedelta scalars representing 5 minutes and 4 seconds + Several variants of Timedelta scalars representing 10 minutes and 7 seconds. 
""" return request.param diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 49585f3d379241..8194f47541e4c7 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -19,7 +19,6 @@ from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.offsets import shift_months -from pandas.compat import np_datetime64_compat from pandas.errors import PerformanceWarning import pandas as pd @@ -229,10 +228,6 @@ def test_nat_comparisons( @pytest.mark.parametrize("dtype", [None, object]) def test_nat_comparisons_scalar(self, dtype, data, box_with_array): box = box_with_array - if box_with_array is tm.to_array and dtype is object: - # dont bother testing ndarray comparison methods as this fails - # on older numpys (since they check object identity) - return left = Series(data, dtype=dtype) left = tm.box_expected(left, box) @@ -434,10 +429,6 @@ def test_dti_cmp_datetimelike(self, other, tz_naive_fixture): @pytest.mark.parametrize("dtype", [None, object]) def test_dti_cmp_nat(self, dtype, box_with_array): - if box_with_array is tm.to_array and dtype is object: - # dont bother testing ndarray comparison methods as this fails - # on older numpys (since they check object identity) - return left = DatetimeIndex([Timestamp("2011-01-01"), NaT, Timestamp("2011-01-03")]) right = DatetimeIndex([NaT, NaT, Timestamp("2011-01-03")]) @@ -487,12 +478,12 @@ def test_dti_cmp_nat_behaves_like_float_cmp_nan(self): ) darr = np.array( [ - np_datetime64_compat("2014-02-01 00:00Z"), - np_datetime64_compat("2014-03-01 00:00Z"), - np_datetime64_compat("nat"), + np.datetime64("2014-02-01 00:00"), + np.datetime64("2014-03-01 00:00"), + np.datetime64("nat"), np.datetime64("nat"), - np_datetime64_compat("2014-06-01 00:00Z"), - np_datetime64_compat("2014-07-01 00:00Z"), + np.datetime64("2014-06-01 00:00"), + np.datetime64("2014-07-01 00:00"), ] ) @@ -823,6 +814,9 @@ def test_dt64arr_add_timedeltalike_scalar( result = rng + two_hours tm.assert_equal(result, expected) + result = two_hours + rng + tm.assert_equal(result, expected) + rng += two_hours tm.assert_equal(rng, expected) @@ -843,34 +837,6 @@ def test_dt64arr_sub_timedeltalike_scalar( rng -= two_hours tm.assert_equal(rng, expected) - # TODO: redundant with test_dt64arr_add_timedeltalike_scalar - def test_dt64arr_add_td64_scalar(self, box_with_array): - # scalar timedeltas/np.timedelta64 objects - # operate with np.timedelta64 correctly - ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) - - expected = Series( - [Timestamp("20130101 9:01:01"), Timestamp("20130101 9:02:01")] - ) - - dtarr = tm.box_expected(ser, box_with_array) - expected = tm.box_expected(expected, box_with_array) - - result = dtarr + np.timedelta64(1, "s") - tm.assert_equal(result, expected) - result = np.timedelta64(1, "s") + dtarr - tm.assert_equal(result, expected) - - expected = Series( - [Timestamp("20130101 9:01:00.005"), Timestamp("20130101 9:02:00.005")] - ) - expected = tm.box_expected(expected, box_with_array) - - result = dtarr + np.timedelta64(5, "ms") - tm.assert_equal(result, expected) - result = np.timedelta64(5, "ms") + dtarr - tm.assert_equal(result, expected) - def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture): # GH#23320 special handling for timedelta64("NaT") tz = tz_naive_fixture @@ -927,6 +893,9 @@ def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, box_with_array): Timestamp("2013-01-01"), 
Timestamp("2013-01-01").to_pydatetime(), Timestamp("2013-01-01").to_datetime64(), + # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano + # for DataFrame operation + np.datetime64("2013-01-01", "D"), ], ) def test_dt64arr_sub_dtscalar(self, box_with_array, ts): @@ -940,25 +909,11 @@ def test_dt64arr_sub_dtscalar(self, box_with_array, ts): result = idx - ts tm.assert_equal(result, expected) - def test_dt64arr_sub_datetime64_not_ns(self, box_with_array): - # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano - # for DataFrame operation - dt64 = np.datetime64("2013-01-01") - assert dt64.dtype == "datetime64[D]" - - dti = date_range("20130101", periods=3)._with_freq(None) - dtarr = tm.box_expected(dti, box_with_array) - - expected = TimedeltaIndex(["0 Days", "1 Day", "2 Days"]) - expected = tm.box_expected(expected, box_with_array) - - result = dtarr - dt64 - tm.assert_equal(result, expected) - - result = dt64 - dtarr + result = ts - idx + tm.assert_equal(result, -expected) tm.assert_equal(result, -expected) - def test_dt64arr_sub_timestamp(self, box_with_array): + def test_dt64arr_sub_timestamp_tzaware(self, box_with_array): ser = date_range("2014-03-17", periods=2, freq="D", tz="US/Eastern") ser = ser._with_freq(None) ts = ser[0] @@ -1033,25 +988,73 @@ def test_dt64arr_aware_sub_dt64ndarray_raises( # ------------------------------------------------------------- # Addition of datetime-like others (invalid) - def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, box_with_array): - + def test_dt64arr_add_dtlike_raises(self, tz_naive_fixture, box_with_array): + # GH#22163 ensure DataFrame doesn't cast Timestamp to i8 + # GH#9631 tz = tz_naive_fixture - dti = date_range("2016-01-01", periods=3, tz=tz) - dt64vals = dti.values + dti = date_range("2016-01-01", periods=3, tz=tz) + if tz is None: + dti2 = dti.tz_localize("US/Eastern") + else: + dti2 = dti.tz_localize(None) dtarr = tm.box_expected(dti, box_with_array) - assert_cannot_add(dtarr, dt64vals) - def test_dt64arr_add_timestamp_raises(self, box_with_array): - # GH#22163 ensure DataFrame doesn't cast Timestamp to i8 - idx = DatetimeIndex(["2011-01-01", "2011-01-02"]) - ts = idx[0] - idx = tm.box_expected(idx, box_with_array) - assert_cannot_add(idx, ts) + assert_cannot_add(dtarr, dti.values) + assert_cannot_add(dtarr, dti) + assert_cannot_add(dtarr, dtarr) + assert_cannot_add(dtarr, dti[0]) + assert_cannot_add(dtarr, dti[0].to_pydatetime()) + assert_cannot_add(dtarr, dti[0].to_datetime64()) + assert_cannot_add(dtarr, dti2[0]) + assert_cannot_add(dtarr, dti2[0].to_pydatetime()) + assert_cannot_add(dtarr, np.datetime64("2011-01-01", "D")) # ------------------------------------------------------------- # Other Invalid Addition/Subtraction + # Note: freq here includes both Tick and non-Tick offsets; this is + # relevant because historically integer-addition was allowed if we had + # a freq. 
+ @pytest.mark.parametrize("freq", ["H", "D", "W", "M", "MS", "Q", "B", None]) + @pytest.mark.parametrize("dtype", [None, "uint8"]) + def test_dt64arr_addsub_intlike( + self, dtype, box_with_array, freq, tz_naive_fixture + ): + # GH#19959, GH#19123, GH#19012 + tz = tz_naive_fixture + if box_with_array is pd.DataFrame: + # alignment headaches + return + + if freq is None: + dti = DatetimeIndex(["NaT", "2017-04-05 06:07:08"], tz=tz) + else: + dti = date_range("2016-01-01", periods=2, freq=freq, tz=tz) + + obj = box_with_array(dti) + other = np.array([4, -1], dtype=dtype) + + msg = "|".join( + [ + "Addition/subtraction of integers", + "cannot subtract DatetimeArray from", + # IntegerArray + "can only perform ops with numeric values", + "unsupported operand type.*Categorical", + ] + ) + assert_invalid_addsub_type(obj, 1, msg) + assert_invalid_addsub_type(obj, np.int64(2), msg) + assert_invalid_addsub_type(obj, np.array(3, dtype=np.int64), msg) + assert_invalid_addsub_type(obj, other, msg) + assert_invalid_addsub_type(obj, np.array(other), msg) + assert_invalid_addsub_type(obj, pd.array(other), msg) + assert_invalid_addsub_type(obj, pd.Categorical(other), msg) + assert_invalid_addsub_type(obj, pd.Index(other), msg) + assert_invalid_addsub_type(obj, pd.core.indexes.api.NumericIndex(other), msg) + assert_invalid_addsub_type(obj, Series(other), msg) + @pytest.mark.parametrize( "other", [ @@ -1110,48 +1113,49 @@ def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixtu obj1 = tm.box_expected(obj1, box_with_array) obj2 = tm.box_expected(obj2, box_with_array) + msg = "|".join( + [ + "unsupported operand", + "cannot subtract DatetimeArray from ndarray", + ] + ) + with warnings.catch_warnings(record=True): # pandas.errors.PerformanceWarning: Non-vectorized DateOffset being # applied to Series or DatetimeIndex # we aren't testing that here, so ignore. 
warnings.simplefilter("ignore", PerformanceWarning) - # If `x + y` raises, then `y + x` should raise here as well + assert_invalid_addsub_type(obj1, obj2, msg=msg) - msg = ( - r"unsupported operand type\(s\) for -: " - "'(Timestamp|DatetimeArray)' and 'datetime.time'" - ) - with pytest.raises(TypeError, match=msg): - obj1 - obj2 + # ------------------------------------------------------------- + # Other invalid operations - msg = "|".join( - [ - "cannot subtract DatetimeArray from ndarray", - "ufunc (subtract|'subtract') cannot use operands with types " - r"dtype\('O'\) and dtype\('_replace + result = cat.replace(to_replace, value) + tm.assert_categorical_equal(result, expected) if to_replace == "b": # the "c" test is supposed to be unchanged with pytest.raises(AssertionError, match=expected_error_msg): # ensure non-inplace call does not affect original tm.assert_categorical_equal(cat, expected) - cat.replace(to_replace, value, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="Series.replace"): + # GH#44929 replace->_replace + cat.replace(to_replace, value, inplace=True) + tm.assert_categorical_equal(cat, expected) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 7749e138ccbea4..169b23c31f863c 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -61,6 +61,7 @@ def test_floating_array_disallows_float16(request): ): # the locale condition may need to be refined; this fails on # the CI in the ZH_CN build + # https://github.com/numpy/numpy/issues/20512 mark = pytest.mark.xfail(reason="numpy does not raise on np.dtype('Float16')") request.node.add_marker(mark) diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 560299a4a47f5d..6066d49b684892 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,5 +1,7 @@ import pytest +from pandas.compat import pa_version_under2p0 + from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd @@ -69,6 +71,9 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail( + pa_version_under2p0, reason="pyarrow incorrectly uses pandas internals API" +) def test_arrow_table_roundtrip(): from pandas.core.arrays._arrow_utils import ArrowPeriodType @@ -88,6 +93,9 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail( + pa_version_under2p0, reason="pyarrow incorrectly uses pandas internals API" +) def test_arrow_load_from_zero_chunks(): # GH-41040 @@ -106,6 +114,9 @@ def test_arrow_load_from_zero_chunks(): tm.assert_frame_equal(result, df) +@pytest.mark.xfail( + pa_version_under2p0, reason="pyarrow incorrectly uses pandas internals API" +) def test_arrow_table_roundtrip_without_metadata(): arr = PeriodArray([1, 2, 3], freq="H") arr[1] = pd.NaT diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index d7c39c0e0708eb..012fe61fdba05f 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -34,26 +34,23 @@ class TestSparseArrayArithmetics: def _assert(self, a, b): tm.assert_numpy_array_equal(a, b) - def _check_numeric_ops(self, a, b, a_dense, b_dense, mix, op): + def _check_numeric_ops(self, a, b, a_dense, b_dense, mix: bool, op): + # Check that arithmetic behavior matches non-Sparse 
Series arithmetic + + if isinstance(a_dense, np.ndarray): + expected = op(pd.Series(a_dense), b_dense).values + elif isinstance(b_dense, np.ndarray): + expected = op(a_dense, pd.Series(b_dense)).values + else: + raise NotImplementedError + with np.errstate(invalid="ignore", divide="ignore"): if mix: result = op(a, b_dense).to_dense() else: result = op(a, b).to_dense() - if op in [operator.truediv, ops.rtruediv]: - # pandas uses future division - expected = op(a_dense * 1.0, b_dense) - else: - expected = op(a_dense, b_dense) - - if op in [operator.floordiv, ops.rfloordiv]: - # Series sets 1//0 to np.inf, which SparseArray does not do (yet) - mask = np.isinf(expected) - if mask.any(): - expected[mask] = np.nan - - self._assert(result, expected) + self._assert(result, expected) def _check_bool_result(self, res): assert isinstance(res, self._klass) @@ -125,7 +122,7 @@ def test_float_scalar( ): op = all_arithmetic_functions - if not np_version_under1p20: + if np_version_under1p20: if op in [operator.floordiv, ops.rfloordiv]: if op is operator.floordiv and scalar != 0: pass @@ -158,9 +155,7 @@ def test_float_scalar_comparison(self, kind): self._check_comparison_ops(a, 0, values, 0) self._check_comparison_ops(a, 3, values, 3) - def test_float_same_index_without_nans( - self, kind, mix, all_arithmetic_functions, request - ): + def test_float_same_index_without_nans(self, kind, mix, all_arithmetic_functions): # when sp_index are the same op = all_arithmetic_functions @@ -178,13 +173,12 @@ def test_float_same_index_with_nans( op = all_arithmetic_functions if ( - not np_version_under1p20 + np_version_under1p20 and op is ops.rfloordiv and not (mix and kind == "block") ): mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172") request.node.add_marker(mark) - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) @@ -360,11 +354,7 @@ def test_bool_array_logical(self, kind, fill_value): def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request): op = all_arithmetic_functions - if ( - not np_version_under1p20 - and op in [operator.floordiv, ops.rfloordiv] - and mix - ): + if np_version_under1p20 and op in [operator.floordiv, ops.rfloordiv] and mix: mark = pytest.mark.xfail(raises=AssertionError, reason="GH#38172") request.node.add_marker(mark) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index ec91ab86ad3fa1..596860e75f0e5e 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1446,11 +1446,21 @@ class TestMinMax: ], ) def test_nan_fill_value(self, raw_data, max_expected, min_expected): - max_result = SparseArray(raw_data).max() - min_result = SparseArray(raw_data).min() + arr = SparseArray(raw_data) + max_result = arr.max() + min_result = arr.min() assert max_result in max_expected assert min_result in min_expected + max_result = arr.max(skipna=False) + min_result = arr.min(skipna=False) + if np.isnan(raw_data).any(): + assert np.isnan(max_result) + assert np.isnan(min_result) + else: + assert max_result in max_expected + assert min_result in min_expected + @pytest.mark.parametrize( "fill_value,max_expected,min_expected", [ @@ -1468,6 +1478,16 @@ def test_fill_value(self, fill_value, max_expected, min_expected): min_result = arr.min() assert min_result == min_expected + def test_only_fill_value(self): + fv = 100 + arr = SparseArray(np.array([fv, fv, fv]), dtype=SparseDtype("int", fv)) + 
assert len(arr._valid_sp_values) == 0 + + assert arr.max() == fv + assert arr.min() == fv + assert arr.max(skipna=False) == fv + assert arr.min(skipna=False) == fv + @pytest.mark.parametrize("func", ["min", "max"]) @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])]) @pytest.mark.parametrize( @@ -1482,7 +1502,8 @@ def test_fill_value(self, fill_value, max_expected, min_expected): def test_na_value_if_no_valid_values(self, func, data, dtype, expected): arr = SparseArray(data, dtype=dtype) result = getattr(arr, func)() - if expected == pd.NaT: - assert result == pd.NaT + if expected is pd.NaT: + # TODO: pin down whether we wrap datetime64("NaT") + assert result is pd.NaT or np.isnat(result) else: assert np.isnan(result) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 722222aab6d27b..7c3a8c691b7866 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -217,15 +217,18 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_scalar_not_string(comparison_op, dtype, request): +def test_comparison_methods_scalar_not_string(comparison_op, dtype): op_name = f"__{comparison_op.__name__}__" - if op_name not in ["__eq__", "__ne__"]: - reason = "comparison op not supported between instances of 'str' and 'int'" - mark = pytest.mark.xfail(raises=TypeError, reason=reason) - request.node.add_marker(mark) a = pd.array(["a", None, "c"], dtype=dtype) other = 42 + + if op_name not in ["__eq__", "__ne__"]: + with pytest.raises(TypeError, match="not supported between"): + getattr(a, op_name)(other) + + return + result = getattr(a, op_name)(other) expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ op_name @@ -234,12 +237,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype, request): tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_array(comparison_op, dtype, request): - if dtype.storage == "pyarrow": - mark = pytest.mark.xfail( - raises=AssertionError, reason="left is not an ExtensionArray" - ) - request.node.add_marker(mark) +def test_comparison_methods_array(comparison_op, dtype): op_name = f"__{comparison_op.__name__}__" @@ -340,6 +338,17 @@ def test_reduce(skipna, dtype): assert result == "abc" +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.xfail(reason="Not implemented StringArray.sum") +def test_reduce_missing(skipna, dtype): + arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) + result = arr.sum(skipna=skipna) + if skipna: + assert result == "abc" + else: + assert pd.isna(result) + + @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) def test_min_max(method, skipna, dtype, request): @@ -374,17 +383,6 @@ def test_min_max_numpy(method, box, dtype, request): assert result == expected -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") -def test_reduce_missing(skipna, dtype): - arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) - result = arr.sum(skipna=skipna) - if skipna: - assert result == "abc" - else: - assert pd.isna(result) - - def test_fillna_args(dtype, request): # GH 37987 diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index a5d622b78ff39c..7484fdccf49371 100644 --- 
a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -814,7 +814,7 @@ def test_to_period_2d(self, arr1d): expected = arr1d.to_period("D").reshape(1, -1) tm.assert_period_array_equal(result, expected) - @pytest.mark.parametrize("propname", DatetimeIndex._bool_ops) + @pytest.mark.parametrize("propname", DatetimeArray._bool_ops) def test_bool_properties(self, arr1d, propname): # in this case _bool_ops is just `is_leap_year` dti = self.index_cls(arr1d) @@ -826,16 +826,20 @@ def test_bool_properties(self, arr1d, propname): tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("propname", DatetimeIndex._field_ops) + @pytest.mark.parametrize("propname", DatetimeArray._field_ops) def test_int_properties(self, arr1d, propname): + warn = None + msg = "weekofyear and week have been deprecated, please use" if propname in ["week", "weekofyear"]: # GH#33595 Deprecate week and weekofyear - return + warn = FutureWarning + dti = self.index_cls(arr1d) arr = arr1d - result = getattr(arr, propname) - expected = np.array(getattr(dti, propname), dtype=result.dtype) + with tm.assert_produces_warning(warn, match=msg): + result = getattr(arr, propname) + expected = np.array(getattr(dti, propname), dtype=result.dtype) tm.assert_numpy_array_equal(result, expected) @@ -979,7 +983,7 @@ def test_total_seconds(self, timedelta_index): tm.assert_numpy_array_equal(result, expected.values) - @pytest.mark.parametrize("propname", TimedeltaIndex._field_ops) + @pytest.mark.parametrize("propname", TimedeltaArray._field_ops) def test_int_properties(self, timedelta_index, propname): tdi = timedelta_index arr = TimedeltaArray(tdi) diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 9b9945495a7331..66f7bf1f4d743c 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -194,6 +194,38 @@ def test_validate_reduction_keyword_args(): arr.all(keepdims=True) +def test_np_max_nested_tuples(): + # case where checking in ufunc.nout works while checking for tuples + # does not + vals = [ + (("j", "k"), ("l", "m")), + (("l", "m"), ("o", "p")), + (("o", "p"), ("j", "k")), + ] + ser = pd.Series(vals) + arr = ser.array + + assert arr.max() is arr[2] + assert ser.max() is arr[2] + + result = np.maximum.reduce(arr) + assert result == arr[2] + + result = np.maximum.reduce(ser) + assert result == arr[2] + + +def test_np_reduce_2d(): + raw = np.arange(12).reshape(4, 3) + arr = PandasArray(raw) + + res = np.maximum.reduce(arr, axis=0) + tm.assert_extension_array_equal(res, arr[-1]) + + alt = arr.max(axis=0) + tm.assert_extension_array_equal(alt, arr[-1]) + + # ---------------------------------------------------------------------------- # Ops diff --git a/pandas/tests/base/test_fillna.py b/pandas/tests/base/test_fillna.py index 32c9d288e665db..7300d3013305a7 100644 --- a/pandas/tests/base/test_fillna.py +++ b/pandas/tests/base/test_fillna.py @@ -6,9 +6,7 @@ import numpy as np import pytest -from pandas.core.dtypes.generic import ABCMultiIndex - -from pandas import Index +from pandas import MultiIndex import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -16,16 +14,18 @@ def test_fillna(index_or_series_obj): # GH 11343 obj = index_or_series_obj - if isinstance(obj, ABCMultiIndex): - pytest.skip("MultiIndex doesn't support isna") + + if isinstance(obj, MultiIndex): + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + obj.fillna(0) + return # values will 
not be changed fill_value = obj.values[0] if len(obj) > 0 else 0 result = obj.fillna(fill_value) - if isinstance(obj, Index): - tm.assert_index_equal(obj, result) - else: - tm.assert_series_equal(obj, result) + + tm.assert_equal(obj, result) # check shallow_copied assert obj is not result @@ -41,7 +41,7 @@ def test_fillna_null(null_obj, index_or_series_obj): pytest.skip(f"{klass} doesn't allow for NA operations") elif len(obj) < 1: pytest.skip("Test doesn't make sense on empty data") - elif isinstance(obj, ABCMultiIndex): + elif isinstance(obj, MultiIndex): pytest.skip(f"MultiIndex can't hold '{null_obj}'") values = obj._values diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index aaaec46399fa86..8372ec92ec26eb 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -18,6 +18,7 @@ Index, Series, ) +import pandas._testing as tm @pytest.mark.parametrize( @@ -109,8 +110,9 @@ def test_memory_usage_components_series(series_with_simple_index): assert total_usage == non_index_usage + index_usage -def test_memory_usage_components_narrow_series(narrow_series): - series = narrow_series +@pytest.mark.parametrize("dtype", tm.NARROW_NP_DTYPES) +def test_memory_usage_components_narrow_series(dtype): + series = tm.makeFloatSeries(name="a").astype(dtype) total_usage = series.memory_usage(index=True) non_index_usage = series.memory_usage(index=False) index_usage = series.index.memory_usage() diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 31f2aebcba4ba2..59c1c61a4c8cc8 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -123,6 +123,6 @@ def test_unique_bad_unicode(index_or_series): @pytest.mark.parametrize("dropna", [True, False]) def test_nunique_dropna(dropna): # GH37566 - s = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT]) - res = s.nunique(dropna) + ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT]) + res = ser.nunique(dropna) assert res == 1 if dropna else 5 diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 2970c973b187b9..6130646bb52c51 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas.compat import np_array_datetime64_compat - import pandas as pd from pandas import ( DatetimeIndex, @@ -212,7 +210,7 @@ def test_value_counts_datetime64(index_or_series): expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) - expected = np_array_datetime64_compat( + expected = np.array( ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], dtype="datetime64[ns]", ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index be5cb81506efd4..e2c5f893b6a2c7 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -20,6 +20,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.types import pandas_dtype from pandas.arrays import SparseArray @@ -115,6 +116,7 @@ def test_period_dtype(self, dtype): "float": np.dtype(np.float64), "object": np.dtype(object), "category": com.pandas_dtype("category"), + "string": pd.StringDtype(), } @@ -128,6 +130,12 @@ def test_dtype_equal(name1, dtype1, name2, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) +@pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x)) +def test_pyarrow_string_import_error(name, dtype): + # 
GH-44276 + assert not com.is_dtype_equal(dtype, "string[pyarrow]") + + @pytest.mark.parametrize( "dtype1,dtype2", [ @@ -400,6 +408,23 @@ def test_is_int64_dtype(dtype): assert com.is_int64_dtype(dtype) +def test_type_comparison_with_numeric_ea_dtype(any_numeric_ea_dtype): + # GH#43038 + assert pandas_dtype(any_numeric_ea_dtype) == any_numeric_ea_dtype + + +def test_type_comparison_with_real_numpy_dtype(any_real_numpy_dtype): + # GH#43038 + assert pandas_dtype(any_real_numpy_dtype) == any_real_numpy_dtype + + +def test_type_comparison_with_signed_int_ea_dtype_and_signed_int_numpy_dtype( + any_signed_int_ea_dtype, any_signed_int_numpy_dtype +): + # GH#43038 + assert not pandas_dtype(any_signed_int_ea_dtype) == any_signed_int_numpy_dtype + + @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 5936248456ca74..7953d650636be1 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -75,6 +75,56 @@ def coerce(request): return request.param +class MockNumpyLikeArray: + """ + A class which is numpy-like (e.g. Pint's Quantity) but not actually numpy + + The key is that it is not actually a numpy array so + ``util.is_array(mock_numpy_like_array_instance)`` returns ``False``. Other + important properties are that the class defines a :meth:`__iter__` method + (so that ``isinstance(abc.Iterable)`` returns ``True``) and has a + :meth:`ndim` property, as pandas special-cases 0-dimensional arrays in some + cases. + + We expect pandas to behave with respect to such duck arrays exactly as + with real numpy arrays. In particular, a 0-dimensional duck array is *NOT* + a scalar (`is_scalar(np.array(1)) == False`), but it is not list-like either. + """ + + def __init__(self, values): + self._values = values + + def __iter__(self): + iter_values = iter(self._values) + + def it_outer(): + yield from iter_values + + return it_outer() + + def __len__(self): + return len(self._values) + + def __array__(self, t=None): + return np.asarray(self._values, dtype=t) + + @property + def ndim(self): + return self._values.ndim + + @property + def dtype(self): + return self._values.dtype + + @property + def size(self): + return self._values.size + + @property + def shape(self): + return self._values.shape + + # collect all objects to be tested for list-like-ness; use tuples of objects, # whether they are list-like or not (special casing for sets), and their ID ll_params = [ @@ -109,6 +159,15 @@ def coerce(request): (np.ndarray((2,) * 4), True, "ndarray-4d"), (np.array([[[[]]]]), True, "ndarray-4d-empty"), (np.array(2), False, "ndarray-0d"), + (MockNumpyLikeArray(np.ndarray((2,) * 1)), True, "duck-ndarray-1d"), + (MockNumpyLikeArray(np.array([])), True, "duck-ndarray-1d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 2)), True, "duck-ndarray-2d"), + (MockNumpyLikeArray(np.array([[]])), True, "duck-ndarray-2d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 3)), True, "duck-ndarray-3d"), + (MockNumpyLikeArray(np.array([[[]]])), True, "duck-ndarray-3d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 4)), True, "duck-ndarray-4d"), + (MockNumpyLikeArray(np.array([[[[]]]])), True, "duck-ndarray-4d-empty"), + (MockNumpyLikeArray(np.array(2)), False, "duck-ndarray-0d"), (1, False, "int"), (b"123", False, "bytes"), (b"", False, "bytes-empty"), @@ -181,6 +240,8 @@ def test_is_array_like(): assert inference.is_array_like(Series([1, 2])) assert inference.is_array_like(np.array(["a", "b"])) assert 
inference.is_array_like(Index(["2016-01-01"])) + assert inference.is_array_like(np.array([2, 3])) + assert inference.is_array_like(MockNumpyLikeArray(np.array([2, 3]))) class DtypeList(list): dtype = "special" @@ -1811,9 +1872,13 @@ def test_is_scalar_numpy_zerodim_arrays(self): @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_is_scalar_numpy_arrays(self): - assert not is_scalar(np.array([])) - assert not is_scalar(np.array([[]])) - assert not is_scalar(np.matrix("1; 2")) + for a in [ + np.array([]), + np.array([[]]), + np.matrix("1; 2"), + ]: + assert not is_scalar(a) + assert not is_scalar(MockNumpyLikeArray(a)) def test_is_scalar_pandas_scalars(self): assert is_scalar(Timestamp("2014-01-01")) diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index 1a330bb584ba55..fad28c1896ad00 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -26,6 +26,7 @@ ) from pandas.api.types import is_scalar from pandas.core.arraylike import OpsMixin +from pandas.core.construction import extract_array @register_extension_dtype @@ -77,6 +78,16 @@ class ArrowExtensionArray(OpsMixin, ExtensionArray): @classmethod def from_scalars(cls, values): + if isinstance(values, cls): + # in particular for empty cases the pa.array(np.asarray(...)) + # does not round-trip + return cls(values._data) + + elif not len(values): + if isinstance(values, list): + dtype = bool if cls is ArrowBoolArray else str + values = np.array([], dtype=dtype) + arr = pa.chunked_array([pa.array(np.asarray(values))]) return cls(arr) @@ -92,6 +103,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): def __repr__(self): return f"{type(self).__name__}({repr(self._data)})" + def __contains__(self, obj) -> bool: + if obj is None or obj is self.dtype.na_value: + # None -> EA.__contains__ only checks for self._dtype.na_value, not + # any compatible NA value. + # self.dtype.na_value -> isn't recognized by pd.isna + return bool(self.isna().any()) + return bool(super().__contains__(obj)) + def __getitem__(self, item): if is_scalar(item): return self._data.to_pandas()[item] @@ -125,7 +144,8 @@ def _logical_method(self, other, op): def __eq__(self, other): if not isinstance(other, type(self)): - return False + # TODO: use some pyarrow function here? 
+ return np.asarray(self).__eq__(other) return self._logical_method(other, operator.eq) @@ -144,6 +164,7 @@ def isna(self): def take(self, indices, allow_fill=False, fill_value=None): data = self._data.to_pandas() + data = extract_array(data, extract_numpy=True) if allow_fill and fill_value is None: fill_value = self.dtype.na_value diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 9564239f119f37..a73684868e3aec 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -54,8 +54,8 @@ def test_view(self, data): data.view() @pytest.mark.xfail( - raises=AttributeError, - reason="__eq__ incorrectly returns bool instead of ndarray[bool]", + raises=AssertionError, + reason="Doesn't recognize data._na_value as NA", ) def test_contains(self, data, data_missing): super().test_contains(data, data_missing) @@ -77,7 +77,7 @@ def test_series_constructor_scalar_na_with_index(self, dtype, na_value): # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays super().test_series_constructor_scalar_na_with_index(dtype, na_value) - @pytest.mark.xfail(reason="raises AssertionError") + @pytest.mark.xfail(reason="ufunc 'invert' not supported for the input types") def test_construct_empty_dataframe(self, dtype): super().test_construct_empty_dataframe(dtype) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index a86c07c6043208..a201366152c2f3 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -5,12 +5,9 @@ import pytest from pandas._libs.missing import is_matching_na -from pandas.compat import ( - IS64, - is_platform_windows, -) import pandas as pd +from pandas.core.arrays.integer import INT_STR_TO_DTYPE from pandas.tests.extension.base.base import BaseExtensionTests @@ -153,10 +150,7 @@ def test_fillna_2d_method(self, data_missing, method): self.assert_extension_array_equal(result, expected) @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) - def test_reductions_2d_axis_none(self, data, method, request): - if not hasattr(data, method): - pytest.skip("test is not applicable for this type/dtype") - + def test_reductions_2d_axis_none(self, data, method): arr2d = data.reshape(1, -1) err_expected = None @@ -181,10 +175,7 @@ def test_reductions_2d_axis_none(self, data, method, request): assert is_matching_na(result, expected) or result == expected @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) - def test_reductions_2d_axis0(self, data, method, request): - if not hasattr(data, method): - pytest.skip("test is not applicable for this type/dtype") - + def test_reductions_2d_axis0(self, data, method): arr2d = data.reshape(1, -1) kwargs = {} @@ -203,32 +194,29 @@ def test_reductions_2d_axis0(self, data, method, request): else: raise AssertionError("Both reductions should raise or neither") + def get_reduction_result_dtype(dtype): + # windows and 32bit builds will in some cases have int32/uint32 + # where other builds will have int64/uint64. + if dtype.itemsize == 8: + return dtype + elif dtype.kind in "ib": + return INT_STR_TO_DTYPE[np.dtype(int).name] + else: + # i.e. 
dtype.kind == "u" + return INT_STR_TO_DTYPE[np.dtype(np.uint).name] + if method in ["mean", "median", "sum", "prod"]: # std and var are not dtype-preserving expected = data - if method in ["sum", "prod"] and data.dtype.kind in ["i", "u"]: - # FIXME: kludge - if data.dtype.kind == "i": - if is_platform_windows() or not IS64: - # FIXME: kludge for 32bit builds - if result.dtype.itemsize == 4: - dtype = pd.Int32Dtype() - else: - dtype = pd.Int64Dtype() - else: - dtype = pd.Int64Dtype() - else: - if is_platform_windows() or not IS64: - # FIXME: kludge for 32bit builds - if result.dtype.itemsize == 4: - dtype = pd.UInt32Dtype() - else: - dtype = pd.UInt64Dtype() - else: - dtype = pd.UInt64Dtype() + if method in ["sum", "prod"] and data.dtype.kind in "iub": + dtype = get_reduction_result_dtype(data.dtype) expected = data.astype(dtype) - assert type(expected) == type(data), type(expected) + if data.dtype.kind == "b" and method in ["sum", "prod"]: + # We get IntegerArray instead of BooleanArray + pass + else: + assert type(expected) == type(data), type(expected) assert dtype == expected.dtype self.assert_extension_array_equal(result, expected) @@ -237,10 +225,7 @@ def test_reductions_2d_axis0(self, data, method, request): # punt on method == "var" @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) - def test_reductions_2d_axis1(self, data, method, request): - if not hasattr(data, method): - pytest.skip("test is not applicable for this type/dtype") - + def test_reductions_2d_axis1(self, data, method): arr2d = data.reshape(1, -1) try: diff --git a/pandas/tests/extension/date/__init__.py b/pandas/tests/extension/date/__init__.py new file mode 100644 index 00000000000000..2a8c7e9f57a5da --- /dev/null +++ b/pandas/tests/extension/date/__init__.py @@ -0,0 +1,6 @@ +from pandas.tests.extension.date.array import ( + DateArray, + DateDtype, +) + +__all__ = ["DateArray", "DateDtype"] diff --git a/pandas/tests/extension/date/array.py b/pandas/tests/extension/date/array.py new file mode 100644 index 00000000000000..d29ed293e71ed5 --- /dev/null +++ b/pandas/tests/extension/date/array.py @@ -0,0 +1,180 @@ +import datetime as dt +from typing import ( + Any, + Optional, + Sequence, + Tuple, + Union, + cast, +) + +import numpy as np + +from pandas._typing import ( + Dtype, + PositionalIndexer, +) + +from pandas.core.dtypes.dtypes import register_extension_dtype + +from pandas.api.extensions import ( + ExtensionArray, + ExtensionDtype, +) +from pandas.api.types import pandas_dtype + + +@register_extension_dtype +class DateDtype(ExtensionDtype): + @property + def type(self): + return dt.date + + @property + def name(self): + return "DateDtype" + + @classmethod + def construct_from_string(cls, string: str): + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + + if string == cls.__name__: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + @classmethod + def construct_array_type(cls): + return DateArray + + @property + def na_value(self): + return dt.date.min + + def __repr__(self) -> str: + return self.name + + +class DateArray(ExtensionArray): + def __init__( + self, + dates: Union[ + dt.date, + Sequence[dt.date], + Tuple[np.ndarray, np.ndarray, np.ndarray], + np.ndarray, + ], + ) -> None: + if isinstance(dates, dt.date): + self._year = np.array([dates.year]) + self._month = np.array([dates.month]) + self._day = np.array([dates.year]) + return + + ldates = 
len(dates) + if isinstance(dates, list): + # pre-allocate the arrays since we know the size before hand + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + # populate them + for i, (y, m, d) in enumerate( + map(lambda date: (date.year, date.month, date.day), dates) + ): + self._year[i] = y + self._month[i] = m + self._day[i] = d + + elif isinstance(dates, tuple): + # only support triples + if ldates != 3: + raise ValueError("only triples are valid") + # check if all elements have the same type + if any(map(lambda x: not isinstance(x, np.ndarray), dates)): + raise TypeError("invalid type") + ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates) + if not ly == lm == ld: + raise ValueError( + f"tuple members must have the same length: {(ly, lm, ld)}" + ) + self._year = dates[0].astype(np.uint16) + self._month = dates[1].astype(np.uint8) + self._day = dates[2].astype(np.uint8) + + elif isinstance(dates, np.ndarray) and dates.dtype == "U10": + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + + for (i,), (y, m, d) in np.ndenumerate(np.char.split(dates, sep="-")): + self._year[i] = int(y) + self._month[i] = int(m) + self._day[i] = int(d) + + else: + raise TypeError(f"{type(dates)} is not supported") + + @property + def dtype(self) -> ExtensionDtype: + return DateDtype() + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + + if isinstance(dtype, DateDtype): + data = self.copy() if copy else self + else: + data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min) + + return data + + @property + def nbytes(self) -> int: + return self._year.nbytes + self._month.nbytes + self._day.nbytes + + def __len__(self) -> int: + return len(self._year) # all 3 arrays are enforced to have the same length + + def __getitem__(self, item: PositionalIndexer): + if isinstance(item, int): + return dt.date(self._year[item], self._month[item], self._day[item]) + else: + raise NotImplementedError("only ints are supported as indexes") + + def __setitem__(self, key: Union[int, slice, np.ndarray], value: Any): + if not isinstance(key, int): + raise NotImplementedError("only ints are supported as indexes") + + if not isinstance(value, dt.date): + raise TypeError("you can only set datetime.date types") + + self._year[key] = value.year + self._month[key] = value.month + self._day[key] = value.day + + def __repr__(self) -> str: + return f"DateArray{list(zip(self._year, self._month, self._day))}" + + def copy(self) -> "DateArray": + return DateArray((self._year.copy(), self._month.copy(), self._day.copy())) + + def isna(self) -> np.ndarray: + return np.logical_and( + np.logical_and( + self._year == dt.date.min.year, self._month == dt.date.min.month + ), + self._day == dt.date.min.day, + ) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): + if isinstance(scalars, dt.date): + pass + elif isinstance(scalars, DateArray): + pass + elif isinstance(scalars, np.ndarray): + scalars = scalars.astype("U10") # 10 chars for yyyy-mm-dd + return DateArray(scalars) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index fe7ebe4f4fb51c..e58e26fafdc1b0 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py 
@@ -67,8 +67,11 @@ class DecimalArray(OpsMixin, ExtensionScalarOpsMixin, ExtensionArray): def __init__(self, values, dtype=None, copy=False, context=None): for i, val in enumerate(values): - if is_float(val) and np.isnan(val): - values[i] = DecimalDtype.na_value + if is_float(val): + if np.isnan(val): + values[i] = DecimalDtype.na_value + else: + values[i] = DecimalDtype.type(val) elif not isinstance(val, decimal.Decimal): raise TypeError("All values must be of type " + str(decimal.Decimal)) values = np.asarray(values, dtype=object) @@ -124,7 +127,7 @@ def reconstruct(x): else: return DecimalArray._from_sequence(x) - if isinstance(result, tuple): + if ufunc.nout > 1: return tuple(reconstruct(x) for x in result) else: return reconstruct(result) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 8a2da2779df543..f7809dc2e42175 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -427,6 +427,10 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): request.node.add_marker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) + def _check_divmod_op(self, ser, op, other, exc=NotImplementedError): + # We implement divmod + super()._check_divmod_op(ser, op, other, exc=None) + class TestComparisonOps(BaseSparseTests, base.BaseComparisonOpsTests): def _compare_other(self, s, data, comparison_op, other): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 5049116a9320e2..d9351add0fe6de 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -146,9 +146,9 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): if op_name in ["min", "max"]: return None - s = pd.Series(data) + ser = pd.Series(data) with pytest.raises(TypeError): - getattr(s, op_name)(skipna=skipna) + getattr(ser, op_name)(skipna=skipna) class TestMethods(base.BaseMethodsTests): @@ -166,15 +166,15 @@ class TestCasting(base.BaseCastingTests): class TestComparisonOps(base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op, other): + def _compare_other(self, ser, data, op, other): op_name = f"__{op.__name__}__" - result = getattr(s, op_name)(other) - expected = getattr(s.astype(object), op_name)(other).astype("boolean") + result = getattr(ser, op_name)(other) + expected = getattr(ser.astype(object), op_name)(other).astype("boolean") self.assert_series_equal(result, expected) def test_compare_scalar(self, data, comparison_op): - s = pd.Series(data) - self._compare_other(s, data, comparison_op, "abc") + ser = pd.Series(data) + self._compare_other(ser, data, comparison_op, "abc") class TestParsing(base.BaseParsingTests): diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index abb70089f1fef6..4aa150afadef60 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -33,10 +33,7 @@ def test_from_records_with_datetimes(self): arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] dtypes = [("EXPIRY", " 0.0] if observed else expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "gender", "education"], + ), + ) + for i in range(3): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) 
+ else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "normalize, expected_label, expected_values", + [ + (False, "count", [1, 1, 1]), + (True, "proportion", [0.5, 0.5, 1.0]), + ], +) +def test_mixed_groupings(normalize, expected_label, expected_values): + # Test multiple groupings + df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False) + result = gp.value_counts(sort=True, normalize=normalize) + expected = DataFrame( + { + "level_0": [4, 4, 5], + "A": [1, 1, 2], + "level_2": [8, 8, 7], + "B": [1, 3, 2], + expected_label: expected_values, + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "test, expected_names", + [ + ("repeat", ["a", None, "d", "b", "b", "e"]), + ("level", ["a", None, "d", "b", "c", "level_1"]), + ], +) +@pytest.mark.parametrize("as_index", [False, True]) +def test_column_name_clashes(test, expected_names, as_index): + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]}) + if test == "repeat": + df.columns = list("abbde") + else: + df.columns = list("abcd") + ["level_1"] + + if as_index: + result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts() + expected = Series( + data=(1, 1), + index=MultiIndex.from_tuples( + [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)], + names=expected_names, + ), + ) + tm.assert_series_equal(result, expected) + else: + with pytest.raises(ValueError, match="cannot insert"): + df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts() + + +def test_ambiguous_grouping(): + # Test that groupby is not confused by groupings length equal to row count + df = DataFrame({"a": [1, 1]}) + gb = df.groupby([1, 1]) + result = gb.value_counts() + expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"])) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_indexing.py b/pandas/tests/groupby/test_indexing.py index b9f71fd4ed96a3..aea659445801bf 100644 --- a/pandas/tests/groupby/test_indexing.py +++ b/pandas/tests/groupby/test_indexing.py @@ -2,6 +2,7 @@ import random +import numpy as np import pytest import pandas as pd @@ -285,3 +286,20 @@ def test_column_axis(column_group_df): expected = column_group_df.iloc[:, [1, 3]] tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array]) +def test_groupby_duplicated_columns(func): + # GH#44924 + df = pd.DataFrame( + { + "A": [1, 2], + "B": [3, 3], + "C": ["G", "G"], + } + ) + result = df.groupby("C")[func(["A", "B", "A"])].mean() + expected = pd.DataFrame( + [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C") + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index 20fd02b21a744c..6554993c140a1b 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -13,39 +13,61 @@ @pytest.mark.filterwarnings("ignore:\n") # Filter warnings when parallel=True and the function can't be parallelized by Numba class TestEngine: - def test_cython_vs_numba_frame(self, sort, nogil, parallel, nopython): + def test_cython_vs_numba_frame( + self, sort, nogil, parallel, nopython, numba_supported_reductions + ): + func, kwargs = numba_supported_reductions df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) 
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = df.groupby("a", sort=sort).mean( - engine="numba", engine_kwargs=engine_kwargs + gb = df.groupby("a", sort=sort) + result = getattr(gb, func)( + engine="numba", engine_kwargs=engine_kwargs, **kwargs ) - expected = df.groupby("a", sort=sort).mean() - tm.assert_frame_equal(result, expected) + expected = getattr(gb, func)(**kwargs) + # check_dtype can be removed if GH 44952 is addressed + check_dtype = func != "sum" + tm.assert_frame_equal(result, expected, check_dtype=check_dtype) - def test_cython_vs_numba_getitem(self, sort, nogil, parallel, nopython): + def test_cython_vs_numba_getitem( + self, sort, nogil, parallel, nopython, numba_supported_reductions + ): + func, kwargs = numba_supported_reductions df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = df.groupby("a", sort=sort)["c"].mean( - engine="numba", engine_kwargs=engine_kwargs + gb = df.groupby("a", sort=sort)["c"] + result = getattr(gb, func)( + engine="numba", engine_kwargs=engine_kwargs, **kwargs ) - expected = df.groupby("a", sort=sort)["c"].mean() - tm.assert_series_equal(result, expected) + expected = getattr(gb, func)(**kwargs) + # check_dtype can be removed if GH 44952 is addressed + check_dtype = func != "sum" + tm.assert_series_equal(result, expected, check_dtype=check_dtype) - def test_cython_vs_numba_series(self, sort, nogil, parallel, nopython): + def test_cython_vs_numba_series( + self, sort, nogil, parallel, nopython, numba_supported_reductions + ): + func, kwargs = numba_supported_reductions ser = Series(range(3), index=[1, 2, 1], name="foo") engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = ser.groupby(level=0, sort=sort).mean( - engine="numba", engine_kwargs=engine_kwargs + gb = ser.groupby(level=0, sort=sort) + result = getattr(gb, func)( + engine="numba", engine_kwargs=engine_kwargs, **kwargs ) - expected = ser.groupby(level=0, sort=sort).mean() - tm.assert_series_equal(result, expected) + expected = getattr(gb, func)(**kwargs) + # check_dtype can be removed if GH 44952 is addressed + check_dtype = func != "sum" + tm.assert_series_equal(result, expected, check_dtype=check_dtype) - def test_as_index_false_unsupported(self): + def test_as_index_false_unsupported(self, numba_supported_reductions): + func, kwargs = numba_supported_reductions df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) + gb = df.groupby("a", as_index=False) with pytest.raises(NotImplementedError, match="as_index=False"): - df.groupby("a", as_index=False).mean(engine="numba") + getattr(gb, func)(engine="numba", **kwargs) - def test_axis_1_unsupported(self): + def test_axis_1_unsupported(self, numba_supported_reductions): + func, kwargs = numba_supported_reductions df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) + gb = df.groupby("a", axis=1) with pytest.raises(NotImplementedError, match="axis=1"): - df.groupby("a", axis=1).mean(engine="numba") + getattr(gb, func)(engine="numba", **kwargs) diff --git a/pandas/tests/indexes/categorical/test_fillna.py b/pandas/tests/indexes/categorical/test_fillna.py index 400f236fcf8038..09de578f3c649e 100644 --- a/pandas/tests/indexes/categorical/test_fillna.py +++ b/pandas/tests/indexes/categorical/test_fillna.py @@ -25,17 +25,19 @@ def test_fillna_categorical(self): tm.assert_index_equal(result, expected) def 
test_fillna_copies_with_no_nas(self): - # Nothing to fill, should still get a copy + # Nothing to fill, should still get a copy for the Categorical method, + # but OK to get a view on CategoricalIndex method ci = CategoricalIndex([0, 1, 1]) - cat = ci._data result = ci.fillna(0) - assert result._values._ndarray is not cat._ndarray - assert result._values._ndarray.base is None + assert result is not ci + assert tm.shares_memory(result, ci) - # Same check directly on the Categorical object + # But at the EA level we always get a copy. + cat = ci._data result = cat.fillna(0) assert result._ndarray is not cat._ndarray assert result._ndarray.base is None + assert not tm.shares_memory(result, cat) def test_fillna_validates_with_no_nas(self): # We validate the fill value even if fillna is a no-op diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 2c8873acd83030..c60c74479f8b67 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -516,6 +516,11 @@ def test_fillna(self, index): idx = type(index)(values) + msg = "does not support 'downcast'" + with pytest.raises(NotImplementedError, match=msg): + # For now at least, we only raise if there are NAs present + idx.fillna(idx[0], downcast="infer") + expected = np.array([False] * len(idx), dtype=bool) expected[1] = True tm.assert_numpy_array_equal(idx._isnan, expected) @@ -665,6 +670,21 @@ def test_getitem_2d_deprecated(self, simple_index): assert isinstance(res, np.ndarray), type(res) + if not isinstance(idx, RangeIndex): + # GH#44051 RangeIndex already raises + with tm.assert_produces_warning(FutureWarning, match=msg): + res = idx[True] + assert isinstance(res, np.ndarray), type(res) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = idx[False] + assert isinstance(res, np.ndarray), type(res) + else: + msg = "only integers, slices" + with pytest.raises(IndexError, match=msg): + idx[True] + with pytest.raises(IndexError, match=msg): + idx[False] + def test_copy_shares_cache(self, simple_index): # GH32898, GH36840 idx = simple_index diff --git a/pandas/tests/indexes/datetimelike_/test_equals.py b/pandas/tests/indexes/datetimelike_/test_equals.py index 7221e560c1112d..cc90e8f6d9bec4 100644 --- a/pandas/tests/indexes/datetimelike_/test_equals.py +++ b/pandas/tests/indexes/datetimelike_/test_equals.py @@ -166,12 +166,17 @@ def test_equals2(self): assert not idx.equals(pd.Series(idx2)) # Check that we dont raise OverflowError on comparisons outside the - # implementation range + # implementation range GH#28532 oob = Index([timedelta(days=10 ** 6)] * 3, dtype=object) assert not idx.equals(oob) assert not idx2.equals(oob) - # FIXME: oob.apply(np.timedelta64) incorrectly overflows oob2 = Index([np.timedelta64(x) for x in oob], dtype=object) + assert (oob == oob2).all() assert not idx.equals(oob2) assert not idx2.equals(oob2) + + oob3 = oob.map(np.timedelta64) + assert (oob3 == oob).all() + assert not idx.equals(oob3) + assert not idx2.equals(oob3) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 9db6567ca1b565..c6afa3803bcb68 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -651,10 +651,6 @@ def test_get_indexer_mixed_dtypes(self, target): ([date(9999, 1, 1), date(9999, 1, 1)], [-1, -1]), ], ) - # FIXME: these warnings are flaky GH#36131 - @pytest.mark.filterwarnings( - "ignore:Comparison of Timestamp with datetime.date:FutureWarning" - ) def 
test_get_indexer_out_of_bounds_date(self, target, positions): values = DatetimeIndex([Timestamp("2020-01-01"), Timestamp("2020-01-02")]) diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index 5418f3a5964d9e..c44303aa2c8621 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -63,3 +63,9 @@ def test_getitem_2d_deprecated(self, simple_index): with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): with tm.assert_produces_warning(FutureWarning): idx[:, None] + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + # GH#44051 + idx[True] + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + # GH#44051 + idx[False] diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 0005e653694d8c..eed27cd450e9ca 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -71,6 +71,8 @@ def test_insert(idx): tm.assert_frame_equal(left, right, check_dtype=False) tm.assert_series_equal(ts, right["3rd"]) + +def test_insert2(): # GH9250 idx = ( [("test1", i) for i in range(5)] diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 4e6a0bb67cffe8..9f12d621556928 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -361,9 +361,7 @@ def test_union_sort_other_empty(slice_): # default, sort=None other = idx[slice_] tm.assert_index_equal(idx.union(other), idx) - # MultiIndex does not special case empty.union(idx) - # FIXME: don't leave commented-out - # tm.assert_index_equal(other.union(idx), idx) + tm.assert_index_equal(other.union(idx), idx) # sort=False tm.assert_index_equal(idx.union(other, sort=False), idx) diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index c2328872aee1ba..164ed3ec439967 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -16,6 +16,36 @@ class TestToTimestamp: + def test_to_timestamp_non_contiguous(self): + # GH#44100 + dti = date_range("2021-10-18", periods=9, freq="B") + pi = dti.to_period() + + result = pi[::2].to_timestamp() + expected = dti[::2] + tm.assert_index_equal(result, expected) + + result = pi._data[::2].to_timestamp() + expected = dti._data[::2] + # TODO: can we get the freq to round-trip? 
+ tm.assert_datetime_array_equal(result, expected, check_freq=False) + + result = pi[::-1].to_timestamp() + expected = dti[::-1] + tm.assert_index_equal(result, expected) + + result = pi._data[::-1].to_timestamp() + expected = dti._data[::-1] + tm.assert_datetime_array_equal(result, expected, check_freq=False) + + result = pi[::2][::-1].to_timestamp() + expected = dti[::2][::-1] + tm.assert_index_equal(result, expected) + + result = pi._data[::2][::-1].to_timestamp() + expected = dti._data[::2][::-1] + tm.assert_datetime_array_equal(result, expected, check_freq=False) + def test_to_timestamp_freq(self): idx = period_range("2017", periods=12, freq="A-DEC") result = idx.to_timestamp() diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index f07107e9d3277b..e5c85edfaaffac 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -34,10 +34,6 @@ def simple_index(self) -> Index: def index(self, request): return request.param - @pytest.mark.xfail(reason="Goes through a generate_range path") - def test_pickle_compat_construction(self): - super().test_pickle_compat_construction() - def test_where(self): # This is handled in test_indexing pass diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index b383381d9a5c5a..c74a566cc573dd 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -816,14 +816,17 @@ def test_isin_nan_common_object(self, request, nulls_fixture, nulls_fixture2): ) def test_isin_nan_common_float64(self, request, nulls_fixture): - if nulls_fixture is pd.NaT: - pytest.skip("pd.NaT not compatible with Float64Index") - # Float64Index overrides isin, so must be checked separately - if nulls_fixture is pd.NA: - request.node.add_marker( - pytest.mark.xfail(reason="Float64Index cannot contain pd.NA") - ) + if nulls_fixture is pd.NaT or nulls_fixture is pd.NA: + # Check 1) that we cannot construct a Float64Index with this value + # and 2) that with an NaN we do not have .isin(nulls_fixture) + msg = "data is not compatible with Float64Index" + with pytest.raises(ValueError, match=msg): + Float64Index([1.0, nulls_fixture]) + + idx = Float64Index([1.0, np.nan]) + assert not idx.isin([nulls_fixture]).any() + return idx = Float64Index([1.0, nulls_fixture]) res = idx.isin([np.nan]) @@ -1257,13 +1260,19 @@ def test_copy_name2(self): assert index.name == "MyName" assert index2.name == "NewName" - index3 = index.copy(names=["NewName"]) + with tm.assert_produces_warning(FutureWarning): + index3 = index.copy(names=["NewName"]) tm.assert_index_equal(index, index3, check_names=False) assert index.name == "MyName" assert index.names == ["MyName"] assert index3.name == "NewName" assert index3.names == ["NewName"] + def test_copy_names_deprecated(self, simple_index): + # GH44916 + with tm.assert_produces_warning(FutureWarning): + simple_index.copy(names=["a"]) + def test_unique_na(self): idx = Index([2, np.nan, 2, 1], name="my_index") expected = Index([2, np.nan, 1], name="my_index") diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index fff11583e5161a..0407dc02833fd7 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -244,13 +244,17 @@ def test_unique(self, index_flat): result = i.unique() tm.assert_index_equal(result, expected) - def test_searchsorted_monotonic(self, index_flat): + def test_searchsorted_monotonic(self, index_flat, request): # GH17271 index = 
index_flat # not implemented for tuple searches in MultiIndex # or Intervals searches in IntervalIndex if isinstance(index, pd.IntervalIndex): - pytest.skip("Skip check for MultiIndex/IntervalIndex") + mark = pytest.mark.xfail( + reason="IntervalIndex.searchsorted does not support Interval arg", + raises=NotImplementedError, + ) + request.node.add_marker(mark) # nothing to test if the index is empty if index.empty: diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index deeaffaf5b9cc3..f44bbac1226e12 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas.compat import np_datetime64_compat - from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas import ( @@ -273,9 +271,7 @@ def test_constructor_dtypes_to_float64(self, vals): [ [1, 2, 3], np.array([1, 2, 3], dtype=int), - np.array( - [np_datetime64_compat("2011-01-01"), np_datetime64_compat("2011-01-02")] - ), + np.array(["2011-01-01", "2011-01-02"], dtype="datetime64[ns]"), [datetime(2011, 1, 1), datetime(2011, 1, 2)], ], ) @@ -287,14 +283,7 @@ def test_constructor_dtypes_to_categorical(self, vals): @pytest.mark.parametrize( "vals", [ - Index( - np.array( - [ - np_datetime64_compat("2011-01-01"), - np_datetime64_compat("2011-01-02"), - ] - ) - ), + Index(np.array([np.datetime64("2011-01-01"), np.datetime64("2011-01-02")])), Index([datetime(2011, 1, 1), datetime(2011, 1, 2)]), ], ) diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 9acdd52178e0e2..94b9ca62e08edd 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -70,6 +70,14 @@ def test_take(self, index): with pytest.raises(AttributeError, match=msg): index.freq + def test_take_indexer_type(self): + # GH#42875 + integer_index = Index([0, 1, 2, 3]) + scalar_index = 1 + msg = "Expected indices to be array-like" + with pytest.raises(TypeError, match=msg): + integer_index.take(scalar_index) + def test_take_minus1_without_fill(self, index): # -1 does not get treated as NA unless allow_fill=True is passed if len(index) == 0: diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index b3c86f91927ee9..648b79bd288df1 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -272,13 +272,11 @@ def test_symmetric_difference(self, index): (None, None, None), ], ) - def test_corner_union(self, index_flat, fname, sname, expected_name): + def test_corner_union(self, index_flat_unique, fname, sname, expected_name): # GH#9943, GH#9862 # Test unions with various name combinations # Do not test MultiIndex or repeats - index = index_flat - if not index.is_unique: - pytest.skip("Not for MultiIndex or repeated indices") + index = index_flat_unique # Test copy.union(copy) first = index.copy().set_names(fname) @@ -318,10 +316,8 @@ def test_corner_union(self, index_flat, fname, sname, expected_name): (None, None, None), ], ) - def test_union_unequal(self, index_flat, fname, sname, expected_name): - index = index_flat - if not index.is_unique: - pytest.skip("Not for MultiIndex or repeated indices") + def test_union_unequal(self, index_flat_unique, fname, sname, expected_name): + index = index_flat_unique # test copy.union(subset) - need sort for unicode and string first = index.copy().set_names(fname) @@ -340,12 +336,10 @@ def test_union_unequal(self, index_flat, fname, sname, 
expected_name): (None, None, None), ], ) - def test_corner_intersect(self, index_flat, fname, sname, expected_name): + def test_corner_intersect(self, index_flat_unique, fname, sname, expected_name): # GH#35847 # Test intersections with various name combinations - index = index_flat - if not index.is_unique: - pytest.skip("Not for MultiIndex or repeated indices") + index = index_flat_unique # Test copy.intersection(copy) first = index.copy().set_names(fname) @@ -385,10 +379,8 @@ def test_corner_intersect(self, index_flat, fname, sname, expected_name): (None, None, None), ], ) - def test_intersect_unequal(self, index_flat, fname, sname, expected_name): - index = index_flat - if not index.is_unique: - pytest.skip("Not for MultiIndex or repeated indices") + def test_intersect_unequal(self, index_flat_unique, fname, sname, expected_name): + index = index_flat_unique # test copy.intersection(subset) - need sort for unicode and string first = index.copy().set_names(fname) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index d036773c778e6f..1756cc3ae707c3 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -379,6 +379,26 @@ def test_multiindex_setitem_columns_enlarging(self, indexer, exp_value): ) tm.assert_frame_equal(df, expected) + def test_sorted_multiindex_after_union(self): + # GH#44752 + midx = MultiIndex.from_product( + [pd.date_range("20110101", periods=2), Index(["a", "b"])] + ) + ser1 = Series(1, index=midx) + ser2 = Series(1, index=midx[:2]) + df = pd.concat([ser1, ser2], axis=1) + expected = df.copy() + result = df.loc["2011-01-01":"2011-01-02"] + tm.assert_frame_equal(result, expected) + + df = DataFrame({0: ser1, 1: ser2}) + result = df.loc["2011-01-01":"2011-01-02"] + tm.assert_frame_equal(result, expected) + + df = pd.concat([ser1, ser2.reindex(ser1.index)], axis=1) + result = df.loc["2011-01-01":"2011-01-02"] + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "indexer, pos", diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 2366dd39c25f20..014f0f59333871 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1177,6 +1177,7 @@ def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer): assert obj.dtype == from_key result = obj.replace(replacer) + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key @@ -1197,7 +1198,21 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) + warn = None + rep_ser = pd.Series(replacer) + if ( + isinstance(obj.dtype, pd.DatetimeTZDtype) + and isinstance(rep_ser.dtype, pd.DatetimeTZDtype) + and obj.dtype != rep_ser.dtype + ): + # mismatched tz DatetimeArray behavior will change to cast + # for setitem-like methods with mismatched tzs GH#44940 + warn = FutureWarning + + msg = "explicitly cast to object" + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 688f940e2b3fbf..358689839d6af8 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -712,8 +712,8 @@ def 
test_str_label_slicing_with_negative_step(self): ser, SLC[idx[9] : idx[13] : -1], SLC[:0] ) - def test_slice_with_zero_step_raises(self, indexer_sl, frame_or_series): - obj = frame_or_series(np.arange(20), index=_mklbl("A", 20)) + def test_slice_with_zero_step_raises(self, index, indexer_sl, frame_or_series): + obj = frame_or_series(np.arange(len(index)), index=index) with pytest.raises(ValueError, match="slice step cannot be zero"): indexer_sl(obj)[::0] diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index b577bc7e436df9..ef313b2840107e 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -149,8 +149,10 @@ def create_block(typestr, placement, item_shape=None, num_offset=0, maker=new_bl elif typestr in ("category2",): values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"]) elif typestr in ("sparse", "sparse_na"): - # FIXME: doesn't support num_rows != 10 - assert shape[-1] == 10 + if shape[-1] != 10: + # We also are implicitly assuming this in the category cases above + raise NotImplementedError + assert all(s == 1 for s in shape[:-1]) if typestr.endswith("_na"): fill_value = np.nan @@ -492,7 +494,6 @@ def test_copy(self, mgr): def test_sparse(self): mgr = create_mgr("a: sparse-1; b: sparse-2") - # what to test here? assert mgr.as_array().dtype == np.float64 def test_sparse_mixed(self): @@ -500,8 +501,6 @@ def test_sparse_mixed(self): assert len(mgr.blocks) == 3 assert isinstance(mgr, BlockManager) - # TODO: what to test here? - @pytest.mark.parametrize( "mgr_string, dtype", [("c: f4; d: f2", np.float32), ("c: f4; d: f2; e: f8", np.float64)], @@ -1275,7 +1274,7 @@ def test_interval_can_hold_element(self, dtype, element): def test_period_can_hold_element_emptylist(self): pi = period_range("2016", periods=3, freq="A") - blk = new_block(pi._data, [1], ndim=2) + blk = new_block(pi._data.reshape(1, 3), [1], ndim=2) assert blk._can_hold_element([]) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8291d0c85b50d5..0315783569c238 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -207,14 +207,18 @@ def test_read_excel_multiindex_empty_level(self, ext): @pytest.mark.parametrize("c_idx_levels", [1, 3]) @pytest.mark.parametrize("r_idx_levels", [1, 3]) def test_excel_multindex_roundtrip( - self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels + self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels, request ): # see gh-4679 with tm.ensure_clean(ext) as pth: - if c_idx_levels == 1 and c_idx_names: - pytest.skip( - "Column index name cannot be serialized unless it's a MultiIndex" + if (c_idx_levels == 1 and c_idx_names) and not ( + r_idx_levels == 3 and not r_idx_names + ): + mark = pytest.mark.xfail( + reason="Column index name cannot be serialized unless " + "it's a MultiIndex" ) + request.node.add_marker(mark) # Empty name case current read in as # unnamed levels, not Nones. diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 059fd96db43ad3..8815423d95d653 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -498,9 +498,6 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): # see gh-15008 compression = compression_only - if compression == "zip": - pytest.skip(f"{compression} is not supported for to_csv") - # We'll complete file extension subsequently. 
filename = "test." diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index a61e77bec98288..aa8508d8e89426 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -452,43 +452,59 @@ def test_to_html_invalid_justify(justify): df.to_html(justify=justify) -def test_to_html_index(datapath): - # TODO: split this test - index = ["foo", "bar", "baz"] - df = DataFrame( - {"A": [1, 2, 3], "B": [1.2, 3.4, 5.6], "C": ["one", "two", np.nan]}, - columns=["A", "B", "C"], - index=index, - ) - expected_with_index = expected_html(datapath, "index_1") - assert df.to_html() == expected_with_index - - expected_without_index = expected_html(datapath, "index_2") - result = df.to_html(index=False) - for i in index: - assert i not in result - assert result == expected_without_index - df.index = Index(["foo", "bar", "baz"], name="idx") - expected_with_index = expected_html(datapath, "index_3") - assert df.to_html() == expected_with_index - assert df.to_html(index=False) == expected_without_index - - tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] - df.index = MultiIndex.from_tuples(tuples) - - expected_with_index = expected_html(datapath, "index_4") - assert df.to_html() == expected_with_index +class TestHTMLIndex: + @pytest.fixture + def df(self): + index = ["foo", "bar", "baz"] + df = DataFrame( + {"A": [1, 2, 3], "B": [1.2, 3.4, 5.6], "C": ["one", "two", np.nan]}, + columns=["A", "B", "C"], + index=index, + ) + return df - result = df.to_html(index=False) - for i in ["foo", "bar", "car", "bike"]: - assert i not in result - # must be the same result as normal index - assert result == expected_without_index - - df.index = MultiIndex.from_tuples(tuples, names=["idx1", "idx2"]) - expected_with_index = expected_html(datapath, "index_5") - assert df.to_html() == expected_with_index - assert df.to_html(index=False) == expected_without_index + @pytest.fixture + def expected_without_index(self, datapath): + return expected_html(datapath, "index_2") + + def test_to_html_flat_index_without_name( + self, datapath, df, expected_without_index + ): + expected_with_index = expected_html(datapath, "index_1") + assert df.to_html() == expected_with_index + + result = df.to_html(index=False) + for i in df.index: + assert i not in result + assert result == expected_without_index + + def test_to_html_flat_index_with_name(self, datapath, df, expected_without_index): + df.index = Index(["foo", "bar", "baz"], name="idx") + expected_with_index = expected_html(datapath, "index_3") + assert df.to_html() == expected_with_index + assert df.to_html(index=False) == expected_without_index + + def test_to_html_multiindex_without_names( + self, datapath, df, expected_without_index + ): + tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] + df.index = MultiIndex.from_tuples(tuples) + + expected_with_index = expected_html(datapath, "index_4") + assert df.to_html() == expected_with_index + + result = df.to_html(index=False) + for i in ["foo", "bar", "car", "bike"]: + assert i not in result + # must be the same result as normal index + assert result == expected_without_index + + def test_to_html_multiindex_with_names(self, datapath, df, expected_without_index): + tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] + df.index = MultiIndex.from_tuples(tuples, names=["idx1", "idx2"]) + expected_with_index = expected_html(datapath, "index_5") + assert df.to_html() == expected_with_index + assert df.to_html(index=False) == 
expected_without_index @pytest.mark.parametrize("classes", ["sortable draggable", ["sortable", "draggable"]]) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index febeb4d6905623..e0136520bdeb56 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -95,9 +95,6 @@ def test_to_json_compression(compression_only, read_infer, to_infer): # see gh-15008 compression = compression_only - if compression == "zip": - pytest.skip(f"{compression} is not supported for to_csv") - # We'll complete file extension subsequently. filename = "test." diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 89b8783462f7ee..b204d3bb97b6ea 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -185,14 +185,11 @@ def test_as_json_table_type_date_dtypes(self, date_dtype): def test_as_json_table_type_timedelta_dtypes(self, td_dtype): assert as_json_table_type(td_dtype) == "duration" - @pytest.mark.parametrize("str_dtype", [object]) # TODO + @pytest.mark.parametrize("str_dtype", [object]) # TODO(GH#14904) flesh out dtypes? def test_as_json_table_type_string_dtypes(self, str_dtype): assert as_json_table_type(str_dtype) == "string" def test_as_json_table_type_categorical_dtypes(self): - # TODO: I think before is_categorical_dtype(Categorical) - # returned True, but now it's False. Figure out why or - # if it matters assert as_json_table_type(pd.Categorical(["a"]).dtype) == "any" assert as_json_table_type(CategoricalDtype()) == "any" diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py new file mode 100644 index 00000000000000..3daac204aa7303 --- /dev/null +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -0,0 +1,265 @@ +"""Tests for ExtensionDtype Table Schema integration.""" + +from collections import OrderedDict +import datetime as dt +import decimal +import json + +import pytest + +from pandas import ( + DataFrame, + array, +) +from pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.string_ import StringDtype +from pandas.core.series import Series +from pandas.tests.extension.date import ( + DateArray, + DateDtype, +) +from pandas.tests.extension.decimal.array import ( + DecimalArray, + DecimalDtype, +) + +from pandas.io.json._table_schema import ( + as_json_table_type, + build_table_schema, +) + + +class TestBuildSchema: + def setup_method(self, method): + self.da = DateArray([dt.date(2021, 10, 10)]) + self.dc = DecimalArray([decimal.Decimal(10)]) + self.sa = array(["pandas"], dtype="string") + self.ia = array([10], dtype="Int64") + self.df = DataFrame( + { + "A": self.da, + "B": self.dc, + "C": self.sa, + "D": self.ia, + } + ) + + def test_build_table_schema(self): + result = build_table_schema(self.df, version=False) + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "A", "type": "any", "extDtype": "DateDtype"}, + {"name": "B", "type": "any", "extDtype": "decimal"}, + {"name": "C", "type": "any", "extDtype": "string"}, + {"name": "D", "type": "integer", "extDtype": "Int64"}, + ], + "primaryKey": ["index"], + } + assert result == expected + result = build_table_schema(self.df) + assert "pandas_version" in result + + +class TestTableSchemaType: + @pytest.mark.parametrize( + "date_data", + [ + DateArray([dt.date(2021, 10, 10)]), + DateArray(dt.date(2021, 10, 10)), + 
Series(DateArray(dt.date(2021, 10, 10))), + ], + ) + def test_as_json_table_type_ext_date_array_dtype(self, date_data): + assert as_json_table_type(date_data.dtype) == "any" + + def test_as_json_table_type_ext_date_dtype(self): + assert as_json_table_type(DateDtype()) == "any" + + @pytest.mark.parametrize( + "decimal_data", + [ + DecimalArray([decimal.Decimal(10)]), + Series(DecimalArray([decimal.Decimal(10)])), + ], + ) + def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data): + assert as_json_table_type(decimal_data.dtype) == "any" + + def test_as_json_table_type_ext_decimal_dtype(self): + assert as_json_table_type(DecimalDtype()) == "any" + + @pytest.mark.parametrize( + "string_data", + [ + array(["pandas"], dtype="string"), + Series(array(["pandas"], dtype="string")), + ], + ) + def test_as_json_table_type_ext_string_array_dtype(self, string_data): + assert as_json_table_type(string_data.dtype) == "any" + + def test_as_json_table_type_ext_string_dtype(self): + assert as_json_table_type(StringDtype()) == "any" + + @pytest.mark.parametrize( + "integer_data", + [ + array([10], dtype="Int64"), + Series(array([10], dtype="Int64")), + ], + ) + def test_as_json_table_type_ext_integer_array_dtype(self, integer_data): + assert as_json_table_type(integer_data.dtype) == "integer" + + def test_as_json_table_type_ext_integer_dtype(self): + assert as_json_table_type(Int64Dtype()) == "integer" + + +class TestTableOrient: + def setup_method(self, method): + self.da = DateArray([dt.date(2021, 10, 10)]) + self.dc = DecimalArray([decimal.Decimal(10)]) + self.sa = array(["pandas"], dtype="string") + self.ia = array([10], dtype="Int64") + self.df = DataFrame( + { + "A": self.da, + "B": self.dc, + "C": self.sa, + "D": self.ia, + } + ) + + def test_build_date_series(self): + s = Series(self.da, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "any", "extDtype": "DateDtype"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000Z")])]), + ] + ) + + assert result == expected + + def test_build_decimal_series(self): + s = Series(self.dc, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "any", "extDtype": "decimal"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", 10.0)])]), + ] + ) + + assert result == expected + + def test_build_string_series(self): + s = Series(self.sa, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "any", "extDtype": "string"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", 
[OrderedDict([("id", 0), ("a", "pandas")])]), + ] + ) + + assert result == expected + + def test_build_int64_series(self): + s = Series(self.ia, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "integer", "extDtype": "Int64"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", 10)])]), + ] + ) + + assert result == expected + + def test_to_json(self): + df = self.df.copy() + df.index.name = "idx" + result = df.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + OrderedDict({"name": "idx", "type": "integer"}), + OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}), + OrderedDict({"name": "B", "type": "any", "extDtype": "decimal"}), + OrderedDict({"name": "C", "type": "any", "extDtype": "string"}), + OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}), + ] + + schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]}) + data = [ + OrderedDict( + [ + ("idx", 0), + ("A", "2021-10-10T00:00:00.000Z"), + ("B", 10.0), + ("C", "pandas"), + ("D", 10), + ] + ) + ] + expected = OrderedDict([("schema", schema), ("data", data)]) + + assert result == expected diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f228c826bc7952..1cfda5c096fba1 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -22,7 +22,6 @@ DatetimeIndex, Series, Timestamp, - compat, read_json, ) import pandas._testing as tm @@ -245,7 +244,8 @@ def test_roundtrip_categorical(self, request, orient, convert_axes, numpy): @pytest.mark.parametrize("convert_axes", [True, False]) @pytest.mark.parametrize("numpy", [True, False]) - def test_roundtrip_empty(self, orient, convert_axes, numpy, empty_frame): + def test_roundtrip_empty(self, orient, convert_axes, numpy): + empty_frame = DataFrame() data = empty_frame.to_json(orient=orient) result = read_json(data, orient=orient, convert_axes=convert_axes, numpy=numpy) expected = empty_frame.copy() @@ -674,7 +674,8 @@ def test_series_roundtrip_object(self, orient, numpy, dtype, object_series): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_empty(self, orient, numpy, empty_series): + def test_series_roundtrip_empty(self, orient, numpy): + empty_series = Series([], index=[], dtype=np.float64) data = empty_series.to_json(orient=orient) result = read_json(data, typ="series", orient=orient, numpy=numpy) @@ -1275,11 +1276,9 @@ def test_to_json_large_numbers(self, bigNum): expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) - @pytest.mark.skipif(not compat.IS64, reason="GH-35279") + @pytest.mark.parametrize("bigNum", [-(2 ** 63) - 1, 2 ** 64]) def test_read_json_large_numbers(self, bigNum): - # GH20599 - + # GH20599, 26068 json = StringIO('{"articleId":' + str(bigNum) + "}") msg = r"Value is too small|Value is too big" with pytest.raises(ValueError, match=msg): @@ -1324,10 +1323,9 @@ def test_to_jsonl(self): 
tm.assert_frame_equal(read_json(result, lines=True), df) # TODO: there is a near-identical test for pytables; can we share? + @pytest.mark.xfail(reason="GH#13774 encoding kwarg not supported", raises=TypeError) def test_latin_encoding(self): # GH 13774 - pytest.skip("encoding not implemented in .to_json(), xref #13774") - values = [ [b"E\xc9, 17", b"", b"a", b"b", b"c"], [b"E\xc9, 17", b"a", b"b", b"c"], @@ -1675,7 +1673,7 @@ def test_to_json_indent(self, indent): "primaryKey":[ "index" ], - "pandas_version":"0.20.0" + "pandas_version":"1.4.0" }, "data":[ { diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index b5c22e959b4d7a..b4ae54d48dc68a 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -5,7 +5,6 @@ import locale import math import re -import sys import time import dateutil @@ -599,24 +598,23 @@ def test_encode_list_long_conversion(self): np.array(long_input), ujson.decode(output, numpy=True, dtype=np.int64) ) - def test_encode_long_conversion(self): - long_input = 9223372036854775807 + @pytest.mark.parametrize("long_input", [9223372036854775807, 18446744073709551615]) + def test_encode_long_conversion(self, long_input): output = ujson.encode(long_input) assert long_input == json.loads(output) assert output == json.dumps(long_input) assert long_input == ujson.decode(output) - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) - @pytest.mark.xfail(not IS64, reason="GH-35288") + @pytest.mark.parametrize("bigNum", [2 ** 64, -(2 ** 63) - 1]) def test_dumps_ints_larger_than_maxsize(self, bigNum): - # GH34395 - bigNum = sys.maxsize + 1 encoding = ujson.encode(bigNum) assert str(bigNum) == encoding - # GH20599 - with pytest.raises(ValueError, match="Value is too big"): + with pytest.raises( + ValueError, + match="Value is too big|Value is too small", + ): assert ujson.loads(encoding) == bigNum @pytest.mark.parametrize( @@ -1162,11 +1160,12 @@ def test_decode_array(self, arr): def test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.decode(str(extreme_num)) - @pytest.mark.parametrize( - "too_extreme_num", ["9223372036854775808", "-90223372036854775809"] - ) + @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-2**63-1}"]) def test_decode_too_extreme_numbers(self, too_extreme_num): - with pytest.raises(ValueError, match="Value is too big|Value is too small"): + with pytest.raises( + ValueError, + match="Value is too big|Value is too small", + ): ujson.decode(too_extreme_num) def test_decode_with_trailing_whitespaces(self): @@ -1176,9 +1175,13 @@ def test_decode_with_trailing_non_whitespaces(self): with pytest.raises(ValueError, match="Trailing data"): ujson.decode("{}\n\t a") - def test_decode_array_with_big_int(self): - with pytest.raises(ValueError, match="Value is too big"): - ujson.loads("[18446098363113800555]") + @pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"]) + def test_decode_array_with_big_int(self, value): + with pytest.raises( + ValueError, + match="Value is too big|Value is too small", + ): + ujson.loads(value) @pytest.mark.parametrize( "float_number", diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 4c26047d98acc9..8a3f8788a45aaa 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -162,7 +162,6 @@ def test_chunk_begins_with_newline_whitespace(all_parsers): @pytest.mark.slow 
-@pytest.mark.xfail(reason="GH38630, sometimes gives ResourceWarning", strict=False) def test_chunks_have_consistent_numerical_type(all_parsers): parser = all_parsers integers = [str(i) for i in range(499999)] @@ -176,7 +175,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers): assert result.a.dtype == float -def test_warn_if_chunks_have_mismatched_type(all_parsers, request): +def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers size = 10000 @@ -193,24 +192,8 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers, request): buf = StringIO(data) - try: - with tm.assert_produces_warning(warning_type): - df = parser.read_csv(buf) - except AssertionError as err: - # 2021-02-21 this occasionally fails on the CI with an unexpected - # ResourceWarning that we have been unable to track down, - # see GH#38630 - if "ResourceWarning" not in str(err) or parser.engine != "python": - raise - - # Check the main assertion of the test before re-raising - assert df.a.dtype == object - - mark = pytest.mark.xfail( - reason="ResourceWarning for unclosed SSL Socket, GH#38630" - ) - request.node.add_marker(mark) - raise + with tm.assert_produces_warning(warning_type): + df = parser.read_csv(buf) assert df.a.dtype == object diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 85dac4ad89fe43..bde69e365cfd19 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -788,6 +788,22 @@ def test_read_csv_delimiter_and_sep_no_default(all_parsers): parser.read_csv(f, sep=" ", delimiter=".") +@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}]) +def test_read_csv_line_break_as_separator(kwargs, all_parsers): + # GH#43528 + parser = all_parsers + data = """a,b,c +1,2,3 + """ + msg = ( + r"Specified \\n as separator or delimiter. This forces the python engine " + r"which does not accept a line terminator. Hence it is not allowed to use " + r"the line terminator as separator." + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + + def test_read_csv_posargs_deprecation(all_parsers): # GH 41485 f = StringIO("a,b\n1,2") diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index 11ef9d7d691227..4a8f734a34abf0 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -28,7 +28,6 @@ @tm.network def test_url(all_parsers, csv_dir_path): - # TODO: FTP testing parser = all_parsers kwargs = {"sep": "\t"} @@ -347,25 +346,6 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding): assert not handle.closed -def test_memory_map_file_handle_silent_fallback(all_parsers, compression): - """ - Do not fail for buffers with memory_map=True (cannot memory map BytesIO). - - GH 37621 - """ - parser = all_parsers - expected = DataFrame({"a": [1], "b": [2]}) - - handle = BytesIO() - expected.to_csv(handle, index=False, compression=compression, mode="wb") - handle.seek(0) - - tm.assert_frame_equal( - parser.read_csv(handle, memory_map=True, compression=compression), - expected, - ) - - def test_memory_map_compression(all_parsers, compression): """ Support memory map for compressed files. 
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 993f52b00334f0..2b27332c7e85b1 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -299,17 +299,16 @@ def test_readcsv_memmap_utf8(all_parsers): tm.assert_frame_equal(df, dfr) -def test_not_readable(all_parsers, request): +@pytest.mark.usefixtures("pyarrow_xfail") +@pytest.mark.parametrize("mode", ["w+b", "w+t"]) +def test_not_readable(all_parsers, mode): # GH43439 parser = all_parsers - if parser.engine in ("python", "pyarrow"): - mark = pytest.mark.xfail( - reason="SpooledTemporaryFile does only work with the c-engine" - ) - request.node.add_marker(mark) - - with tempfile.SpooledTemporaryFile() as handle: - handle.write(b"abcd") + content = b"abcd" + if "t" in mode: + content = "abcd" + with tempfile.SpooledTemporaryFile(mode=mode) as handle: + handle.write(content) handle.seek(0) df = parser.read_csv(handle) expected = DataFrame([], columns=["abcd"]) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index b0742f5b41a92a..3fc23525df89e4 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -557,26 +557,21 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): else: data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n" + result = parser.read_csv(StringIO(data), header=header, index_col=index_col) + exp_columns = [] + if columns is None: - msg = ( - r"Passed header=\[0,1\] are too " - r"many rows for this multi_index of columns" - ) - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=header, index_col=index_col) - else: - result = parser.read_csv(StringIO(data), header=header, index_col=index_col) - exp_columns = [] + columns = ["", "", ""] - for i, col in enumerate(columns): - if not col: # Unnamed. - col = f"Unnamed: {i if index_col is None else i + 1}_level_0" + for i, col in enumerate(columns): + if not col: # Unnamed. 
+ col = f"Unnamed: {i if index_col is None else i + 1}_level_0" - exp_columns.append(col) + exp_columns.append(col) - columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) - expected = DataFrame([[2, 3], [4, 5]], columns=columns) - tm.assert_frame_equal(result, expected) + columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) + expected = DataFrame([[2, 3], [4, 5]], columns=columns) + tm.assert_frame_equal(result, expected) @skip_pyarrow diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 58b5eebbec3444..f30aba3db917ef 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -332,3 +332,23 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val): result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) expected = DataFrame({"b": [2]}, index=Index([val], name="a")) tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_multiindex_columns_not_leading_index_col(all_parsers): + # GH#38549 + parser = all_parsers + data = """a,b,c,d +e,f,g,h +x,y,1,2 +""" + result = parser.read_csv( + StringIO(data), + header=[0, 1], + index_col=1, + ) + cols = MultiIndex.from_tuples( + [("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"] + ) + expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 101d3b565712da..f9356dfc7d0e36 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -17,6 +17,7 @@ import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @skip_pyarrow @@ -615,3 +616,41 @@ def test_nan_multi_index(all_parsers): ) tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_bool_and_nan_to_bool(all_parsers): + # GH#42808 + parser = all_parsers + data = """0 +NaN +True +False +""" + with pytest.raises(ValueError, match="NA values"): + parser.read_csv(StringIO(data), dtype="bool") + + +def test_bool_and_nan_to_int(all_parsers): + # GH#42808 + parser = all_parsers + data = """0 +NaN +True +False +""" + with pytest.raises(ValueError, match="convert|NoneType"): + parser.read_csv(StringIO(data), dtype="int") + + +def test_bool_and_nan_to_float(all_parsers): + # GH#42808 + parser = all_parsers + data = """0 +NaN +True +False +""" + result = parser.read_csv(StringIO(data), dtype="float") + expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 68fc86c9586eec..1dfd81366de72b 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -21,7 +21,6 @@ from pandas._libs.tslibs import parsing from pandas._libs.tslibs.parsing import parse_datetime_string -from pandas.compat import np_array_datetime64_compat from pandas.compat.pyarrow import pa_version_under6p0 import pandas as pd @@ -1541,7 +1540,7 @@ def test_date_parser_resolution_if_not_ns(all_parsers): """ def date_parser(dt, time): - return np_array_datetime64_compat(dt + "T" + time + "Z", dtype="datetime64[s]") + return np.array(dt + "T" + time, dtype="datetime64[s]") result = parser.read_csv( StringIO(data), @@ -1550,9 +1549,7 @@ def date_parser(dt, time): index_col=["datetime", "prn"], ) - datetimes = 
np_array_datetime64_compat( - ["2013-11-03T19:00:00Z"] * 3, dtype="datetime64[s]" - ) + datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]") expected = DataFrame( data={"rxstatus": ["00E80000"] * 3}, index=MultiIndex.from_tuples( diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index cdfc9f71f169ce..584cf9e5331dc0 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -699,15 +699,15 @@ def test_encoding_mmap(memory_map): GH 23254. """ encoding = "iso8859_1" - data = BytesIO(" 1 A Ä 2\n".encode(encoding)) - df = read_fwf( - data, - header=None, - widths=[2, 2, 2, 2], - encoding=encoding, - memory_map=memory_map, - ) - data.seek(0) + with tm.ensure_clean() as path: + Path(path).write_bytes(" 1 A Ä 2\n".encode(encoding)) + df = read_fwf( + path, + header=None, + widths=[2, 2, 2, 2], + encoding=encoding, + memory_map=memory_map, + ) df_reference = DataFrame([[1, "A", "Ä", 2]]) tm.assert_frame_equal(df, df_reference) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index d0080273537bb1..f35caf38c847f1 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -174,6 +174,28 @@ def test_usecols_implicit_index_col(all_parsers): tm.assert_frame_equal(result, expected) +def test_usecols_index_col_middle(all_parsers): + # GH#9098 + parser = all_parsers + data = """a,b,c,d +1,2,3,4 +""" + result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c") + expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c")) + tm.assert_frame_equal(result, expected) + + +def test_usecols_index_col_end(all_parsers): + # GH#9098 + parser = all_parsers + data = """a,b,c,d +1,2,3,4 +""" + result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d") + expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d")) + tm.assert_frame_equal(result, expected) + + def test_usecols_regex_sep(all_parsers): # see gh-2733 parser = all_parsers diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index fc8d4506abda06..b644c3420150c9 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -188,12 +188,12 @@ def test_select_dtypes(setup_path): _maybe_remove(store, "df") store.append("df", df, data_columns=True) - expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa + expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa:E712 for v in [True, "true", 1]: result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) tm.assert_frame_equal(expected, result) - expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa + expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa:E712 for v in [False, "false", 0]: result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 83c86d4da05e64..cbca8bb64e3503 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -972,7 +972,8 @@ def test_columns_multiindex_modified(setup_path): ) cols2load = list("BCD") cols2load_original = list(cols2load) - df_loaded = read_hdf(path, "df", columns=cols2load) # noqa + # GH#10055 make sure read_hdf call does not 
alter cols2load inplace + read_hdf(path, "df", columns=cols2load) assert cols2load_original == cols2load diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a782f8dbbc76dd..a00268d82a57d4 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -7,6 +7,7 @@ from io import ( BytesIO, StringIO, + UnsupportedOperation, ) import mmap import os @@ -602,3 +603,9 @@ def test_errno_attribute(): with pytest.raises(FileNotFoundError, match="\\[Errno 2\\]") as err: pd.read_csv("doesnt_exist") assert err.errno == errno.ENOENT + + +def test_fail_mmap(): + with pytest.raises(UnsupportedOperation, match="fileno"): + with BytesIO() as buffer: + icom.get_handle(buffer, "rb", memory_map=True) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 59c7abc4a4cb8b..15d41c56c13c17 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,6 +2,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under2p0 + import pandas as pd import pandas._testing as tm @@ -85,7 +87,11 @@ def test_basic(self): ), } ) - df["periods"] = pd.period_range("2013", freq="M", periods=3) + if not pa_version_under2p0: + # older pyarrow incorrectly uses pandas internal API, so + # constructs invalid Block + df["periods"] = pd.period_range("2013", freq="M", periods=3) + df["timedeltas"] = pd.timedelta_range("1 day", periods=3) df["intervals"] = pd.interval_range(0, 3, 3) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f6ae5ebfdf526e..3aac7e95e6591a 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -14,7 +14,6 @@ import pytest from pandas.compat import is_platform_windows -from pandas.errors import ParserError import pandas.util._test_decorators as td from pandas import ( @@ -918,13 +917,8 @@ def test_wikipedia_states_multiindex(self, datapath): assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04) def test_parser_error_on_empty_header_row(self): - msg = ( - r"Passed header=\[0,1\] are too many " - r"rows for this multi_index of columns" - ) - with pytest.raises(ParserError, match=msg): - self.read_html( - """ + result = self.read_html( + """ @@ -935,8 +929,15 @@ def test_parser_error_on_empty_header_row(self):
""", - header=[0, 1], - ) + header=[0, 1], + ) + expected = DataFrame( + [["a", "b"]], + columns=MultiIndex.from_tuples( + [("Unnamed: 0_level_0", "A"), ("Unnamed: 1_level_0", "B")] + ), + ) + tm.assert_frame_equal(result[0], expected) def test_decimal_rows(self): # GH 12907 @@ -1166,6 +1167,10 @@ def test_displayed_only(self, displayed_only, exp0, exp1): else: assert len(dfs) == 1 # Should not parse hidden table + @pytest.mark.filterwarnings( + "ignore:You provided Unicode markup but also provided a value for " + "from_encoding.*:UserWarning" + ) def test_encode(self, html_encoding_file): base_path = os.path.basename(html_encoding_file) root = os.path.splitext(base_path)[0] diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 50d9b75fe9d811..0bd291cea894e7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -648,7 +648,15 @@ def test_use_nullable_dtypes(self, engine, request): "object", "datetime64[ns, UTC]", "float", - "period[D]", + pytest.param( + "period[D]", + # Note: I don't know exactly what version the cutoff is; + # On the CI it fails with 1.0.1 + marks=pytest.mark.xfail( + pa_version_under2p0, + reason="pyarrow uses pandas internal API incorrectly", + ), + ), "Float64", "string", ], @@ -887,6 +895,9 @@ def test_pyarrow_backed_string_array(self, pa, string_storage): check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) @td.skip_if_no("pyarrow") + @pytest.mark.xfail( + pa_version_under2p0, reason="pyarrow uses pandas internal API incorrectly" + ) def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol + by defining a custom ExtensionType diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 92a53a443b2171..cb8ee4891a41e5 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2128,7 +2128,7 @@ def test_get_engine_auto_error_message(self): # Expect different error messages from get_engine(engine="auto") # if engines aren't installed vs. 
are installed but bad version pass - # TODO fill this in when we add more engines + # TODO(GH#36893) fill this in when we add more engines class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): diff --git a/pandas/tests/io/test_user_agent.py b/pandas/tests/io/test_user_agent.py index cabdbbdb448307..fb6128bd514f9d 100644 --- a/pandas/tests/io/test_user_agent.py +++ b/pandas/tests/io/test_user_agent.py @@ -4,7 +4,10 @@ import gzip import http.server from io import BytesIO -import threading +import multiprocessing +import socket +import time +import urllib.error import pytest @@ -39,11 +42,10 @@ def gzip_bytes(self, response_bytes): """ some web servers will send back gzipped files to save bandwidth """ - bio = BytesIO() - zipper = gzip.GzipFile(fileobj=bio, mode="w") - zipper.write(response_bytes) - zipper.close() - response_bytes = bio.getvalue() + with BytesIO() as bio: + with gzip.GzipFile(fileobj=bio, mode="w") as zipper: + zipper.write(response_bytes) + response_bytes = bio.getvalue() return response_bytes def write_back_bytes(self, response_bytes): @@ -178,6 +180,56 @@ def do_GET(self): self.wfile.write(response_bytes) +def wait_until_ready(func, *args, **kwargs): + def inner(*args, **kwargs): + while True: + try: + return func(*args, **kwargs) + except urllib.error.URLError: + # Connection refused as http server is starting + time.sleep(0.1) + + return inner + + +def process_server(responder, port): + with http.server.HTTPServer(("localhost", port), responder) as server: + server.handle_request() + server.server_close() + + +@pytest.fixture +def responder(request): + """ + Fixture that starts a local http server in a separate process on localhost + and returns the port. + + Running in a separate process instead of a thread to allow termination/killing + of http server upon cleanup. 
+ """ + # Find an available port + with socket.socket() as sock: + sock.bind(("localhost", 0)) + port = sock.getsockname()[1] + + server_process = multiprocessing.Process( + target=process_server, args=(request.param, port) + ) + server_process.start() + yield port + server_process.terminate() + kill_time = 5 + wait_time = 0 + while server_process.is_alive(): + if wait_time > kill_time: + server_process.kill() + break + else: + wait_time += 0.1 + time.sleep(0.1) + server_process.close() + + @pytest.mark.parametrize( "responder, read_method, parquet_engine", [ @@ -196,6 +248,7 @@ def do_GET(self): (GzippedCSVUserAgentResponder, pd.read_csv, None), (GzippedJSONUserAgentResponder, pd.read_json, None), ], + indirect=["responder"], ) def test_server_and_default_headers(responder, read_method, parquet_engine): if parquet_engine is not None: @@ -203,19 +256,12 @@ def test_server_and_default_headers(responder, read_method, parquet_engine): if parquet_engine == "fastparquet": pytest.importorskip("fsspec") - # passing 0 for the port will let the system find an unused port - with http.server.HTTPServer(("localhost", 0), responder) as server: - server_thread = threading.Thread(target=server.serve_forever) - server_thread.start() + read_method = wait_until_ready(read_method) + if parquet_engine is None: + df_http = read_method(f"http://localhost:{responder}") + else: + df_http = read_method(f"http://localhost:{responder}", engine=parquet_engine) - port = server.server_port - if parquet_engine is None: - df_http = read_method(f"http://localhost:{port}") - else: - df_http = read_method(f"http://localhost:{port}", engine=parquet_engine) - server.shutdown() - server.server_close() - server_thread.join() assert not df_http.empty @@ -237,6 +283,7 @@ def test_server_and_default_headers(responder, read_method, parquet_engine): (GzippedCSVUserAgentResponder, pd.read_csv, None), (GzippedJSONUserAgentResponder, pd.read_json, None), ], + indirect=["responder"], ) def test_server_and_custom_headers(responder, read_method, parquet_engine): if parquet_engine is not None: @@ -247,27 +294,18 @@ def test_server_and_custom_headers(responder, read_method, parquet_engine): custom_user_agent = "Super Cool One" df_true = pd.DataFrame({"header": [custom_user_agent]}) - # passing 0 for the port will let the system find an unused port - with http.server.HTTPServer(("localhost", 0), responder) as server: - server_thread = threading.Thread(target=server.serve_forever) - server_thread.start() - - port = server.server_port - if parquet_engine is None: - df_http = read_method( - f"http://localhost:{port}", - storage_options={"User-Agent": custom_user_agent}, - ) - else: - df_http = read_method( - f"http://localhost:{port}", - storage_options={"User-Agent": custom_user_agent}, - engine=parquet_engine, - ) - server.shutdown() - - server.server_close() - server_thread.join() + read_method = wait_until_ready(read_method) + if parquet_engine is None: + df_http = read_method( + f"http://localhost:{responder}", + storage_options={"User-Agent": custom_user_agent}, + ) + else: + df_http = read_method( + f"http://localhost:{responder}", + storage_options={"User-Agent": custom_user_agent}, + engine=parquet_engine, + ) tm.assert_frame_equal(df_true, df_http) @@ -277,6 +315,7 @@ def test_server_and_custom_headers(responder, read_method, parquet_engine): [ (AllHeaderCSVResponder, pd.read_csv), ], + indirect=["responder"], ) def test_server_and_all_custom_headers(responder, read_method): custom_user_agent = "Super Cool One" @@ -285,20 +324,11 @@ 
def test_server_and_all_custom_headers(responder, read_method): "User-Agent": custom_user_agent, "Auth": custom_auth_token, } - - # passing 0 for the port will let the system find an unused port - with http.server.HTTPServer(("localhost", 0), responder) as server: - server_thread = threading.Thread(target=server.serve_forever) - server_thread.start() - - port = server.server_port - df_http = read_method( - f"http://localhost:{port}", - storage_options=storage_options, - ) - server.shutdown() - server.server_close() - server_thread.join() + read_method = wait_until_ready(read_method) + df_http = read_method( + f"http://localhost:{responder}", + storage_options=storage_options, + ) df_http = df_http[df_http["0"].isin(storage_options.keys())] df_http = df_http.sort_values(["0"]).reset_index() diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index b8d146c597d2c4..eea6c535f12b62 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -1311,7 +1311,12 @@ def test_filename_and_suffix_comp(parser, comp, compfile): def test_unsuported_compression(datapath, parser): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean() as path: - geom_df.to_xml(path, parser=parser, compression="7z") + # Argument "compression" to "to_xml" of "DataFrame" has incompatible type + # "Literal['7z']"; expected "Union[Literal['infer'], Literal['gzip'], + # Literal['bz2'], Literal['zip'], Literal['xz'], Dict[str, Any], None]" + geom_df.to_xml( + path, parser=parser, compression="7z" # type: ignore[arg-type] + ) # STORAGE OPTIONS diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 70a75bd34be711..2e718073c41749 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1069,7 +1069,10 @@ def test_wrong_compression_zip(parser, comp): def test_unsuported_compression(datapath, parser): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean() as path: - read_xml(path, parser=parser, compression="7z") + # error: Argument "compression" to "read_xml" has incompatible type + # "Literal['7z']"; expected "Union[Literal['infer'], Literal['gzip'], + # Literal['bz2'], Literal['zip'], Literal['xz'], Dict[str, Any], None]" + read_xml(path, parser=parser, compression="7z") # type: ignore[arg-type] # STORAGE OPTIONS diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 75f2dcacf244d5..fe8620ef76c4b7 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -10,10 +10,6 @@ import pandas._config.config as cf -from pandas.compat import ( - is_platform_windows, - np_datetime64_compat, -) import pandas.util._test_decorators as td from pandas import ( @@ -92,11 +88,6 @@ def test_registering_no_warning(self): ax.plot(s.index, s.values) plt.close() - @pytest.mark.xfail( - is_platform_windows(), - reason="Getting two warnings intermittently, see GH#37746", - strict=False, - ) def test_pandas_plots_register(self): plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) @@ -193,21 +184,14 @@ def test_conversion(self): assert rs == xp # also testing datetime64 dtype (GH8614) - rs = self.dtc.convert(np_datetime64_compat("2012-01-01"), None, None) + rs = self.dtc.convert("2012-01-01", None, None) assert rs == xp - rs = self.dtc.convert( - np_datetime64_compat("2012-01-01 00:00:00+0000"), None, None - ) + rs = 
self.dtc.convert("2012-01-01 00:00:00+0000", None, None) assert rs == xp rs = self.dtc.convert( - np.array( - [ - np_datetime64_compat("2012-01-01 00:00:00+0000"), - np_datetime64_compat("2012-01-02 00:00:00+0000"), - ] - ), + np.array(["2012-01-01 00:00:00+0000", "2012-01-02 00:00:00+0000"]), None, None, ) @@ -342,20 +326,16 @@ def test_conversion(self): rs = self.pc.convert(Timestamp("2012-1-1"), None, self.axis) assert rs == xp - rs = self.pc.convert(np_datetime64_compat("2012-01-01"), None, self.axis) + rs = self.pc.convert("2012-01-01", None, self.axis) assert rs == xp - rs = self.pc.convert( - np_datetime64_compat("2012-01-01 00:00:00+0000"), None, self.axis - ) + rs = self.pc.convert("2012-01-01 00:00:00+0000", None, self.axis) assert rs == xp rs = self.pc.convert( np.array( - [ - np_datetime64_compat("2012-01-01 00:00:00+0000"), - np_datetime64_compat("2012-01-02 00:00:00+0000"), - ] + ["2012-01-01 00:00:00+0000", "2012-01-02 00:00:00+0000"], + dtype="datetime64[ns]", ), None, self.axis, diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index f74cab9ed04da0..44113d1e217fd9 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -179,12 +179,7 @@ def check_format_of_first_point(ax, expected_string): first_line = ax.get_lines()[0] first_x = first_line.get_xdata()[0].ordinal first_y = first_line.get_ydata()[0] - try: - assert expected_string == ax.format_coord(first_x, first_y) - except (ValueError): - pytest.skip( - "skipping test because issue forming test comparison GH7664" - ) + assert expected_string == ax.format_coord(first_x, first_y) annual = Series(1, index=date_range("2014-01-01", periods=3, freq="A-DEC")) _, ax = self.plt.subplots() diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index 76320767a6b01d..997f5abe120782 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas.compat import is_platform_windows import pandas.util._test_decorators as td from pandas import ( @@ -20,11 +19,6 @@ @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): - @pytest.mark.xfail( - is_platform_windows(), - reason="Looks like LinePlot._is_ts_plot is wrong", - strict=False, - ) def test_series_groupby_plotting_nominally_works(self): n = 10 weight = Series(np.random.normal(166, 20, size=n)) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 42a5df1f65affd..44fc6042ebaab9 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -720,7 +720,7 @@ def test_custom_business_day_freq(self): _check_plot_works(s.plot) - @pytest.mark.xfail(reason="TODO: reason?") + @pytest.mark.xfail(reason="GH#24426") def test_plot_accessor_updates_on_inplace(self): ser = Series([1, 2, 3, 4]) _, ax = self.plt.subplots() diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index c2daac74fb4ebf..49488e823d6628 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -570,7 +570,9 @@ def test_sum_inf(self): res = nanops.nansum(arr, axis=1) assert np.isinf(res).all() - @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"]) + @pytest.mark.parametrize( + "dtype", ["float64", "Float32", "Int64", "boolean", "object"] + ) @pytest.mark.parametrize("use_bottleneck", [True, 
False]) @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) def test_empty(self, method, unit, use_bottleneck, dtype): diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 0d1bb05c275647..2f1ae5df0d5d4b 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -43,15 +43,14 @@ def test_dt64_mean(self, tz_naive_fixture, box): assert obj.mean(skipna=False) is pd.NaT @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) - def test_period_mean(self, box): + @pytest.mark.parametrize("freq", ["S", "H", "D", "W", "B"]) + def test_period_mean(self, box, freq): # GH#24757 dti = pd.date_range("2001-01-01", periods=11) # shuffle so that we are not just working with monotone-increasing dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) - # use hourly frequency to avoid rounding errors in expected results - # TODO: flesh this out with different frequencies - parr = dti._data.to_period("H") + parr = dti._data.to_period(freq) obj = box(parr) with pytest.raises(TypeError, match="ambiguous"): obj.mean() diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 34e8e2ac3e84a6..6a039e6e22f60f 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1828,3 +1828,27 @@ def test_resample_aggregate_functions_min_count(func): index=DatetimeIndex(["2020-03-31"], dtype="datetime64[ns]", freq="Q-DEC"), ) tm.assert_series_equal(result, expected) + + +def test_resample_unsigned_int(any_unsigned_int_numpy_dtype): + # gh-43329 + df = DataFrame( + index=date_range(start="2000-01-01", end="2000-01-03 23", freq="12H"), + columns=["x"], + data=[0, 1, 0] * 2, + dtype=any_unsigned_int_numpy_dtype, + ) + df = df.loc[(df.index < "2000-01-02") | (df.index > "2000-01-03"), :] + + if any_unsigned_int_numpy_dtype == "uint64": + with pytest.raises(RuntimeError, match="empty group with uint64_t"): + result = df.resample("D").max() + else: + result = df.resample("D").max() + + expected = DataFrame( + [1, np.nan, 0], + columns=["x"], + index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D"), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index b8b254e786194a..d5d86465dd91b0 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -10,48 +10,50 @@ ) import pandas._testing as tm +dt_data = [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), +] +tz_data = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), +] +td_data = [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), +] +period_data = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), +] +data_dict = { + "bool": [True, False, True], + "int64": [1, 2, 3], + "float64": [1.1, np.nan, 3.3], + "category": Categorical(["X", "Y", "Z"]), + "object": ["a", "b", "c"], + "datetime64[ns]": dt_data, + "datetime64[ns, US/Eastern]": tz_data, + "timedelta64[ns]": td_data, + "period[M]": period_data, +} + class TestConcatAppendCommon: """ Test common dtype coercion rules between concat and append. 
""" - def setup_method(self, method): - - dt_data = [ - pd.Timestamp("2011-01-01"), - pd.Timestamp("2011-01-02"), - pd.Timestamp("2011-01-03"), - ] - tz_data = [ - pd.Timestamp("2011-01-01", tz="US/Eastern"), - pd.Timestamp("2011-01-02", tz="US/Eastern"), - pd.Timestamp("2011-01-03", tz="US/Eastern"), - ] - - td_data = [ - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - ] - - period_data = [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Period("2011-03", freq="M"), - ] - - self.data = { - "bool": [True, False, True], - "int64": [1, 2, 3], - "float64": [1.1, np.nan, 3.3], - "category": Categorical(["X", "Y", "Z"]), - "object": ["a", "b", "c"], - "datetime64[ns]": dt_data, - "datetime64[ns, US/Eastern]": tz_data, - "timedelta64[ns]": td_data, - "period[M]": period_data, - } + @pytest.fixture(params=sorted(data_dict.keys())) + def item(self, request): + key = request.param + return key, data_dict[key] + + item2 = item def _check_expected_dtype(self, obj, label): """ @@ -71,192 +73,189 @@ def _check_expected_dtype(self, obj, label): else: raise ValueError - def test_dtypes(self): + def test_dtypes(self, item): # to confirm test case covers intended dtypes - for typ, vals in self.data.items(): - self._check_expected_dtype(Index(vals), typ) - self._check_expected_dtype(Series(vals), typ) + typ, vals = item + self._check_expected_dtype(Index(vals), typ) + self._check_expected_dtype(Series(vals), typ) - def test_concatlike_same_dtypes(self): + def test_concatlike_same_dtypes(self, item): # GH 13660 - for typ1, vals1 in self.data.items(): + typ1, vals1 = item - vals2 = vals1 - vals3 = vals1 + vals2 = vals1 + vals3 = vals1 - if typ1 == "category": - exp_data = Categorical(list(vals1) + list(vals2)) - exp_data3 = Categorical(list(vals1) + list(vals2) + list(vals3)) - else: - exp_data = vals1 + vals2 - exp_data3 = vals1 + vals2 + vals3 - - # ----- Index ----- # - - # index.append - res = Index(vals1).append(Index(vals2)) - exp = Index(exp_data) - tm.assert_index_equal(res, exp) - - # 3 elements - res = Index(vals1).append([Index(vals2), Index(vals3)]) - exp = Index(exp_data3) - tm.assert_index_equal(res, exp) - - # index.append name mismatch - i1 = Index(vals1, name="x") - i2 = Index(vals2, name="y") - res = i1.append(i2) - exp = Index(exp_data) - tm.assert_index_equal(res, exp) - - # index.append name match - i1 = Index(vals1, name="x") - i2 = Index(vals2, name="x") - res = i1.append(i2) - exp = Index(exp_data, name="x") - tm.assert_index_equal(res, exp) - - # cannot append non-index - with pytest.raises(TypeError, match="all inputs must be Index"): - Index(vals1).append(vals2) - - with pytest.raises(TypeError, match="all inputs must be Index"): - Index(vals1).append([Index(vals2), vals3]) - - # ----- Series ----- # - - # series.append - res = Series(vals1).append(Series(vals2), ignore_index=True) - exp = Series(exp_data) - tm.assert_series_equal(res, exp, check_index_type=True) - - # concat - res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # 3 elements - res = Series(vals1).append( - [Series(vals2), Series(vals3)], ignore_index=True - ) - exp = Series(exp_data3) - tm.assert_series_equal(res, exp) - - res = pd.concat( - [Series(vals1), Series(vals2), Series(vals3)], - ignore_index=True, - ) - tm.assert_series_equal(res, exp) - - # name mismatch - s1 = Series(vals1, name="x") - s2 = Series(vals2, name="y") - res = s1.append(s2, ignore_index=True) - exp = 
Series(exp_data) - tm.assert_series_equal(res, exp, check_index_type=True) - - res = pd.concat([s1, s2], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # name match - s1 = Series(vals1, name="x") - s2 = Series(vals2, name="x") - res = s1.append(s2, ignore_index=True) - exp = Series(exp_data, name="x") - tm.assert_series_equal(res, exp, check_index_type=True) - - res = pd.concat([s1, s2], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # cannot append non-index - msg = ( - r"cannot concatenate object of type '.+'; " - "only Series and DataFrame objs are valid" - ) - with pytest.raises(TypeError, match=msg): - Series(vals1).append(vals2) - - with pytest.raises(TypeError, match=msg): - Series(vals1).append([Series(vals2), vals3]) - - with pytest.raises(TypeError, match=msg): - pd.concat([Series(vals1), vals2]) - - with pytest.raises(TypeError, match=msg): - pd.concat([Series(vals1), Series(vals2), vals3]) - - def test_concatlike_dtypes_coercion(self): + if typ1 == "category": + exp_data = Categorical(list(vals1) + list(vals2)) + exp_data3 = Categorical(list(vals1) + list(vals2) + list(vals3)) + else: + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = Index(vals1).append(Index(vals2)) + exp = Index(exp_data) + tm.assert_index_equal(res, exp) + + # 3 elements + res = Index(vals1).append([Index(vals2), Index(vals3)]) + exp = Index(exp_data3) + tm.assert_index_equal(res, exp) + + # index.append name mismatch + i1 = Index(vals1, name="x") + i2 = Index(vals2, name="y") + res = i1.append(i2) + exp = Index(exp_data) + tm.assert_index_equal(res, exp) + + # index.append name match + i1 = Index(vals1, name="x") + i2 = Index(vals2, name="x") + res = i1.append(i2) + exp = Index(exp_data, name="x") + tm.assert_index_equal(res, exp) + + # cannot append non-index + with pytest.raises(TypeError, match="all inputs must be Index"): + Index(vals1).append(vals2) + + with pytest.raises(TypeError, match="all inputs must be Index"): + Index(vals1).append([Index(vals2), vals3]) + + # ----- Series ----- # + + # series.append + res = Series(vals1).append(Series(vals2), ignore_index=True) + exp = Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = Series(vals1).append([Series(vals2), Series(vals3)], ignore_index=True) + exp = Series(exp_data3) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [Series(vals1), Series(vals2), Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) + + # name mismatch + s1 = Series(vals1, name="x") + s2 = Series(vals2, name="y") + res = s1.append(s2, ignore_index=True) + exp = Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # name match + s1 = Series(vals1, name="x") + s2 = Series(vals2, name="x") + res = s1.append(s2, ignore_index=True) + exp = Series(exp_data, name="x") + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # cannot append non-index + msg = ( + r"cannot concatenate object of type '.+'; " + "only Series and DataFrame objs are valid" + ) + with pytest.raises(TypeError, match=msg): + 
Series(vals1).append(vals2) + + with pytest.raises(TypeError, match=msg): + Series(vals1).append([Series(vals2), vals3]) + + with pytest.raises(TypeError, match=msg): + pd.concat([Series(vals1), vals2]) + + with pytest.raises(TypeError, match=msg): + pd.concat([Series(vals1), Series(vals2), vals3]) + + def test_concatlike_dtypes_coercion(self, item, item2): # GH 13660 - for typ1, vals1 in self.data.items(): - for typ2, vals2 in self.data.items(): - - vals3 = vals2 - - # basically infer - exp_index_dtype = None - exp_series_dtype = None - - if typ1 == typ2: - # same dtype is tested in test_concatlike_same_dtypes - continue - elif typ1 == "category" or typ2 == "category": - # TODO: suspicious - continue - - # specify expected dtype - if typ1 == "bool" and typ2 in ("int64", "float64"): - # series coerces to numeric based on numpy rule - # index doesn't because bool is object dtype - exp_series_dtype = typ2 - elif typ2 == "bool" and typ1 in ("int64", "float64"): - exp_series_dtype = typ1 - elif ( - typ1 == "datetime64[ns, US/Eastern]" - or typ2 == "datetime64[ns, US/Eastern]" - or typ1 == "timedelta64[ns]" - or typ2 == "timedelta64[ns]" - ): - exp_index_dtype = object - exp_series_dtype = object - - exp_data = vals1 + vals2 - exp_data3 = vals1 + vals2 + vals3 - - # ----- Index ----- # - - # index.append - res = Index(vals1).append(Index(vals2)) - exp = Index(exp_data, dtype=exp_index_dtype) - tm.assert_index_equal(res, exp) - - # 3 elements - res = Index(vals1).append([Index(vals2), Index(vals3)]) - exp = Index(exp_data3, dtype=exp_index_dtype) - tm.assert_index_equal(res, exp) - - # ----- Series ----- # - - # series.append - res = Series(vals1).append(Series(vals2), ignore_index=True) - exp = Series(exp_data, dtype=exp_series_dtype) - tm.assert_series_equal(res, exp, check_index_type=True) - - # concat - res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # 3 elements - res = Series(vals1).append( - [Series(vals2), Series(vals3)], ignore_index=True - ) - exp = Series(exp_data3, dtype=exp_series_dtype) - tm.assert_series_equal(res, exp) - - res = pd.concat( - [Series(vals1), Series(vals2), Series(vals3)], - ignore_index=True, - ) - tm.assert_series_equal(res, exp) + typ1, vals1 = item + typ2, vals2 = item2 + + vals3 = vals2 + + # basically infer + exp_index_dtype = None + exp_series_dtype = None + + if typ1 == typ2: + # same dtype is tested in test_concatlike_same_dtypes + return + elif typ1 == "category" or typ2 == "category": + # The `vals1 + vals2` below fails bc one of these is a Categorical + # instead of a list; we have separate dedicated tests for categorical + return + + # specify expected dtype + if typ1 == "bool" and typ2 in ("int64", "float64"): + # series coerces to numeric based on numpy rule + # index doesn't because bool is object dtype + exp_series_dtype = typ2 + elif typ2 == "bool" and typ1 in ("int64", "float64"): + exp_series_dtype = typ1 + elif ( + typ1 == "datetime64[ns, US/Eastern]" + or typ2 == "datetime64[ns, US/Eastern]" + or typ1 == "timedelta64[ns]" + or typ2 == "timedelta64[ns]" + ): + exp_index_dtype = object + exp_series_dtype = object + + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = Index(vals1).append(Index(vals2)) + exp = Index(exp_data, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # 3 elements + res = Index(vals1).append([Index(vals2), Index(vals3)]) + exp = Index(exp_data3, dtype=exp_index_dtype) + 
tm.assert_index_equal(res, exp) + + # ----- Series ----- # + + # series.append + res = Series(vals1).append(Series(vals2), ignore_index=True) + exp = Series(exp_data, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = Series(vals1).append([Series(vals2), Series(vals3)], ignore_index=True) + exp = Series(exp_data3, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [Series(vals1), Series(vals2), Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) def test_concatlike_common_coerce_to_pandas_object(self): # GH 13626 diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index c4fe16b43313a1..1af54a1d5cf4ad 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -528,3 +528,16 @@ def test_concat_timedelta64_block(): result = concat([df, df]) tm.assert_frame_equal(result.iloc[:10], df) tm.assert_frame_equal(result.iloc[10:], df) + + +def test_concat_multiindex_datetime_nat(): + # GH#44900 + left = DataFrame({"a": 1}, index=MultiIndex.from_tuples([(1, pd.NaT)])) + right = DataFrame( + {"b": 2}, index=MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)]) + ) + result = concat([left, right], axis="columns") + expected = DataFrame( + {"a": [1.0, np.nan], "b": 2}, MultiIndex.from_tuples([(1, pd.NaT), (2, pd.NaT)]) + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index a4d6a41c7eb503..35cf6703986644 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -257,3 +257,52 @@ def test_concat_multiindex_dfs_with_deepcopy(self): tm.assert_frame_equal(result_copy, expected) result_no_copy = concat(example_dict, names=["testname"]) tm.assert_frame_equal(result_no_copy, expected) + + @pytest.mark.parametrize( + "mi1_list", + [ + [["a"], range(2)], + [["b"], np.arange(2.0, 4.0)], + [["c"], ["A", "B"]], + [["d"], pd.date_range(start="2017", end="2018", periods=2)], + ], + ) + @pytest.mark.parametrize( + "mi2_list", + [ + [["a"], range(2)], + [["b"], np.arange(2.0, 4.0)], + [["c"], ["A", "B"]], + [["d"], pd.date_range(start="2017", end="2018", periods=2)], + ], + ) + def test_concat_with_various_multiindex_dtypes( + self, mi1_list: list, mi2_list: list + ): + # GitHub #23478 + mi1 = MultiIndex.from_product(mi1_list) + mi2 = MultiIndex.from_product(mi2_list) + + df1 = DataFrame(np.zeros((1, len(mi1))), columns=mi1) + df2 = DataFrame(np.zeros((1, len(mi2))), columns=mi2) + + if mi1_list[0] == mi2_list[0]: + expected_mi = MultiIndex( + levels=[mi1_list[0], list(mi1_list[1])], + codes=[[0, 0, 0, 0], [0, 1, 0, 1]], + ) + else: + expected_mi = MultiIndex( + levels=[ + mi1_list[0] + mi2_list[0], + list(mi1_list[1]) + list(mi2_list[1]), + ], + codes=[[0, 0, 1, 1], [0, 1, 2, 3]], + ) + + expected_df = DataFrame(np.zeros((1, len(expected_mi))), columns=expected_mi) + + with tm.assert_produces_warning(None): + result_df = concat((df1, df2), axis=1) + + tm.assert_frame_equal(expected_df, result_df) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 371a7fed543e4d..2f9f31ebb0485a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py 
@@ -597,7 +597,7 @@ def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na): tm.assert_frame_equal(actual, expected) def test_merge_nosort(self): - # GH#2098, TODO: anything to do? + # GH#2098 d = { "var1": np.random.randint(0, 10, size=10), diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index f35033115d2fc4..7d437f3b472742 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -26,7 +26,6 @@ dateutil_gettz, maybe_get_tz, ) -from pandas.compat import np_datetime64_compat import pandas as pd from pandas import ( @@ -92,14 +91,14 @@ def test_construction(self): expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") assert i1 == expected - expected = Period(np_datetime64_compat("2007-01-01 09:00:00.001Z"), freq="L") + expected = Period("2007-01-01 09:00:00.001", freq="L") assert i1 == expected i1 = Period("2007-01-01 09:00:00.00101") expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") assert i1 == expected - expected = Period(np_datetime64_compat("2007-01-01 09:00:00.00101Z"), freq="U") + expected = Period("2007-01-01 09:00:00.00101", freq="U") assert i1 == expected msg = "Must supply freq for ordinal value" @@ -190,8 +189,8 @@ def test_construction_month(self): i1 = Period(date(2007, 1, 1), freq="M") i2 = Period(datetime(2007, 1, 1), freq="M") i3 = Period(np.datetime64("2007-01-01"), freq="M") - i4 = Period(np_datetime64_compat("2007-01-01 00:00:00Z"), freq="M") - i5 = Period(np_datetime64_compat("2007-01-01 00:00:00.000Z"), freq="M") + i4 = Period("2007-01-01 00:00:00", freq="M") + i5 = Period("2007-01-01 00:00:00.000", freq="M") assert i1 == i2 assert i1 == i3 assert i1 == i4 @@ -245,8 +244,8 @@ def test_period_constructor_offsets(self): i1 = Period(date(2007, 1, 1), freq="M") i2 = Period(datetime(2007, 1, 1), freq="M") i3 = Period(np.datetime64("2007-01-01"), freq="M") - i4 = Period(np_datetime64_compat("2007-01-01 00:00:00Z"), freq="M") - i5 = Period(np_datetime64_compat("2007-01-01 00:00:00.000Z"), freq="M") + i4 = Period("2007-01-01 00:00:00", freq="M") + i5 = Period("2007-01-01 00:00:00.000", freq="M") assert i1 == i2 assert i1 == i3 assert i1 == i4 @@ -256,14 +255,14 @@ def test_period_constructor_offsets(self): expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") assert i1 == expected - expected = Period(np_datetime64_compat("2007-01-01 09:00:00.001Z"), freq="L") + expected = Period("2007-01-01 09:00:00.001", freq="L") assert i1 == expected i1 = Period("2007-01-01 09:00:00.00101") expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") assert i1 == expected - expected = Period(np_datetime64_compat("2007-01-01 09:00:00.00101Z"), freq="U") + expected = Period("2007-01-01 09:00:00.00101", freq="U") assert i1 == expected def test_invalid_arguments(self): diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index f2c2985827a4f5..cfb3b504f7a791 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -38,7 +38,7 @@ "nat,idx", [ (Timestamp("NaT"), DatetimeArray), - (Timedelta("NaT"), TimedeltaIndex), + (Timedelta("NaT"), TimedeltaArray), (Period("NaT", freq="M"), PeriodArray), ], ) @@ -68,7 +68,7 @@ def test_nat_fields(nat, idx): def test_nat_vector_field_access(): idx = DatetimeIndex(["1/1/2000", None, None, "1/4/2000"]) - for field in DatetimeIndex._field_ops: + for field in DatetimeArray._field_ops: # weekday is a property of DTI, but a method # on NaT/Timestamp for 
compat with datetime if field == "weekday": diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index b3abec6b9761fb..03ba6b12599a68 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -22,7 +22,6 @@ dateutil_gettz as gettz, get_timezone, ) -from pandas.compat import np_datetime64_compat import pandas.util._test_decorators as td from pandas import ( @@ -492,7 +491,7 @@ def test_nanosecond_timestamp(self): assert t.value == expected assert t.nanosecond == 5 - t = Timestamp(np_datetime64_compat("2011-01-01 00:00:00.000000005Z")) + t = Timestamp("2011-01-01 00:00:00.000000005") assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')" assert t.value == expected assert t.nanosecond == 5 @@ -508,7 +507,7 @@ def test_nanosecond_timestamp(self): assert t.value == expected assert t.nanosecond == 10 - t = Timestamp(np_datetime64_compat("2011-01-01 00:00:00.000000010Z")) + t = Timestamp("2011-01-01 00:00:00.000000010") assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')" assert t.value == expected assert t.nanosecond == 10 diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py index 9dfb36a71c8610..1a038839a67c98 100644 --- a/pandas/tests/series/accessors/test_cat_accessor.py +++ b/pandas/tests/series/accessors/test_cat_accessor.py @@ -14,11 +14,6 @@ timedelta_range, ) import pandas._testing as tm -from pandas.core.arrays import ( - DatetimeArray, - PeriodArray, - TimedeltaArray, -) from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.indexes.accessors import Properties @@ -163,86 +158,84 @@ def test_categorical_delegations(self): ) tm.assert_series_equal(result, expected) - def test_dt_accessor_api_for_categorical(self): + @pytest.mark.parametrize( + "idx", + [ + date_range("1/1/2015", periods=5), + date_range("1/1/2015", periods=5, tz="MET"), + period_range("1/1/2015", freq="D", periods=5), + timedelta_range("1 days", "10 days"), + ], + ) + def test_dt_accessor_api_for_categorical(self, idx): # https://github.com/pandas-dev/pandas/issues/10661 - s_dr = Series(date_range("1/1/2015", periods=5, tz="MET")) - c_dr = s_dr.astype("category") - - s_pr = Series(period_range("1/1/2015", freq="D", periods=5)) - c_pr = s_pr.astype("category") - - s_tdr = Series(timedelta_range("1 days", "10 days")) - c_tdr = s_tdr.astype("category") + ser = Series(idx) + cat = ser.astype("category") # only testing field (like .day) # and bool (is_month_start) - get_ops = lambda x: x._datetimelike_ops - - test_data = [ - ("Datetime", get_ops(DatetimeArray), s_dr, c_dr), - ("Period", get_ops(PeriodArray), s_pr, c_pr), - ("Timedelta", get_ops(TimedeltaArray), s_tdr, c_tdr), - ] + attr_names = type(ser._values)._datetimelike_ops - assert isinstance(c_dr.dt, Properties) + assert isinstance(cat.dt, Properties) special_func_defs = [ ("strftime", ("%Y-%m-%d",), {}), - ("tz_convert", ("EST",), {}), ("round", ("D",), {}), ("floor", ("D",), {}), ("ceil", ("D",), {}), ("asfreq", ("D",), {}), - # FIXME: don't leave commented-out - # ('tz_localize', ("UTC",), {}), ] + if idx.dtype == "M8[ns]": + # exclude dt64tz since that is already localized and would raise + tup = ("tz_localize", ("UTC",), {}) + special_func_defs.append(tup) + elif idx.dtype.kind == "M": + # exclude dt64 since that is not localized so would raise + tup = ("tz_convert", ("EST",), {}) + special_func_defs.append(tup) + _special_func_names = [f[0] 
for f in special_func_defs] - # the series is already localized - _ignore_names = ["tz_localize", "components"] - - for name, attr_names, s, c in test_data: - func_names = [ - f - for f in dir(s.dt) - if not ( - f.startswith("_") - or f in attr_names - or f in _special_func_names - or f in _ignore_names - ) - ] - - func_defs = [(f, (), {}) for f in func_names] - for f_def in special_func_defs: - if f_def[0] in dir(s.dt): - func_defs.append(f_def) - - for func, args, kwargs in func_defs: - with warnings.catch_warnings(): - if func == "to_period": - # dropping TZ - warnings.simplefilter("ignore", UserWarning) - res = getattr(c.dt, func)(*args, **kwargs) - exp = getattr(s.dt, func)(*args, **kwargs) - - tm.assert_equal(res, exp) - - for attr in attr_names: - if attr in ["week", "weekofyear"]: - # GH#33595 Deprecate week and weekofyear - continue - res = getattr(c.dt, attr) - exp = getattr(s.dt, attr) - - if isinstance(res, DataFrame): - tm.assert_frame_equal(res, exp) - elif isinstance(res, Series): - tm.assert_series_equal(res, exp) - else: - tm.assert_almost_equal(res, exp) + _ignore_names = ["components", "tz_localize", "tz_convert"] + + func_names = [ + fname + for fname in dir(ser.dt) + if not ( + fname.startswith("_") + or fname in attr_names + or fname in _special_func_names + or fname in _ignore_names + ) + ] + + func_defs = [(fname, (), {}) for fname in func_names] + + for f_def in special_func_defs: + if f_def[0] in dir(ser.dt): + func_defs.append(f_def) + + for func, args, kwargs in func_defs: + with warnings.catch_warnings(): + if func == "to_period": + # dropping TZ + warnings.simplefilter("ignore", UserWarning) + res = getattr(cat.dt, func)(*args, **kwargs) + exp = getattr(ser.dt, func)(*args, **kwargs) + + tm.assert_equal(res, exp) + + for attr in attr_names: + if attr in ["week", "weekofyear"]: + # GH#33595 Deprecate week and weekofyear + continue + res = getattr(cat.dt, attr) + exp = getattr(ser.dt, attr) + + tm.assert_equal(res, exp) + def test_dt_accessor_api_for_categorical_invalid(self): invalid = Series([1, 2, 3]).astype("category") msg = "Can only use .dt accessor with datetimelike" diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py index 23db91e25125da..e8034bd4f71602 100644 --- a/pandas/tests/series/indexing/test_get.py +++ b/pandas/tests/series/indexing/test_get.py @@ -158,8 +158,7 @@ def test_get_with_default(): "arr", [np.random.randn(10), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")], ) -def test_get2(arr): - # TODO: better name, possibly split +def test_get_with_ea(arr): # GH#21260 ser = Series(arr, index=[2 * i for i in range(len(arr))]) assert ser.get(4) == ser.iloc[2] diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 8d0fba478bf661..31c21e123a0dec 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -262,48 +262,6 @@ def test_preserve_refs(datetime_series): assert not np.isnan(datetime_series[10]) -def test_cast_on_putmask(): - # GH 2746 - - # need to upcast - s = Series([1, 2], index=[1, 2], dtype="int64") - s[[True, False]] = Series([0], index=[1], dtype="int64") - expected = Series([0, 2], index=[1, 2], dtype="int64") - - tm.assert_series_equal(s, expected) - - -def test_type_promote_putmask(): - # GH8387: test that changing types does not break alignment - ts = Series(np.random.randn(100), index=np.arange(100, 0, -1)).round(5) - left, mask = ts.copy(), ts > 0 - right = 
ts[mask].copy().map(str) - left[mask] = right - tm.assert_series_equal(left, ts.map(lambda t: str(t) if t > 0 else t)) - - -def test_setitem_mask_promote_strs(): - - ser = Series([0, 1, 2, 0]) - mask = ser > 0 - ser2 = ser[mask].map(str) - ser[mask] = ser2 - - expected = Series([0, "1", "2", 0]) - tm.assert_series_equal(ser, expected) - - -def test_setitem_mask_promote(): - - ser = Series([0, "foo", "bar", 0]) - mask = Series([False, True, True, False]) - ser2 = ser[mask] - ser[mask] = ser2 - - expected = Series([0, "foo", "bar", 0]) - tm.assert_series_equal(ser, expected) - - def test_multilevel_preserve_name(lexsorted_two_level_string_multiindex, indexer_sl): index = lexsorted_two_level_string_multiindex ser = Series(np.random.randn(len(index)), index=index, name="sth") @@ -317,13 +275,6 @@ def test_multilevel_preserve_name(lexsorted_two_level_string_multiindex, indexer """ -def test_slice_with_zero_step_raises(index, frame_or_series, indexer_sli): - ts = frame_or_series(np.arange(len(index)), index=index) - - with pytest.raises(ValueError, match="slice step cannot be zero"): - indexer_sli(ts)[::0] - - @pytest.mark.parametrize( "index", [ diff --git a/pandas/tests/series/indexing/test_mask.py b/pandas/tests/series/indexing/test_mask.py index dc4fb530dbb527..28235a8918e3f3 100644 --- a/pandas/tests/series/indexing/test_mask.py +++ b/pandas/tests/series/indexing/test_mask.py @@ -37,15 +37,19 @@ def test_mask(): with pytest.raises(ValueError, match=msg): s.mask(cond[:3].values, -s) + +def test_mask_casts(): # dtype changes - s = Series([1, 2, 3, 4]) - result = s.mask(s > 2, np.nan) + ser = Series([1, 2, 3, 4]) + result = ser.mask(ser > 2, np.nan) expected = Series([1, 2, np.nan, np.nan]) tm.assert_series_equal(result, expected) + +def test_mask_casts2(): # see gh-21891 - s = Series([1, 2]) - res = s.mask([True, False]) + ser = Series([1, 2]) + res = ser.mask([True, False]) exp = Series([np.nan, 2]) tm.assert_series_equal(res, exp) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 265b9e1395b238..3e8e1b3f436ec7 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -107,7 +107,7 @@ def test_setitem_with_tz(self, tz, indexer_sli): tm.assert_series_equal(ser, exp) def test_setitem_with_tz_dst(self, indexer_sli): - # GH XXX TODO: fill in GH ref + # GH#14146 trouble setting values near DST boundary tz = "US/Eastern" orig = Series(date_range("2016-11-06", freq="H", periods=3, tz=tz)) assert orig.dtype == f"datetime64[ns, {tz}]" @@ -234,6 +234,43 @@ def test_setitem_multiindex_slice(self, indexer_sli): class TestSetitemBooleanMask: + def test_setitem_mask_cast(self): + # GH#2746 + # need to upcast + ser = Series([1, 2], index=[1, 2], dtype="int64") + ser[[True, False]] = Series([0], index=[1], dtype="int64") + expected = Series([0, 2], index=[1, 2], dtype="int64") + + tm.assert_series_equal(ser, expected) + + def test_setitem_mask_align_and_promote(self): + # GH#8387: test that changing types does not break alignment + ts = Series(np.random.randn(100), index=np.arange(100, 0, -1)).round(5) + mask = ts > 0 + left = ts.copy() + right = ts[mask].copy().map(str) + left[mask] = right + expected = ts.map(lambda t: str(t) if t > 0 else t) + tm.assert_series_equal(left, expected) + + def test_setitem_mask_promote_strs(self): + ser = Series([0, 1, 2, 0]) + mask = ser > 0 + ser2 = ser[mask].map(str) + ser[mask] = ser2 + + expected = Series([0, "1", "2", 0]) + tm.assert_series_equal(ser, 
expected) + + def test_setitem_mask_promote(self): + ser = Series([0, "foo", "bar", 0]) + mask = Series([False, True, True, False]) + ser2 = ser[mask] + ser[mask] = ser2 + + expected = Series([0, "foo", "bar", 0]) + tm.assert_series_equal(ser, expected) + def test_setitem_boolean(self, string_series): mask = string_series > string_series.median() @@ -658,11 +695,6 @@ def test_index_where(self, obj, key, expected, val, request): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if obj.dtype == bool: - msg = "Index/Series casting behavior inconsistent GH#38692" - mark = pytest.mark.xfail(reason=msg) - request.node.add_marker(mark) - res = Index(obj).where(~mask, val) tm.assert_index_equal(res, Index(expected)) diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index 8ddcf07934e216..7ede868861ac0b 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -2,7 +2,6 @@ import pytest from pandas._libs.tslibs import IncompatibleFrequency -from pandas.compat import np_datetime64_compat from pandas import ( DatetimeIndex, @@ -28,7 +27,7 @@ def test_asof_nanosecond_index_access(self): # handle nanoseconds assert first_value == ser["2013-01-01 00:00:00.000000050+0000"] - expected_ts = np_datetime64_compat("2013-01-01 00:00:00.000000050+0000", "ns") + expected_ts = np.datetime64("2013-01-01 00:00:00.000000050", "ns") assert first_value == ser[Timestamp(expected_ts)] def test_basic(self): diff --git a/pandas/tests/series/methods/test_convert.py b/pandas/tests/series/methods/test_convert.py index 346f74d798de9d..4832780e6d0d36 100644 --- a/pandas/tests/series/methods/test_convert.py +++ b/pandas/tests/series/methods/test_convert.py @@ -32,6 +32,7 @@ def test_convert(self): results = ser._convert(timedelta=True) tm.assert_series_equal(results, ser) + def test_convert_numeric_strings_with_other_true_args(self): # test pass-through and non-conversion when other types selected ser = Series(["1.0", "2.0", "3.0"]) results = ser._convert(datetime=True, numeric=True, timedelta=True) @@ -40,6 +41,7 @@ def test_convert(self): results = ser._convert(True, False, True) tm.assert_series_equal(results, ser) + def test_convert_datetime_objects(self): ser = Series( [datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)], dtype="O" ) @@ -49,6 +51,27 @@ def test_convert(self): results = ser._convert(datetime=False, numeric=True, timedelta=True) tm.assert_series_equal(results, ser) + def test_convert_datetime64(self): + # no-op if already dt64 dtype + ser = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + ] + ) + + result = ser._convert(datetime=True) + expected = Series( + [Timestamp("20010101"), Timestamp("20010102"), Timestamp("20010103")], + dtype="M8[ns]", + ) + tm.assert_series_equal(result, expected) + + result = ser._convert(datetime=True) + tm.assert_series_equal(result, expected) + + def test_convert_timedeltas(self): td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0) ser = Series([td, td], dtype="O") results = ser._convert(datetime=True, numeric=True, timedelta=True) @@ -57,6 +80,7 @@ def test_convert(self): results = ser._convert(True, True, False) tm.assert_series_equal(results, ser) + def test_convert_numeric_strings(self): ser = Series([1.0, 2, 3], index=["a", "b", "c"]) result = ser._convert(numeric=True) tm.assert_series_equal(result, ser) @@ -79,6 +103,7 @@ def test_convert(self): expected["a"] = np.nan tm.assert_series_equal(result, 
expected) + def test_convert_mixed_type_noop(self): # GH 4119, not converting a mixed type (e.g.floats and object) ser = Series([1, "na", 3, 4]) result = ser._convert(datetime=True, numeric=True) @@ -89,25 +114,7 @@ def test_convert(self): result = ser._convert(datetime=True, numeric=True) tm.assert_series_equal(result, expected) - # dates - ser = Series( - [ - datetime(2001, 1, 1, 0, 0), - datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0), - ] - ) - - result = ser._convert(datetime=True) - expected = Series( - [Timestamp("20010101"), Timestamp("20010102"), Timestamp("20010103")], - dtype="M8[ns]", - ) - tm.assert_series_equal(result, expected) - - result = ser._convert(datetime=True) - tm.assert_series_equal(result, expected) - + def test_convert_preserve_non_object(self): # preserve if non-object ser = Series([1], dtype="float32") result = ser._convert(datetime=True) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 088e10b0ba0704..43b210a50dab25 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -33,9 +33,9 @@ class TestSeriesRank: "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), } + @td.skip_if_no_scipy def test_rank(self, datetime_series): - pytest.importorskip("scipy.stats.special") - rankdata = pytest.importorskip("scipy.stats.rankdata") + from scipy.stats import rankdata datetime_series[::2] = np.nan datetime_series[:10][::3] = 4.0 @@ -280,9 +280,9 @@ def test_rank_desc_mix_nans_infs(self): exp = Series([3, np.nan, 1, 4, 2], dtype="float64") tm.assert_series_equal(result, exp) + @td.skip_if_no_scipy def test_rank_methods_series(self): - pytest.importorskip("scipy.stats.special") - rankdata = pytest.importorskip("scipy.stats.rankdata") + from scipy.stats import rankdata xs = np.random.randn(9) xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 28a0df99bb2b69..b7d6c498d1e0bb 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -5,10 +5,11 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import IntervalArray class TestSeriesReplace: - def test_replace(self, datetime_series): + def test_replace(self): N = 100 ser = pd.Series(np.random.randn(N)) ser[0:4] = np.nan @@ -58,6 +59,7 @@ def test_replace(self, datetime_series): assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() + def test_replace_nan_with_inf(self): ser = pd.Series([np.nan, 0, np.inf]) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) @@ -67,6 +69,7 @@ def test_replace(self, datetime_series): filled[4] = 0 tm.assert_series_equal(ser.replace(np.inf, 0), filled) + def test_replace_listlike_value_listlike_target(self, datetime_series): ser = pd.Series(datetime_series.index) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) @@ -146,20 +149,21 @@ def test_replace_with_single_list(self): tm.assert_series_equal(s, ser) def test_replace_mixed_types(self): - s = pd.Series(np.arange(5), dtype="int64") + ser = pd.Series(np.arange(5), dtype="int64") def check_replace(to_rep, val, expected): - sc = s.copy() - r = s.replace(to_rep, val) + sc = ser.copy() + result = ser.replace(to_rep, val) return_value = sc.replace(to_rep, val, inplace=True) assert return_value is None - tm.assert_series_equal(expected, r) + tm.assert_series_equal(expected, result) 
tm.assert_series_equal(expected, sc) - # MUST upcast to float - e = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) + # 3.0 can still be held in our int64 series, so we do not upcast GH#44940 tr, v = [3], [3.0] - check_replace(tr, v, e) + check_replace(tr, v, ser) + # Note this matches what we get with the scalars 3 and 3.0 + check_replace(tr[0], v[0], ser) # MUST upcast to float e = pd.Series([0, 1, 2, 3.5, 4]) @@ -255,10 +259,10 @@ def test_replace2(self): assert (ser[20:30] == -1).all() def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype): - # GH 32621 - s = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) - expected = pd.Series(["1", "2", np.nan]) - result = s.replace({"one": "1", "two": "2"}) + # GH 32621, GH#44940 + ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) + expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype) + result = ser.replace({"one": "1", "two": "2"}) tm.assert_series_equal(expected, result) def test_replace_with_empty_dictlike(self): @@ -303,17 +307,18 @@ def test_replace_mixed_types_with_string(self): "categorical, numeric", [ (pd.Categorical(["A"], categories=["A", "B"]), [1]), - (pd.Categorical(("A",), categories=["A", "B"]), [1]), - (pd.Categorical(("A", "B"), categories=["A", "B"]), [1, 2]), + (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), ], ) def test_replace_categorical(self, categorical, numeric): - # GH 24971 - # Do not check if dtypes are equal due to a known issue that - # Categorical.replace sometimes coerces to object (GH 23305) - s = pd.Series(categorical) - result = s.replace({"A": 1, "B": 2}) - expected = pd.Series(numeric) + # GH 24971, GH#23305 + ser = pd.Series(categorical) + result = ser.replace({"A": 1, "B": 2}) + expected = pd.Series(numeric).astype("category") + if 2 not in expected.cat.categories: + # i.e. 
categories should be [1, 2] even if there are no "B"s present + # GH#44940 + expected = expected.cat.add_categories(2) tm.assert_series_equal(expected, result) def test_replace_categorical_single(self): @@ -512,3 +517,90 @@ def test_pandas_replace_na(self): result = ser.replace(regex_mapping, regex=True) exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize( + "dtype, input_data, to_replace, expected_data", + [ + ("bool", [True, False], {True: False}, [False, False]), + ("int64", [1, 2], {1: 10, 2: 20}, [10, 20]), + ("Int64", [1, 2], {1: 10, 2: 20}, [10, 20]), + ("float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), + ("Float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), + ("string", ["one", "two"], {"one": "1", "two": "2"}, ["1", "2"]), + ( + pd.IntervalDtype("int64"), + IntervalArray([pd.Interval(1, 2), pd.Interval(2, 3)]), + {pd.Interval(1, 2): pd.Interval(10, 20)}, + IntervalArray([pd.Interval(10, 20), pd.Interval(2, 3)]), + ), + ( + pd.IntervalDtype("float64"), + IntervalArray([pd.Interval(1.0, 2.7), pd.Interval(2.8, 3.1)]), + {pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)}, + IntervalArray([pd.Interval(10.6, 20.8), pd.Interval(2.8, 3.1)]), + ), + ( + pd.PeriodDtype("M"), + [pd.Period("2020-05", freq="M")], + {pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M")}, + [pd.Period("2020-06", freq="M")], + ), + ], + ) + def test_replace_dtype(self, dtype, input_data, to_replace, expected_data): + # GH#33484 + ser = pd.Series(input_data, dtype=dtype) + result = ser.replace(to_replace) + expected = pd.Series(expected_data, dtype=dtype) + tm.assert_series_equal(result, expected) + + def test_replace_string_dtype(self): + # GH#40732, GH#44940 + ser = pd.Series(["one", "two", np.nan], dtype="string") + res = ser.replace({"one": "1", "two": "2"}) + expected = pd.Series(["1", "2", np.nan], dtype="string") + tm.assert_series_equal(res, expected) + + # GH#31644 + ser2 = pd.Series(["A", np.nan], dtype="string") + res2 = ser2.replace("A", "B") + expected2 = pd.Series(["B", np.nan], dtype="string") + tm.assert_series_equal(res2, expected2) + + ser3 = pd.Series(["A", "B"], dtype="string") + res3 = ser3.replace("A", pd.NA) + expected3 = pd.Series([pd.NA, "B"], dtype="string") + tm.assert_series_equal(res3, expected3) + + def test_replace_string_dtype_list_to_replace(self): + # GH#41215, GH#44940 + ser = pd.Series(["abc", "def"], dtype="string") + res = ser.replace(["abc", "any other string"], "xyz") + expected = pd.Series(["xyz", "def"], dtype="string") + tm.assert_series_equal(res, expected) + + def test_replace_string_dtype_regex(self): + # GH#31644 + ser = pd.Series(["A", "B"], dtype="string") + res = ser.replace(r".", "C", regex=True) + expected = pd.Series(["C", "C"], dtype="string") + tm.assert_series_equal(res, expected) + + def test_replace_nullable_numeric(self): + # GH#40732, GH#44940 + + floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype()) + assert floats.replace({1.0: 9}).dtype == floats.dtype + assert floats.replace(1.0, 9).dtype == floats.dtype + assert floats.replace({1.0: 9.0}).dtype == floats.dtype + assert floats.replace(1.0, 9.0).dtype == floats.dtype + + res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0]) + assert res.dtype == floats.dtype + + ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype()) + assert ints.replace({1: 9}).dtype == ints.dtype + assert ints.replace(1, 9).dtype == ints.dtype + assert ints.replace({1: 9.0}).dtype == 
ints.dtype + assert ints.replace(1, 9.0).dtype == ints.dtype + # FIXME: ints.replace({1: 9.5}) raises bc of incorrect _can_hold_element diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py deleted file mode 100644 index 4fb378720d89d9..00000000000000 --- a/pandas/tests/series/methods/test_shift.py +++ /dev/null @@ -1,378 +0,0 @@ -import numpy as np -import pytest - -from pandas.errors import NullFrequencyError - -import pandas as pd -from pandas import ( - DatetimeIndex, - Index, - NaT, - Series, - TimedeltaIndex, - date_range, - offsets, -) -import pandas._testing as tm - -from pandas.tseries.offsets import BDay - - -class TestShift: - @pytest.mark.parametrize( - "ser", - [ - Series([np.arange(5)]), - date_range("1/1/2011", periods=24, freq="H"), - Series(range(5), index=date_range("2017", periods=5)), - ], - ) - @pytest.mark.parametrize("shift_size", [0, 1, 2]) - def test_shift_always_copy(self, ser, shift_size): - # GH22397 - assert ser.shift(shift_size) is not ser - - @pytest.mark.parametrize("move_by_freq", [pd.Timedelta("1D"), pd.Timedelta("1min")]) - def test_datetime_shift_always_copy(self, move_by_freq): - # GH#22397 - ser = Series(range(5), index=date_range("2017", periods=5)) - assert ser.shift(freq=move_by_freq) is not ser - - def test_shift(self, datetime_series): - shifted = datetime_series.shift(1) - unshifted = shifted.shift(-1) - - tm.assert_index_equal(shifted.index, datetime_series.index) - tm.assert_index_equal(unshifted.index, datetime_series.index) - tm.assert_numpy_array_equal( - unshifted.dropna().values, datetime_series.values[:-1] - ) - - offset = BDay() - shifted = datetime_series.shift(1, freq=offset) - unshifted = shifted.shift(-1, freq=offset) - - tm.assert_series_equal(unshifted, datetime_series) - - unshifted = datetime_series.shift(0, freq=offset) - tm.assert_series_equal(unshifted, datetime_series) - - shifted = datetime_series.shift(1, freq="B") - unshifted = shifted.shift(-1, freq="B") - - tm.assert_series_equal(unshifted, datetime_series) - - # corner case - unshifted = datetime_series.shift(0) - tm.assert_series_equal(unshifted, datetime_series) - - # Shifting with PeriodIndex - ps = tm.makePeriodSeries() - shifted = ps.shift(1) - unshifted = shifted.shift(-1) - tm.assert_index_equal(shifted.index, ps.index) - tm.assert_index_equal(unshifted.index, ps.index) - tm.assert_numpy_array_equal(unshifted.dropna().values, ps.values[:-1]) - - shifted2 = ps.shift(1, "B") - shifted3 = ps.shift(1, BDay()) - tm.assert_series_equal(shifted2, shifted3) - tm.assert_series_equal(ps, shifted2.shift(-1, "B")) - - msg = "Given freq D does not match PeriodIndex freq B" - with pytest.raises(ValueError, match=msg): - ps.shift(freq="D") - - # legacy support - shifted4 = ps.shift(1, freq="B") - tm.assert_series_equal(shifted2, shifted4) - - shifted5 = ps.shift(1, freq=BDay()) - tm.assert_series_equal(shifted5, shifted4) - - # 32-bit taking - # GH#8129 - index = date_range("2000-01-01", periods=5) - for dtype in ["int32", "int64"]: - s1 = Series(np.arange(5, dtype=dtype), index=index) - p = s1.iloc[1] - result = s1.shift(periods=p) - expected = Series([np.nan, 0, 1, 2, 3], index=index) - tm.assert_series_equal(result, expected) - - # GH#8260 - # with tz - s = Series( - date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" - ) - result = s - s.shift() - - exp = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") - tm.assert_series_equal(result, exp) - - # incompat tz - s2 = Series(date_range("2000-01-01 
09:00:00", periods=5, tz="CET"), name="foo") - msg = "DatetimeArray subtraction must have the same timezones or no timezones" - with pytest.raises(TypeError, match=msg): - s - s2 - - def test_shift2(self): - ts = Series( - np.random.randn(5), index=date_range("1/1/2000", periods=5, freq="H") - ) - - result = ts.shift(1, freq="5T") - exp_index = ts.index.shift(1, freq="5T") - tm.assert_index_equal(result.index, exp_index) - - # GH#1063, multiple of same base - result = ts.shift(1, freq="4H") - exp_index = ts.index + offsets.Hour(4) - tm.assert_index_equal(result.index, exp_index) - - idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) - msg = "Cannot shift with no freq" - with pytest.raises(NullFrequencyError, match=msg): - idx.shift(1) - - def test_shift_fill_value(self): - # GH#24128 - ts = Series( - [1.0, 2.0, 3.0, 4.0, 5.0], index=date_range("1/1/2000", periods=5, freq="H") - ) - - exp = Series( - [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("1/1/2000", periods=5, freq="H") - ) - # check that fill value works - result = ts.shift(1, fill_value=0.0) - tm.assert_series_equal(result, exp) - - exp = Series( - [0.0, 0.0, 1.0, 2.0, 3.0], index=date_range("1/1/2000", periods=5, freq="H") - ) - result = ts.shift(2, fill_value=0.0) - tm.assert_series_equal(result, exp) - - ts = Series([1, 2, 3]) - res = ts.shift(2, fill_value=0) - assert res.dtype == ts.dtype - - def test_shift_categorical_fill_value(self): - ts = Series(["a", "b", "c", "d"], dtype="category") - res = ts.shift(1, fill_value="a") - expected = Series( - pd.Categorical( - ["a", "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False - ) - ) - tm.assert_equal(res, expected) - - # check for incorrect fill_value - msg = r"Cannot setitem on a Categorical with a new category \(f\)" - with pytest.raises(TypeError, match=msg): - ts.shift(1, fill_value="f") - - def test_shift_dst(self): - # GH#13926 - dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") - s = Series(dates) - - res = s.shift(0) - tm.assert_series_equal(res, s) - assert res.dtype == "datetime64[ns, US/Eastern]" - - res = s.shift(1) - exp_vals = [NaT] + dates.astype(object).values.tolist()[:9] - exp = Series(exp_vals) - tm.assert_series_equal(res, exp) - assert res.dtype == "datetime64[ns, US/Eastern]" - - res = s.shift(-2) - exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT] - exp = Series(exp_vals) - tm.assert_series_equal(res, exp) - assert res.dtype == "datetime64[ns, US/Eastern]" - - for ex in [10, -10, 20, -20]: - res = s.shift(ex) - exp = Series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") - tm.assert_series_equal(res, exp) - assert res.dtype == "datetime64[ns, US/Eastern]" - - @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") - def test_tshift(self, datetime_series): - # TODO(2.0): remove this test when tshift deprecation is enforced - - # PeriodIndex - ps = tm.makePeriodSeries() - shifted = ps.tshift(1) - unshifted = shifted.tshift(-1) - - tm.assert_series_equal(unshifted, ps) - - shifted2 = ps.tshift(freq="B") - tm.assert_series_equal(shifted, shifted2) - - shifted3 = ps.tshift(freq=BDay()) - tm.assert_series_equal(shifted, shifted3) - - msg = "Given freq M does not match PeriodIndex freq B" - with pytest.raises(ValueError, match=msg): - ps.tshift(freq="M") - - # DatetimeIndex - shifted = datetime_series.tshift(1) - unshifted = shifted.tshift(-1) - - tm.assert_series_equal(datetime_series, unshifted) - - shifted2 = datetime_series.tshift(freq=datetime_series.index.freq) - 
tm.assert_series_equal(shifted, shifted2) - - inferred_ts = Series( - datetime_series.values, Index(np.asarray(datetime_series.index)), name="ts" - ) - shifted = inferred_ts.tshift(1) - expected = datetime_series.tshift(1) - expected.index = expected.index._with_freq(None) - tm.assert_series_equal(shifted, expected) - - unshifted = shifted.tshift(-1) - tm.assert_series_equal(unshifted, inferred_ts) - - no_freq = datetime_series[[0, 5, 7]] - msg = "Freq was not set in the index hence cannot be inferred" - with pytest.raises(ValueError, match=msg): - no_freq.tshift() - - def test_tshift_deprecated(self, datetime_series): - # GH#11631 - with tm.assert_produces_warning(FutureWarning): - datetime_series.tshift() - - def test_period_index_series_shift_with_freq(self): - ps = tm.makePeriodSeries() - - shifted = ps.shift(1, freq="infer") - unshifted = shifted.shift(-1, freq="infer") - tm.assert_series_equal(unshifted, ps) - - shifted2 = ps.shift(freq="B") - tm.assert_series_equal(shifted, shifted2) - - shifted3 = ps.shift(freq=BDay()) - tm.assert_series_equal(shifted, shifted3) - - def test_datetime_series_shift_with_freq(self, datetime_series): - shifted = datetime_series.shift(1, freq="infer") - unshifted = shifted.shift(-1, freq="infer") - tm.assert_series_equal(datetime_series, unshifted) - - shifted2 = datetime_series.shift(freq=datetime_series.index.freq) - tm.assert_series_equal(shifted, shifted2) - - inferred_ts = Series( - datetime_series.values, Index(np.asarray(datetime_series.index)), name="ts" - ) - shifted = inferred_ts.shift(1, freq="infer") - expected = datetime_series.shift(1, freq="infer") - expected.index = expected.index._with_freq(None) - tm.assert_series_equal(shifted, expected) - - unshifted = shifted.shift(-1, freq="infer") - tm.assert_series_equal(unshifted, inferred_ts) - - def test_period_index_series_shift_with_freq_error(self): - ps = tm.makePeriodSeries() - msg = "Given freq M does not match PeriodIndex freq B" - with pytest.raises(ValueError, match=msg): - ps.shift(freq="M") - - def test_datetime_series_shift_with_freq_error(self, datetime_series): - no_freq = datetime_series[[0, 5, 7]] - msg = "Freq was not set in the index hence cannot be inferred" - with pytest.raises(ValueError, match=msg): - no_freq.shift(freq="infer") - - def test_shift_int(self, datetime_series): - ts = datetime_series.astype(int) - shifted = ts.shift(1) - expected = ts.astype(float).shift(1) - tm.assert_series_equal(shifted, expected) - - def test_shift_object_non_scalar_fill(self): - # shift requires scalar fill_value except for object dtype - ser = Series(range(3)) - with pytest.raises(ValueError, match="fill_value must be a scalar"): - ser.shift(1, fill_value=[]) - - df = ser.to_frame() - with pytest.raises(ValueError, match="fill_value must be a scalar"): - df.shift(1, fill_value=np.arange(3)) - - obj_ser = ser.astype(object) - result = obj_ser.shift(1, fill_value={}) - assert result[0] == {} - - obj_df = obj_ser.to_frame() - result = obj_df.shift(1, fill_value={}) - assert result.iloc[0, 0] == {} - - def test_shift_categorical(self): - # GH#9416 - s = Series(["a", "b", "c", "d"], dtype="category") - - tm.assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna()) - - sp1 = s.shift(1) - tm.assert_index_equal(s.index, sp1.index) - assert np.all(sp1.values.codes[:1] == -1) - assert np.all(s.values.codes[:-1] == sp1.values.codes[1:]) - - sn2 = s.shift(-2) - tm.assert_index_equal(s.index, sn2.index) - assert np.all(sn2.values.codes[-2:] == -1) - assert np.all(s.values.codes[2:] == 
sn2.values.codes[:-2]) - - tm.assert_index_equal(s.values.categories, sp1.values.categories) - tm.assert_index_equal(s.values.categories, sn2.values.categories) - - def test_shift_dt64values_int_fill_deprecated(self): - # GH#31971 - ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) - - with tm.assert_produces_warning(FutureWarning): - result = ser.shift(1, fill_value=0) - - expected = Series([pd.Timestamp(0), ser[0]]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("periods", [1, 2, 3, 4]) - def test_shift_preserve_freqstr(self, periods): - # GH#21275 - ser = Series( - range(periods), - index=date_range("2016-1-1 00:00:00", periods=periods, freq="H"), - ) - - result = ser.shift(1, "2H") - - expected = Series( - range(periods), - index=date_range("2016-1-1 02:00:00", periods=periods, freq="H"), - ) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "input_data, output_data", - [(np.empty(shape=(0,)), []), (np.ones(shape=(2,)), [np.nan, 1.0])], - ) - def test_shift_non_writable_array(self, input_data, output_data): - # GH21049 Verify whether non writable numpy array is shiftable - input_data.setflags(write=False) - - result = Series(input_data).shift(1) - expected = Series(output_data, dtype="float64") - - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index f4d7d41dbee04e..23c432b2d10bff 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -177,9 +177,6 @@ def test_multiple_output_binary_ufuncs( # Test that # the same conditions from binary_ufunc_scalar apply to # ufuncs with multiple outputs. - if sparse and ufunc is np.divmod: - mark = pytest.mark.xfail(reason="sparse divmod not implemented") - request.node.add_marker(mark) a1, a2 = arrays_for_binary_ufunc # work around https://github.com/pandas-dev/pandas/issues/26987 diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index f390cbf492202b..067bcf5969587d 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -919,7 +919,7 @@ def test_flags_kwarg(any_string_dtype): result = data.str.count(pat, flags=re.IGNORECASE) assert result[0] == 1 - msg = "This pattern has match groups" + msg = "has match groups" with tm.assert_produces_warning(UserWarning, match=msg): result = data.str.contains(pat, flags=re.IGNORECASE) assert result[0] diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 01a397938db52f..74458c13e8df78 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -14,21 +14,28 @@ ) -def test_split(any_string_dtype): +@pytest.mark.parametrize("method", ["split", "rsplit"]) +def test_split(any_string_dtype, method): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) - result = values.str.split("_") + result = getattr(values.str, method)("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) + +@pytest.mark.parametrize("method", ["split", "rsplit"]) +def test_split_more_than_one_char(any_string_dtype, method): # more than one char values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) - result = values.str.split("__") + result = getattr(values.str, method)("__") + exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) 
tm.assert_series_equal(result, exp) - result = values.str.split("__", expand=False) + result = getattr(values.str, method)("__", expand=False) tm.assert_series_equal(result, exp) + +def test_split_more_regex_split(any_string_dtype): # regex split values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.split("[,_]") @@ -44,6 +51,8 @@ def test_split_regex(any_string_dtype): exp = Series([["xxxjpgzzz", ""]]) tm.assert_series_equal(result, exp) + +def test_split_regex_explicit(any_string_dtype): # explicit regex = True split with compiled regex regex_pat = re.compile(r".jpg") values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype) @@ -74,9 +83,11 @@ def test_split_regex(any_string_dtype): values.str.split(regex_pat, regex=False) -def test_split_object_mixed(): +@pytest.mark.parametrize("expand", [None, False]) +@pytest.mark.parametrize("method", ["split", "rsplit"]) +def test_split_object_mixed(expand, method): mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) - result = mixed.str.split("_") + result = getattr(mixed.str, method)("_", expand=expand) exp = Series( [ ["a", "b", "c"], @@ -92,43 +103,26 @@ def test_split_object_mixed(): assert isinstance(result, Series) tm.assert_almost_equal(result, exp) - result = mixed.str.split("_", expand=False) - assert isinstance(result, Series) - tm.assert_almost_equal(result, exp) - @pytest.mark.parametrize("method", ["split", "rsplit"]) -def test_split_n(any_string_dtype, method): +@pytest.mark.parametrize("n", [None, 0]) +def test_split_n(any_string_dtype, method, n): s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype) expected = Series([["a", "b"], pd.NA, ["b", "c"]]) - result = getattr(s.str, method)(" ", n=None) - tm.assert_series_equal(result, expected) - - result = getattr(s.str, method)(" ", n=0) + result = getattr(s.str, method)(" ", n=n) tm.assert_series_equal(result, expected) def test_rsplit(any_string_dtype): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) - result = values.str.rsplit("_") - exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - tm.assert_series_equal(result, exp) - - # more than one char - values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) - result = values.str.rsplit("__") - tm.assert_series_equal(result, exp) - - result = values.str.rsplit("__", expand=False) - tm.assert_series_equal(result, exp) - # regex split is not supported by rsplit values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.rsplit("[,_]") exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) tm.assert_series_equal(result, exp) + +def test_rsplit_max_number(any_string_dtype): # setting max number of splits, make sure it's from reverse values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.rsplit("_", n=1) @@ -136,30 +130,6 @@ def test_rsplit(any_string_dtype): tm.assert_series_equal(result, exp) -def test_rsplit_object_mixed(): - # mixed - mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) - result = mixed.str.rsplit("_") - exp = Series( - [ - ["a", "b", "c"], - np.nan, - ["d", "e", "f"], - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ] - ) - assert isinstance(result, Series) - tm.assert_almost_equal(result, exp) - - result = mixed.str.rsplit("_", expand=False) - assert isinstance(result, Series) - tm.assert_almost_equal(result, exp) - - def 
test_split_blank_string(any_string_dtype): # expand blank split GH 20067 values = Series([""], name="test", dtype=any_string_dtype) @@ -167,6 +137,8 @@ def test_split_blank_string(any_string_dtype): exp = DataFrame([[]], dtype=any_string_dtype) # NOTE: this is NOT an empty df tm.assert_frame_equal(result, exp) + +def test_split_blank_string_with_non_empty(any_string_dtype): values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype) result = values.str.split(expand=True) exp = DataFrame( @@ -181,14 +153,13 @@ def test_split_blank_string(any_string_dtype): tm.assert_frame_equal(result, exp) -def test_split_noargs(any_string_dtype): +@pytest.mark.parametrize("method", ["split", "rsplit"]) +def test_split_noargs(any_string_dtype, method): # #1859 s = Series(["Wes McKinney", "Travis Oliphant"], dtype=any_string_dtype) - result = s.str.split() + result = getattr(s.str, method)() expected = ["Travis", "Oliphant"] assert result[1] == expected - result = s.str.rsplit() - assert result[1] == expected @pytest.mark.parametrize( @@ -199,17 +170,15 @@ def test_split_noargs(any_string_dtype): (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"), ], ) -def test_split_maxsplit(data, pat, any_string_dtype): +@pytest.mark.parametrize("n", [-1, 0]) +def test_split_maxsplit(data, pat, any_string_dtype, n): # re.split 0, str.split -1 s = Series(data, dtype=any_string_dtype) - result = s.str.split(pat=pat, n=-1) + result = s.str.split(pat=pat, n=n) xp = s.str.split(pat=pat) tm.assert_series_equal(result, xp) - result = s.str.split(pat=pat, n=0) - tm.assert_series_equal(result, xp) - @pytest.mark.parametrize( "data, pat, expected", @@ -232,12 +201,14 @@ def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype): tm.assert_series_equal(expected, result, check_index_type=False) -def test_split_to_dataframe(any_string_dtype): +def test_split_to_dataframe_no_splits(any_string_dtype): s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) result = s.str.split("_", expand=True) exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)}) tm.assert_frame_equal(result, exp) + +def test_split_to_dataframe(any_string_dtype): s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype) result = s.str.split("_", expand=True) exp = DataFrame( @@ -246,6 +217,8 @@ def test_split_to_dataframe(any_string_dtype): ) tm.assert_frame_equal(result, exp) + +def test_split_to_dataframe_unequal_splits(any_string_dtype): s = Series( ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype ) @@ -263,6 +236,8 @@ def test_split_to_dataframe(any_string_dtype): ) tm.assert_frame_equal(result, exp) + +def test_split_to_dataframe_with_index(any_string_dtype): s = Series( ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype ) @@ -278,7 +253,7 @@ def test_split_to_dataframe(any_string_dtype): s.str.split("_", expand="not_a_boolean") -def test_split_to_multiindex_expand(): +def test_split_to_multiindex_expand_no_splits(): # https://github.com/pandas-dev/pandas/issues/23677 idx = Index(["nosplit", "alsonosplit", np.nan]) @@ -287,6 +262,8 @@ def test_split_to_multiindex_expand(): tm.assert_index_equal(result, exp) assert result.nlevels == 1 + +def test_split_to_multiindex_expand(): idx = Index(["some_equal_splits", "with_no_nans", np.nan, None]) result = idx.str.split("_", expand=True) exp = MultiIndex.from_tuples( @@ -300,6 +277,8 @@ def test_split_to_multiindex_expand(): tm.assert_index_equal(result, exp) assert result.nlevels == 3 
+ +def test_split_to_multiindex_expand_unequal_splits(): idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None]) result = idx.str.split("_", expand=True) exp = MultiIndex.from_tuples( @@ -317,12 +296,14 @@ def test_split_to_multiindex_expand(): idx.str.split("_", expand="not_a_boolean") -def test_rsplit_to_dataframe_expand(any_string_dtype): +def test_rsplit_to_dataframe_expand_no_splits(any_string_dtype): s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) result = s.str.rsplit("_", expand=True) exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype) tm.assert_frame_equal(result, exp) + +def test_rsplit_to_dataframe_expand(any_string_dtype): s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype) result = s.str.rsplit("_", expand=True) exp = DataFrame( @@ -344,6 +325,8 @@ def test_rsplit_to_dataframe_expand(any_string_dtype): ) tm.assert_frame_equal(result, exp) + +def test_rsplit_to_dataframe_expand_with_index(any_string_dtype): s = Series( ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype ) @@ -356,19 +339,23 @@ def test_rsplit_to_dataframe_expand(any_string_dtype): tm.assert_frame_equal(result, exp) -def test_rsplit_to_multiindex_expand(): +def test_rsplit_to_multiindex_expand_no_split(): idx = Index(["nosplit", "alsonosplit"]) result = idx.str.rsplit("_", expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 + +def test_rsplit_to_multiindex_expand(): idx = Index(["some_equal_splits", "with_no_nans"]) result = idx.str.rsplit("_", expand=True) exp = MultiIndex.from_tuples([("some", "equal", "splits"), ("with", "no", "nans")]) tm.assert_index_equal(result, exp) assert result.nlevels == 3 + +def test_rsplit_to_multiindex_expand_n(): idx = Index(["some_equal_splits", "with_no_nans"]) result = idx.str.rsplit("_", expand=True, n=1) exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")]) @@ -394,7 +381,7 @@ def test_split_nan_expand(any_string_dtype): assert all(x is pd.NA for x in result.iloc[1]) -def test_split_with_name(any_string_dtype): +def test_split_with_name_series(any_string_dtype): # GH 12617 # should preserve name @@ -407,6 +394,9 @@ def test_split_with_name(any_string_dtype): exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype) tm.assert_frame_equal(res, exp) + +def test_split_with_name_index(): + # GH 12617 idx = Index(["a,b", "c,d"], name="xxx") res = idx.str.split(",") exp = Index([["a", "b"], ["c", "d"]], name="xxx") @@ -419,191 +409,210 @@ def test_split_with_name(any_string_dtype): tm.assert_index_equal(res, exp) -def test_partition_series(any_string_dtype): +@pytest.mark.parametrize( + "method, exp", + [ + [ + "partition", + [ + ("a", "__", "b__c"), + ("c", "__", "d__e"), + np.nan, + ("f", "__", "g__h"), + None, + ], + ], + [ + "rpartition", + [ + ("a__b", "__", "c"), + ("c__d", "__", "e"), + np.nan, + ("f__g", "__", "h"), + None, + ], + ], + ], +) +def test_partition_series_more_than_one_char(method, exp, any_string_dtype): # https://github.com/pandas-dev/pandas/issues/23558 - - s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype) - - result = s.str.partition("_", expand=False) - expected = Series( - [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h"), None] - ) + # more than one char + s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype) + result = getattr(s.str, method)("__", expand=False) + expected = Series(exp) 
tm.assert_series_equal(result, expected) - result = s.str.rpartition("_", expand=False) - expected = Series( - [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h"), None] - ) - tm.assert_series_equal(result, expected) - # more than one char - s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None]) - result = s.str.partition("__", expand=False) - expected = Series( +@pytest.mark.parametrize( + "method, exp", + [ [ - ("a", "__", "b__c"), - ("c", "__", "d__e"), - np.nan, - ("f", "__", "g__h"), - None, + "partition", + [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None], ], - ) - tm.assert_series_equal(result, expected) - - result = s.str.rpartition("__", expand=False) - expected = Series( [ - ("a__b", "__", "c"), - ("c__d", "__", "e"), - np.nan, - ("f__g", "__", "h"), - None, + "rpartition", + [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None], ], - ) - tm.assert_series_equal(result, expected) - + ], +) +def test_partition_series_none(any_string_dtype, method, exp): + # https://github.com/pandas-dev/pandas/issues/23558 # None s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype) - result = s.str.partition(expand=False) - expected = Series( - [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None] - ) + result = getattr(s.str, method)(expand=False) + expected = Series(exp) tm.assert_series_equal(result, expected) - result = s.str.rpartition(expand=False) - expected = Series( - [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None] - ) - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "method, exp", + [ + [ + "partition", + [("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None], + ], + [ + "rpartition", + [("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None], + ], + ], +) +def test_partition_series_not_split(any_string_dtype, method, exp): + # https://github.com/pandas-dev/pandas/issues/23558 # Not split s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype) - result = s.str.partition("_", expand=False) - expected = Series([("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None]) + result = getattr(s.str, method)("_", expand=False) + expected = Series(exp) tm.assert_series_equal(result, expected) - result = s.str.rpartition("_", expand=False) - expected = Series([("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None]) - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "method, exp", + [ + [ + "partition", + [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")], + ], + [ + "rpartition", + [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")], + ], + ], +) +def test_partition_series_unicode(any_string_dtype, method, exp): + # https://github.com/pandas-dev/pandas/issues/23558 # unicode s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) - result = s.str.partition("_", expand=False) - expected = Series([("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")]) + result = getattr(s.str, method)("_", expand=False) + expected = Series(exp) tm.assert_series_equal(result, expected) - result = s.str.rpartition("_", expand=False) - expected = Series([("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")]) - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("method", ["partition", "rpartition"]) +def test_partition_series_stdlib(any_string_dtype, method): + # https://github.com/pandas-dev/pandas/issues/23558 # 
compare to standard lib s = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"], dtype=any_string_dtype) - result = s.str.partition("_", expand=False).tolist() - assert result == [v.partition("_") for v in s] - result = s.str.rpartition("_", expand=False).tolist() - assert result == [v.rpartition("_") for v in s] + result = getattr(s.str, method)("_", expand=False).tolist() + assert result == [getattr(v, method)("_") for v in s] -def test_partition_index(): +@pytest.mark.parametrize( + "method, expand, exp, exp_levels", + [ + [ + "partition", + False, + np.array( + [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None], + dtype=object, + ), + 1, + ], + [ + "rpartition", + False, + np.array( + [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None], + dtype=object, + ), + 1, + ], + ], +) +def test_partition_index(method, expand, exp, exp_levels): # https://github.com/pandas-dev/pandas/issues/23558 values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None]) - result = values.str.partition("_", expand=False) - exp = Index( - np.array( - [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None], - dtype=object, - ) - ) + result = getattr(values.str, method)("_", expand=expand) + exp = Index(exp) tm.assert_index_equal(result, exp) - assert result.nlevels == 1 + assert result.nlevels == exp_levels - result = values.str.rpartition("_", expand=False) - exp = Index( - np.array( - [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None], - dtype=object, - ) - ) - tm.assert_index_equal(result, exp) - assert result.nlevels == 1 - result = values.str.partition("_") - exp = Index( +@pytest.mark.parametrize( + "method, exp", + [ [ - ("a", "_", "b_c"), - ("c", "_", "d_e"), - ("f", "_", "g_h"), - (np.nan, np.nan, np.nan), - (None, None, None), - ] - ) - tm.assert_index_equal(result, exp) - assert isinstance(result, MultiIndex) - assert result.nlevels == 3 - - result = values.str.rpartition("_") - exp = Index( + "partition", + { + 0: ["a", "c", np.nan, "f", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["b_c", "d_e", np.nan, "g_h", None], + }, + ], [ - ("a_b", "_", "c"), - ("c_d", "_", "e"), - ("f_g", "_", "h"), - (np.nan, np.nan, np.nan), - (None, None, None), - ] - ) - tm.assert_index_equal(result, exp) - assert isinstance(result, MultiIndex) - assert result.nlevels == 3 - - -def test_partition_to_dataframe(any_string_dtype): + "rpartition", + { + 0: ["a_b", "c_d", np.nan, "f_g", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["c", "e", np.nan, "h", None], + }, + ], + ], +) +def test_partition_to_dataframe(any_string_dtype, method, exp): # https://github.com/pandas-dev/pandas/issues/23558 s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype) - result = s.str.partition("_") + result = getattr(s.str, method)("_") expected = DataFrame( - { - 0: ["a", "c", np.nan, "f", None], - 1: ["_", "_", np.nan, "_", None], - 2: ["b_c", "d_e", np.nan, "g_h", None], - }, + exp, dtype=any_string_dtype, ) tm.assert_frame_equal(result, expected) - result = s.str.rpartition("_") - expected = DataFrame( - { - 0: ["a_b", "c_d", np.nan, "f_g", None], - 1: ["_", "_", np.nan, "_", None], - 2: ["c", "e", np.nan, "h", None], - }, - dtype=any_string_dtype, - ) - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "method, exp", + [ + [ + "partition", + { + 0: ["a", "c", np.nan, "f", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["b_c", "d_e", np.nan, "g_h", None], + }, + ], + [ + "rpartition", + { + 0: ["a_b", "c_d", np.nan, "f_g", 
None], + 1: ["_", "_", np.nan, "_", None], + 2: ["c", "e", np.nan, "h", None], + }, + ], + ], +) +def test_partition_to_dataframe_from_series(any_string_dtype, method, exp): + # https://github.com/pandas-dev/pandas/issues/23558 s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype) - result = s.str.partition("_", expand=True) - expected = DataFrame( - { - 0: ["a", "c", np.nan, "f", None], - 1: ["_", "_", np.nan, "_", None], - 2: ["b_c", "d_e", np.nan, "g_h", None], - }, - dtype=any_string_dtype, - ) - tm.assert_frame_equal(result, expected) - - result = s.str.rpartition("_", expand=True) + result = getattr(s.str, method)("_", expand=True) expected = DataFrame( - { - 0: ["a_b", "c_d", np.nan, "f_g", None], - 1: ["_", "_", np.nan, "_", None], - 2: ["c", "e", np.nan, "h", None], - }, + exp, dtype=any_string_dtype, ) tm.assert_frame_equal(result, expected) @@ -619,7 +628,11 @@ def test_partition_with_name(any_string_dtype): ) tm.assert_frame_equal(result, expected) + +def test_partition_with_name_expand(any_string_dtype): + # GH 12617 # should preserve name + s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype) result = s.str.partition(",", expand=False) expected = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx") tm.assert_series_equal(result, expected) @@ -632,6 +645,9 @@ def test_partition_index_with_name(): assert result.nlevels == 3 tm.assert_index_equal(result, expected) + +def test_partition_index_with_name_expand_false(): + idx = Index(["a,b", "c,d"], name="xxx") # should preserve name result = idx.str.partition(",", expand=False) expected = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx") @@ -639,16 +655,13 @@ def test_partition_index_with_name(): tm.assert_index_equal(result, expected) -def test_partition_sep_kwarg(any_string_dtype): +@pytest.mark.parametrize("method", ["partition", "rpartition"]) +def test_partition_sep_kwarg(any_string_dtype, method): # GH 22676; depr kwarg "pat" in favor of "sep" s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) - expected = s.str.partition(sep="_") - result = s.str.partition("_") - tm.assert_frame_equal(result, expected) - - expected = s.str.rpartition(sep="_") - result = s.str.rpartition("_") + expected = getattr(s.str, method)(sep="_") + result = getattr(s.str, method)("_") tm.assert_frame_equal(result, expected) @@ -666,30 +679,23 @@ def test_get_mixed_object(): tm.assert_series_equal(result, expected) -def test_get_bounds(): +@pytest.mark.parametrize("idx", [2, -3]) +def test_get_bounds(idx): ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) - - # positive index - result = ser.str.split("_").str.get(2) - expected = Series(["3", "8", np.nan]) - tm.assert_series_equal(result, expected) - - # negative index - result = ser.str.split("_").str.get(-3) + result = ser.str.split("_").str.get(idx) expected = Series(["3", "8", np.nan]) tm.assert_series_equal(result, expected) -def test_get_complex(): +@pytest.mark.parametrize( + "idx, exp", [[2, [3, 3, np.nan, "b"]], [-1, [3, 3, np.nan, np.nan]]] +) +def test_get_complex(idx, exp): # GH 20671, getting value not in dict raising `KeyError` ser = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}]) - result = ser.str.get(1) - expected = Series([2, 2, np.nan, "a"]) - tm.assert_series_equal(result, expected) - - result = ser.str.get(-1) - expected = Series([3, 3, np.nan, np.nan]) + result = ser.str.get(idx) + expected = Series(exp) tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 0de93b479e43e5..90c26a747abddb 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -12,13 +12,16 @@ def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method - if method_name == "decode": - pytest.skip("decode requires bytes.") data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) b = Series(data, dtype=nullable_string_dtype) + if method_name == "decode": + with pytest.raises(TypeError, match="a bytes-like object is required"): + getattr(b.str, method_name)(*args, **kwargs) + return + expected = getattr(a.str, method_name)(*args, **kwargs) result = getattr(b.str, method_name)(*args, **kwargs) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 3e49d6367ffd9a..af6ffcb2a93796 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -137,15 +137,11 @@ def test_repeat_mixed_object(): tm.assert_series_equal(result, expected) -def test_repeat_with_null(any_string_dtype): +@pytest.mark.parametrize("arg, repeat", [[None, 4], ["b", None]]) +def test_repeat_with_null(any_string_dtype, arg, repeat): # GH: 31632 - ser = Series(["a", None], dtype=any_string_dtype) - result = ser.str.repeat([3, 4]) - expected = Series(["aaa", np.nan], dtype=any_string_dtype) - tm.assert_series_equal(result, expected) - - ser = Series(["a", "b"], dtype=any_string_dtype) - result = ser.str.repeat([3, None]) + ser = Series(["a", arg], dtype=any_string_dtype) + result = ser.str.repeat([3, repeat]) expected = Series(["aaa", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -397,27 +393,28 @@ def test_index_not_found_raises(index_or_series, any_string_dtype): obj.str.index("DE") -def test_index_wrong_type_raises(index_or_series, any_string_dtype): +@pytest.mark.parametrize("method", ["index", "rindex"]) +def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): obj = index_or_series([], dtype=any_string_dtype) msg = "expected a string object, not int" with pytest.raises(TypeError, match=msg): - obj.str.index(0) - - with pytest.raises(TypeError, match=msg): - obj.str.rindex(0) + getattr(obj.str, method)(0) -def test_index_missing(any_string_dtype): +@pytest.mark.parametrize( + "method, exp", + [ + ["index", [1, 1, 0]], + ["rindex", [3, 1, 2]], + ], +) +def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" - result = ser.str.index("b") - expected = Series([1, 1, 0, np.nan], dtype=expected_dtype) - tm.assert_series_equal(result, expected) - - result = ser.str.rindex("b") - expected = Series([3, 1, 2, np.nan], dtype=expected_dtype) + result = getattr(ser.str, method)("b") + expected = Series(exp + [np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -488,53 +485,51 @@ def test_slice_replace(start, stop, repl, expected, any_string_dtype): tm.assert_series_equal(result, expected) -def test_strip_lstrip_rstrip(any_string_dtype): +@pytest.mark.parametrize( + "method, exp", + [ + ["strip", ["aa", "bb", np.nan, "cc"]], + ["lstrip", ["aa ", "bb \n", np.nan, "cc "]], + ["rstrip", [" aa", " bb", np.nan, "cc"]], + ], +) +def test_strip_lstrip_rstrip(any_string_dtype, method, exp): ser = Series([" aa ", " bb \n", np.nan, "cc "], 
dtype=any_string_dtype) - result = ser.str.strip() - expected = Series(["aa", "bb", np.nan, "cc"], dtype=any_string_dtype) - tm.assert_series_equal(result, expected) - - result = ser.str.lstrip() - expected = Series(["aa ", "bb \n", np.nan, "cc "], dtype=any_string_dtype) - tm.assert_series_equal(result, expected) - - result = ser.str.rstrip() - expected = Series([" aa", " bb", np.nan, "cc"], dtype=any_string_dtype) + result = getattr(ser.str, method)() + expected = Series(exp, dtype=any_string_dtype) tm.assert_series_equal(result, expected) -def test_strip_lstrip_rstrip_mixed_object(): +@pytest.mark.parametrize( + "method, exp", + [ + ["strip", ["aa", np.nan, "bb"]], + ["lstrip", ["aa ", np.nan, "bb \t\n"]], + ["rstrip", [" aa", np.nan, " bb"]], + ], +) +def test_strip_lstrip_rstrip_mixed_object(method, exp): ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]) - result = ser.str.strip() - expected = Series(["aa", np.nan, "bb", np.nan, np.nan, np.nan, np.nan, np.nan]) - tm.assert_series_equal(result, expected) - - result = ser.str.lstrip() - expected = Series( - ["aa ", np.nan, "bb \t\n", np.nan, np.nan, np.nan, np.nan, np.nan] - ) - tm.assert_series_equal(result, expected) - - result = ser.str.rstrip() - expected = Series([" aa", np.nan, " bb", np.nan, np.nan, np.nan, np.nan, np.nan]) + result = getattr(ser.str, method)() + expected = Series(exp + [np.nan, np.nan, np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) -def test_strip_lstrip_rstrip_args(any_string_dtype): +@pytest.mark.parametrize( + "method, exp", + [ + ["strip", ["ABC", " BNSD", "LDFJH "]], + ["lstrip", ["ABCxx", " BNSD", "LDFJH xx"]], + ["rstrip", ["xxABC", "xx BNSD", "LDFJH "]], + ], +) +def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp): ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype) - result = ser.str.strip("x") - expected = Series(["ABC", " BNSD", "LDFJH "], dtype=any_string_dtype) - tm.assert_series_equal(result, expected) - - result = ser.str.lstrip("x") - expected = Series(["ABCxx", " BNSD", "LDFJH xx"], dtype=any_string_dtype) - tm.assert_series_equal(result, expected) - - result = ser.str.rstrip("x") - expected = Series(["xxABC", "xx BNSD", "LDFJH "], dtype=any_string_dtype) + result = getattr(ser.str, method)("x") + expected = Series(exp, dtype=any_string_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 1345a66673d1c4..fcb50e463d9f9c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -9,7 +9,6 @@ algos as libalgos, hashtable as ht, ) -from pandas.compat import np_array_datetime64_compat import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( @@ -509,7 +508,8 @@ def test_on_index_object(self): def test_dtype_preservation(self, any_numpy_dtype): # GH 15442 if any_numpy_dtype in (tm.BYTES_DTYPES + tm.STRING_DTYPES): - pytest.skip("skip string dtype") + data = [1, 2, 2] + uniques = [1, 2] elif is_integer_dtype(any_numpy_dtype): data = [1, 2, 2] uniques = [1, 2] @@ -533,14 +533,17 @@ def test_dtype_preservation(self, any_numpy_dtype): result = Series(data, dtype=any_numpy_dtype).unique() expected = np.array(uniques, dtype=any_numpy_dtype) + if any_numpy_dtype in tm.STRING_DTYPES: + expected = expected.astype(object) + tm.assert_numpy_array_equal(result, expected) def test_datetime64_dtype_array_returned(self): # GH 9431 - expected = np_array_datetime64_compat( + expected = np.array( [ - 
"2015-01-03T00:00:00.000000000+0000", - "2015-01-01T00:00:00.000000000+0000", + "2015-01-03T00:00:00.000000000", + "2015-01-01T00:00:00.000000000", ], dtype="M8[ns]", ) @@ -1718,9 +1721,9 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): ], ) def test_hashtable_large_sizehint(self, hashtable): - # GH 22729 + # GH#22729 smoketest for not raising when passing a large size_hint size_hint = np.iinfo(np.uint32).max + 1 - tbl = hashtable(size_hint=size_hint) # noqa + hashtable(size_hint=size_hint) def test_unique_label_indices(): diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index f81e3d61c8ba50..0850ba66bbdbd5 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -25,7 +25,9 @@ def fn(x): class somecall: def __call__(self): - return x # noqa + # This shouldn't actually get called below; somecall.__init__ + # should. + raise NotImplementedError assert getname(fn) == "fn" assert getname(lambda_) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 6e386869c15f89..d8afb4ab83dfd8 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -48,11 +48,6 @@ @pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr") class TestExpressions: def setup_method(self, method): - - self.frame = _frame.copy() - self.frame2 = _frame2.copy() - self.mixed = _mixed.copy() - self.mixed2 = _mixed2.copy() self._MIN_ELEMENTS = expr._MIN_ELEMENTS def teardown_method(self, method): @@ -75,50 +70,36 @@ def call_op(df, other, flex: bool, opname: str): result = op(df, other) return result, expected - def run_arithmetic(self, df, other, flex: bool): - expr._MIN_ELEMENTS = 0 - operations = ["add", "sub", "mul", "mod", "truediv", "floordiv"] - for arith in operations: - result, expected = self.call_op(df, other, flex, arith) - - if arith == "truediv": - if expected.ndim == 1: - assert expected.dtype.kind == "f" - else: - assert all(x.kind == "f" for x in expected.dtypes.values) - tm.assert_equal(expected, result) - - def run_binary(self, df, other, flex: bool): - """ - tests solely that the result is the same whether or not numexpr is - enabled. Need to test whether the function does the correct thing - elsewhere. - """ + @pytest.mark.parametrize( + "df", + [ + _integer, + _integer2, + # randint to get a case with zeros + _integer * np.random.randint(0, 2, size=np.shape(_integer)), + _frame, + _frame2, + _mixed, + _mixed2, + ], + ) + @pytest.mark.parametrize("flex", [True, False]) + @pytest.mark.parametrize( + "arith", ["add", "sub", "mul", "mod", "truediv", "floordiv"] + ) + def test_run_arithmetic(self, df, flex, arith): expr._MIN_ELEMENTS = 0 - expr.set_test_mode(True) - operations = ["gt", "lt", "ge", "le", "eq", "ne"] - - for arith in operations: - result, expected = self.call_op(df, other, flex, arith) - - used_numexpr = expr.get_test_result() - assert used_numexpr, "Did not use numexpr as expected." 
- tm.assert_equal(expected, result) + result, expected = self.call_op(df, df, flex, arith) - def run_frame(self, df, other, flex: bool): - self.run_arithmetic(df, other, flex) - - set_option("compute.use_numexpr", False) - binary_comp = other + 1 - set_option("compute.use_numexpr", True) - self.run_binary(df, binary_comp, flex) + if arith == "truediv": + assert all(x.kind == "f" for x in expected.dtypes.values) + tm.assert_equal(expected, result) for i in range(len(df.columns)): - self.run_arithmetic(df.iloc[:, i], other.iloc[:, i], flex) - # FIXME: dont leave commented-out - # series doesn't uses vec_compare instead of numexpr... - # binary_comp = other.iloc[:, i] + 1 - # self.run_binary(df.iloc[:, i], binary_comp, flex) + result, expected = self.call_op(df.iloc[:, i], df.iloc[:, i], flex, arith) + if arith == "truediv": + assert expected.dtype.kind == "f" + tm.assert_equal(expected, result) @pytest.mark.parametrize( "df", @@ -134,8 +115,31 @@ def run_frame(self, df, other, flex: bool): ], ) @pytest.mark.parametrize("flex", [True, False]) - def test_arithmetic(self, df, flex): - self.run_frame(df, df, flex) + def test_run_binary(self, df, flex, comparison_op): + """ + tests solely that the result is the same whether or not numexpr is + enabled. Need to test whether the function does the correct thing + elsewhere. + """ + arith = comparison_op.__name__ + set_option("compute.use_numexpr", False) + other = df.copy() + 1 + set_option("compute.use_numexpr", True) + + expr._MIN_ELEMENTS = 0 + expr.set_test_mode(True) + + result, expected = self.call_op(df, other, flex, arith) + + used_numexpr = expr.get_test_result() + assert used_numexpr, "Did not use numexpr as expected." + tm.assert_equal(expected, result) + + # FIXME: dont leave commented-out + # series doesn't uses vec_compare instead of numexpr... 
+ # for i in range(len(df.columns)): + # binary_comp = other.iloc[:, i] + 1 + # self.run_binary(df.iloc[:, i], binary_comp, flex) def test_invalid(self): array = np.random.randn(1_000_001) @@ -351,11 +355,11 @@ def test_bool_ops_column_name_dtype(self, test_input, expected): def test_frame_series_axis(self, axis, arith): # GH#26736 Dataframe.floordiv(Series, axis=1) fails - df = self.frame + df = _frame if axis == 1: - other = self.frame.iloc[0, :] + other = df.iloc[0, :] else: - other = self.frame.iloc[:, 0] + other = df.iloc[:, 0] expr._MIN_ELEMENTS = 0 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8e80463a9c600e..1926dbbd353722 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -48,31 +48,28 @@ def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data): expected = ymd.groupby(level="month").transform(np.sum).T tm.assert_frame_equal(result, expected) - def test_binops_level(self, multiindex_year_month_day_dataframe_random_data): + @pytest.mark.parametrize("opname", ["sub", "add", "mul", "div"]) + def test_binops_level( + self, opname, multiindex_year_month_day_dataframe_random_data + ): ymd = multiindex_year_month_day_dataframe_random_data - def _check_op(opname): - op = getattr(DataFrame, opname) - with tm.assert_produces_warning(FutureWarning): - month_sums = ymd.sum(level="month") - result = op(ymd, month_sums, level="month") - - broadcasted = ymd.groupby(level="month").transform(np.sum) - expected = op(ymd, broadcasted) - tm.assert_frame_equal(result, expected) - - # Series - op = getattr(Series, opname) - result = op(ymd["A"], month_sums["A"], level="month") - broadcasted = ymd["A"].groupby(level="month").transform(np.sum) - expected = op(ymd["A"], broadcasted) - expected.name = "A" - tm.assert_series_equal(result, expected) - - _check_op("sub") - _check_op("add") - _check_op("mul") - _check_op("div") + op = getattr(DataFrame, opname) + with tm.assert_produces_warning(FutureWarning): + month_sums = ymd.sum(level="month") + result = op(ymd, month_sums, level="month") + + broadcasted = ymd.groupby(level="month").transform(np.sum) + expected = op(ymd, broadcasted) + tm.assert_frame_equal(result, expected) + + # Series + op = getattr(Series, opname) + result = op(ymd["A"], month_sums["A"], level="month") + broadcasted = ymd["A"].groupby(level="month").transform(np.sum) + expected = op(ymd["A"], broadcasted) + expected.name = "A" + tm.assert_series_equal(result, expected) def test_reindex(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data @@ -235,25 +232,25 @@ def aggf(x): tm.assert_frame_equal(leftside, rightside) - def test_std_var_pass_ddof(self): + @pytest.mark.parametrize("meth", ["var", "std"]) + def test_std_var_pass_ddof(self, meth): index = MultiIndex.from_arrays( [np.arange(5).repeat(10), np.tile(np.arange(10), 5)] ) df = DataFrame(np.random.randn(len(index), 5), index=index) - for meth in ["var", "std"]: - ddof = 4 - alt = lambda x: getattr(x, meth)(ddof=ddof) + ddof = 4 + alt = lambda x: getattr(x, meth)(ddof=ddof) - with tm.assert_produces_warning(FutureWarning): - result = getattr(df[0], meth)(level=0, ddof=ddof) - expected = df[0].groupby(level=0).agg(alt) - tm.assert_series_equal(result, expected) + with tm.assert_produces_warning(FutureWarning): + result = getattr(df[0], meth)(level=0, ddof=ddof) + expected = df[0].groupby(level=0).agg(alt) + tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - result = 
getattr(df, meth)(level=0, ddof=ddof) - expected = df.groupby(level=0).agg(alt) - tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(FutureWarning): + result = getattr(df, meth)(level=0, ddof=ddof) + expected = df.groupby(level=0).agg(alt) + tm.assert_frame_equal(result, expected) def test_agg_multiple_levels( self, multiindex_year_month_day_dataframe_random_data, frame_or_series @@ -284,9 +281,6 @@ def test_groupby_multilevel(self, multiindex_year_month_day_dataframe_random_dat result2 = ymd.groupby(level=ymd.index.names[:2]).mean() tm.assert_frame_equal(result, result2) - def test_groupby_multilevel_with_transform(self): - pass - def test_multilevel_consolidate(self): index = MultiIndex.from_tuples( [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")] diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 901f07c6b757d3..ee451d02885817 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -846,7 +846,9 @@ def test_nanvar_ddof(self): # The overestimated variance. tm.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var, rtol=1e-2) - def test_ground_truth(self): + @pytest.mark.parametrize("axis", range(2)) + @pytest.mark.parametrize("ddof", range(3)) + def test_ground_truth(self, axis, ddof): # Test against values that were precomputed with Numpy. samples = np.empty((4, 4)) samples[:3, :3] = np.array( @@ -875,26 +877,22 @@ def test_ground_truth(self): ) # Test nanvar. - for axis in range(2): - for ddof in range(3): - var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof) - tm.assert_almost_equal(var[:3], variance[axis, ddof]) - assert np.isnan(var[3]) + var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof) + tm.assert_almost_equal(var[:3], variance[axis, ddof]) + assert np.isnan(var[3]) # Test nanstd. - for axis in range(2): - for ddof in range(3): - std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof) - tm.assert_almost_equal(std[:3], variance[axis, ddof] ** 0.5) - assert np.isnan(std[3]) + std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof) + tm.assert_almost_equal(std[:3], variance[axis, ddof] ** 0.5) + assert np.isnan(std[3]) - def test_nanstd_roundoff(self): + @pytest.mark.parametrize("ddof", range(3)) + def test_nanstd_roundoff(self, ddof): # Regression test for GH 10242 (test data taken from GH 10489). Ensure # that variance is stable. 
data = Series(766897346 * np.ones(10)) - for ddof in range(3): - result = data.std(ddof=ddof) - assert result == 0.0 + result = data.std(ddof=ddof) + assert result == 0.0 @property def prng(self): @@ -959,12 +957,12 @@ def setup_method(self, method): self.samples = np.sin(np.linspace(0, 1, 200)) self.actual_kurt = -1.2058303433799713 - def test_constant_series(self): + @pytest.mark.parametrize("val", [3075.2, 3075.3, 3075.5]) + def test_constant_series(self, val): # xref GH 11974 - for val in [3075.2, 3075.3, 3075.5]: - data = val * np.ones(300) - kurt = nanops.nankurt(data) - assert kurt == 0.0 + data = val * np.ones(300) + kurt = nanops.nankurt(data) + assert kurt == 0.0 def test_all_finite(self): alpha, beta = 0.3, 0.1 diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 6a39638af9c87c..d32c72b3df9747 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -25,10 +25,24 @@ ) +@pytest.fixture +def left_right(): + low, high, n = -1 << 10, 1 << 10, 1 << 20 + left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")) + left["left"] = left.sum(axis=1) + + # one-2-one match + i = np.random.permutation(len(left)) + right = left.iloc[i].copy() + right.columns = right.columns[:-1].tolist() + ["right"] + right.index = np.arange(len(right)) + right["right"] *= -1 + return left, right + + class TestSorting: @pytest.mark.slow def test_int64_overflow(self): - B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) A = np.arange(2500) df = DataFrame( @@ -67,17 +81,18 @@ def test_int64_overflow(self): assert left[k] == v assert len(left) == len(right) - def test_int64_overflow_moar(self): - + def test_int64_overflow_groupby_large_range(self): # GH9096 values = range(55109) data = DataFrame.from_dict({"a": values, "b": values, "c": values, "d": values}) grouped = data.groupby(["a", "b", "c", "d"]) assert len(grouped) == len(values) + @pytest.mark.parametrize("agg", ["mean", "median"]) + def test_int64_overflow_groupby_large_df_shuffled(self, agg): arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) i = np.random.choice(len(arr), len(arr) * 4) - arr = np.vstack((arr, arr[i])) # add sume duplicate rows + arr = np.vstack((arr, arr[i])) # add some duplicate rows i = np.random.permutation(len(arr)) arr = arr[i] # shuffle rows @@ -98,42 +113,98 @@ def test_int64_overflow_moar(self): assert len(gr) == len(jim) mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde")) - def aggr(func): - f = lambda a: np.fromiter(map(func, a), dtype="f8") - arr = np.vstack((f(jim.values()), f(joe.values()))).T - res = DataFrame(arr, columns=["jim", "joe"], index=mi) - return res.sort_index() - - tm.assert_frame_equal(gr.mean(), aggr(np.mean)) - tm.assert_frame_equal(gr.median(), aggr(np.median)) - - def test_lexsort_indexer(self): + f = lambda a: np.fromiter(map(getattr(np, agg), a), dtype="f8") + arr = np.vstack((f(jim.values()), f(joe.values()))).T + res = DataFrame(arr, columns=["jim", "joe"], index=mi).sort_index() + + tm.assert_frame_equal(getattr(gr, agg)(), res) + + @pytest.mark.parametrize( + "order, na_position, exp", + [ + [ + True, + "last", + list(range(5, 105)) + list(range(5)) + list(range(105, 110)), + ], + [ + True, + "first", + list(range(5)) + list(range(105, 110)) + list(range(5, 105)), + ], + [ + False, + "last", + list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)), + ], + [ + False, + "first", + list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)), + ], + ], + ) + def 
test_lexsort_indexer(self, order, na_position, exp): keys = [[np.nan] * 5 + list(range(100)) + [np.nan] * 5] - # orders=True, na_position='last' - result = lexsort_indexer(keys, orders=True, na_position="last") - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=True, na_position='first' - result = lexsort_indexer(keys, orders=True, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='last' - result = lexsort_indexer(keys, orders=False, na_position="last") - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='first' - result = lexsort_indexer(keys, orders=False, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + result = lexsort_indexer(keys, orders=order, na_position=na_position) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - def test_nargsort(self): - # np.argsort(items) places NaNs last - items = [np.nan] * 5 + list(range(100)) + [np.nan] * 5 - # np.argsort(items2) may not place NaNs first - items2 = np.array(items, dtype="O") + @pytest.mark.parametrize( + "ascending, na_position, exp, box", + [ + [ + True, + "last", + list(range(5, 105)) + list(range(5)) + list(range(105, 110)), + list, + ], + [ + True, + "first", + list(range(5)) + list(range(105, 110)) + list(range(5, 105)), + list, + ], + [ + False, + "last", + list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)), + list, + ], + [ + False, + "first", + list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)), + list, + ], + [ + True, + "last", + list(range(5, 105)) + list(range(5)) + list(range(105, 110)), + lambda x: np.array(x, dtype="O"), + ], + [ + True, + "first", + list(range(5)) + list(range(105, 110)) + list(range(5, 105)), + lambda x: np.array(x, dtype="O"), + ], + [ + False, + "last", + list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)), + lambda x: np.array(x, dtype="O"), + ], + [ + False, + "first", + list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)), + lambda x: np.array(x, dtype="O"), + ], + ], + ) + def test_nargsort(self, ascending, na_position, exp, box): + # list places NaNs last, np.array(..., dtype="O") may not place NaNs first + items = box([np.nan] * 5 + list(range(100)) + [np.nan] * 5) # mergesort is the most difficult to get right because we want it to be # stable. 
@@ -143,71 +214,23 @@ def test_nargsort(self): # because quick and merge sort fall over to insertion sort for small # arrays.""" - # mergesort, ascending=True, na_position='last' - result = nargsort(items, kind="mergesort", ascending=True, na_position="last") - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = nargsort(items, kind="mergesort", ascending=True, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = nargsort(items, kind="mergesort", ascending=False, na_position="last") - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' - result = nargsort(items, kind="mergesort", ascending=False, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='last' - result = nargsort(items2, kind="mergesort", ascending=True, na_position="last") - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = nargsort(items2, kind="mergesort", ascending=True, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = nargsort(items2, kind="mergesort", ascending=False, na_position="last") - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' result = nargsort( - items2, kind="mergesort", ascending=False, na_position="first" + items, kind="mergesort", ascending=ascending, na_position=na_position ) - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) class TestMerge: - @pytest.mark.slow - def test_int64_overflow_issues(self): - + def test_int64_overflow_outer_merge(self): # #2690, combinatorial explosion df1 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G1"]) df2 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G2"]) - - # it works! 
result = merge(df1, df2, how="outer") assert len(result) == 2000 - low, high, n = -1 << 10, 1 << 10, 1 << 20 - left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")) - left["left"] = left.sum(axis=1) - - # one-2-one match - i = np.random.permutation(len(left)) - right = left.iloc[i].copy() - right.columns = right.columns[:-1].tolist() + ["right"] - right.index = np.arange(len(right)) - right["right"] *= -1 + @pytest.mark.slow + def test_int64_overflow_check_sum_col(self, left_right): + left, right = left_right out = merge(left, right, how="outer") assert len(out) == len(left) @@ -216,10 +239,19 @@ def test_int64_overflow_issues(self): tm.assert_series_equal(out["left"], result, check_names=False) assert result.name is None + @pytest.mark.slow + @pytest.mark.parametrize("how", ["left", "right", "outer", "inner"]) + def test_int64_overflow_how_merge(self, left_right, how): + left, right = left_right + + out = merge(left, right, how="outer") out.sort_values(out.columns.tolist(), inplace=True) out.index = np.arange(len(out)) - for how in ["left", "right", "outer", "inner"]: - tm.assert_frame_equal(out, merge(left, right, how=how, sort=True)) + tm.assert_frame_equal(out, merge(left, right, how=how, sort=True)) + + @pytest.mark.slow + def test_int64_overflow_sort_false_order(self, left_right): + left, right = left_right # check that left merge w/ sort=False maintains left frame order out = merge(left, right, how="left", sort=False) @@ -228,8 +260,12 @@ def test_int64_overflow_issues(self): out = merge(right, left, how="left", sort=False) tm.assert_frame_equal(right, out[right.columns.tolist()]) + @pytest.mark.slow + @pytest.mark.parametrize("how", ["left", "right", "outer", "inner"]) + @pytest.mark.parametrize("sort", [True, False]) + def test_int64_overflow_one_to_many_none_match(self, how, sort): # one-2-many/none match - n = 1 << 11 + low, high, n = -1 << 10, 1 << 10, 1 << 11 left = DataFrame( np.random.randint(low, high, (n, 7)).astype("int64"), columns=list("ABCDEFG"), @@ -300,12 +336,6 @@ def align(df): df.index = np.arange(len(df)) return df - def verify_order(df): - kcols = list("ABCDEFG") - tm.assert_frame_equal( - df[kcols].copy(), df[kcols].sort_values(kcols, kind="mergesort") - ) - out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"]) out = align(out) @@ -316,84 +346,81 @@ def verify_order(df): "outer": np.ones(len(out), dtype="bool"), } - for how in ["left", "right", "outer", "inner"]: - mask = jmask[how] - frame = align(out[mask].copy()) - assert mask.all() ^ mask.any() or how == "outer" - - for sort in [False, True]: - res = merge(left, right, how=how, sort=sort) - if sort: - verify_order(res) - - # as in GH9092 dtypes break with outer/right join - tm.assert_frame_equal( - frame, align(res), check_dtype=how not in ("right", "outer") - ) - - -def test_decons(): - def testit(codes_list, shape): - group_index = get_group_index(codes_list, shape, sort=True, xnull=True) - codes_list2 = decons_group_index(group_index, shape) + mask = jmask[how] + frame = align(out[mask].copy()) + assert mask.all() ^ mask.any() or how == "outer" - for a, b in zip(codes_list, codes_list2): - tm.assert_numpy_array_equal(a, b) + res = merge(left, right, how=how, sort=sort) + if sort: + kcols = list("ABCDEFG") + tm.assert_frame_equal( + res[kcols].copy(), res[kcols].sort_values(kcols, kind="mergesort") + ) - shape = (4, 5, 6) - codes_list = [ - np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), - np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), - 
np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), - ] - testit(codes_list, shape) + # as in GH9092 dtypes break with outer/right join + # 2021-12-18: dtype does not break anymore + tm.assert_frame_equal(frame, align(res)) + + +@pytest.mark.parametrize( + "codes_list, shape", + [ + [ + [ + np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), + ], + (4, 5, 6), + ], + [ + [ + np.tile(np.arange(10000, dtype=np.int64), 5), + np.tile(np.arange(10000, dtype=np.int64), 5), + ], + (10000, 10000), + ], + ], +) +def test_decons(codes_list, shape): + group_index = get_group_index(codes_list, shape, sort=True, xnull=True) + codes_list2 = decons_group_index(group_index, shape) - shape = (10000, 10000) - codes_list = [ - np.tile(np.arange(10000, dtype=np.int64), 5), - np.tile(np.arange(10000, dtype=np.int64), 5), - ] - testit(codes_list, shape) + for a, b in zip(codes_list, codes_list2): + tm.assert_numpy_array_equal(a, b) class TestSafeSort: - def test_basic_sort(self): - values = [3, 1, 2, 0, 4] - result = safe_sort(values) - expected = np.array([0, 1, 2, 3, 4]) - tm.assert_numpy_array_equal(result, expected) - - values = list("baaacb") - result = safe_sort(values) - expected = np.array(list("aaabbc"), dtype="object") - tm.assert_numpy_array_equal(result, expected) - - values = [] - result = safe_sort(values) - expected = np.array([]) + @pytest.mark.parametrize( + "arg, exp", + [ + [[3, 1, 2, 0, 4], [0, 1, 2, 3, 4]], + [list("baaacb"), np.array(list("aaabbc"), dtype=object)], + [[], []], + ], + ) + def test_basic_sort(self, arg, exp): + result = safe_sort(arg) + expected = np.array(exp) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("verify", [True, False]) - def test_codes(self, verify): + @pytest.mark.parametrize( + "codes, exp_codes, na_sentinel", + [ + [[0, 1, 1, 2, 3, 0, -1, 4], [3, 1, 1, 2, 0, 3, -1, 4], -1], + [[0, 1, 1, 2, 3, 0, 99, 4], [3, 1, 1, 2, 0, 3, 99, 4], 99], + [[], [], -1], + ], + ) + def test_codes(self, verify, codes, exp_codes, na_sentinel): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) - codes = [0, 1, 1, 2, 3, 0, -1, 4] - result, result_codes = safe_sort(values, codes, verify=verify) - expected_codes = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_codes, expected_codes) - - # na_sentinel - codes = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_codes = safe_sort(values, codes, na_sentinel=99, verify=verify) - expected_codes = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_codes, expected_codes) - - codes = [] - result, result_codes = safe_sort(values, codes, verify=verify) - expected_codes = np.array([], dtype=np.intp) + result, result_codes = safe_sort( + values, codes, na_sentinel=na_sentinel, verify=verify + ) + expected_codes = np.array(exp_codes, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) @@ -411,12 +438,14 @@ def test_codes_out_of_bound(self, na_sentinel): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) - def test_mixed_integer(self): - values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) + @pytest.mark.parametrize("box", [lambda x: np.array(x, dtype=object), list]) + def test_mixed_integer(self, box): 
+ values = box(["b", 1, 0, "a", 0, "b"]) result = safe_sort(values) expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) + def test_mixed_integer_with_codes(self): values = np.array(["b", 1, 0, "a"], dtype=object) codes = [0, 1, 2, 3, 0, -1, 1] result, result_codes = safe_sort(values, codes) @@ -425,12 +454,6 @@ def test_mixed_integer(self): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) - def test_mixed_integer_from_list(self): - values = ["b", 1, 0, "a", 0, "b"] - result = safe_sort(values) - expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object) - tm.assert_numpy_array_equal(result, expected) - def test_unsortable(self): # GH 13714 arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) @@ -438,22 +461,25 @@ def test_unsortable(self): with pytest.raises(TypeError, match=msg): safe_sort(arr) - def test_exceptions(self): - with pytest.raises(TypeError, match="Only list-like objects are allowed"): - safe_sort(values=1) - - with pytest.raises(TypeError, match="Only list-like objects or None"): - safe_sort(values=[0, 1, 2], codes=1) - - with pytest.raises(ValueError, match="values should be unique"): - safe_sort(values=[0, 1, 2, 1], codes=[0, 1]) - - def test_extension_array(self): - # a = array([1, 3, np.nan, 2], dtype='Int64') - a = array([1, 3, 2], dtype="Int64") + @pytest.mark.parametrize( + "arg, codes, err, msg", + [ + [1, None, TypeError, "Only list-like objects are allowed"], + [[0, 1, 2], 1, TypeError, "Only list-like objects or None"], + [[0, 1, 2, 1], [0, 1], ValueError, "values should be unique"], + ], + ) + def test_exceptions(self, arg, codes, err, msg): + with pytest.raises(err, match=msg): + safe_sort(values=arg, codes=codes) + + @pytest.mark.parametrize( + "arg, exp", [[[1, 3, 2], [1, 2, 3]], [[1, 3, np.nan, 2], [1, 2, 3, np.nan]]] + ) + def test_extension_array(self, arg, exp): + a = array(arg, dtype="Int64") result = safe_sort(a) - # expected = array([1, 2, 3, np.nan], dtype='Int64') - expected = array([1, 2, 3], dtype="Int64") + expected = array(exp, dtype="Int64") tm.assert_extension_array_equal(result, expected) @pytest.mark.parametrize("verify", [True, False]) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 3842e9a625b8b0..013af7eb90cd34 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -65,29 +65,44 @@ def test_to_datetime_readonly(self, readonly): expected = to_datetime([]) tm.assert_index_equal(result, expected) - def test_to_datetime_format(self, cache): - values = ["1/1/2000", "1/2/2000", "1/3/2000"] - - results1 = [Timestamp("20000101"), Timestamp("20000201"), Timestamp("20000301")] - results2 = [Timestamp("20000101"), Timestamp("20000102"), Timestamp("20000103")] - for vals, expecteds in [ - (values, (Index(results1), Index(results2))), - (Series(values), (Series(results1), Series(results2))), - (values[0], (results1[0], results2[0])), - (values[1], (results1[1], results2[1])), - (values[2], (results1[2], results2[2])), - ]: - - for i, fmt in enumerate(["%d/%m/%Y", "%m/%d/%Y"]): - result = to_datetime(vals, format=fmt, cache=cache) - expected = expecteds[i] - - if isinstance(expected, Series): - tm.assert_series_equal(result, Series(expected)) - elif isinstance(expected, Timestamp): - assert result == expected - else: - tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("box", [Series, Index]) + @pytest.mark.parametrize( + "format, 
expected", + [ + [ + "%d/%m/%Y", + [Timestamp("20000101"), Timestamp("20000201"), Timestamp("20000301")], + ], + [ + "%m/%d/%Y", + [Timestamp("20000101"), Timestamp("20000102"), Timestamp("20000103")], + ], + ], + ) + def test_to_datetime_format(self, cache, box, format, expected): + values = box(["1/1/2000", "1/2/2000", "1/3/2000"]) + result = to_datetime(values, format=format, cache=cache) + expected = box(expected) + if isinstance(expected, Series): + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "arg, expected, format", + [ + ["1/1/2000", "20000101", "%d/%m/%Y"], + ["1/1/2000", "20000101", "%m/%d/%Y"], + ["1/2/2000", "20000201", "%d/%m/%Y"], + ["1/2/2000", "20000102", "%m/%d/%Y"], + ["1/3/2000", "20000301", "%d/%m/%Y"], + ["1/3/2000", "20000103", "%m/%d/%Y"], + ], + ) + def test_to_datetime_format_scalar(self, cache, arg, expected, format): + result = to_datetime(arg, format=format, cache=cache) + expected = Timestamp(expected) + assert result == expected def test_to_datetime_format_YYYYMMDD(self, cache): ser = Series([19801222, 19801222] + [19810105] * 5) @@ -99,6 +114,8 @@ def test_to_datetime_format_YYYYMMDD(self, cache): result = to_datetime(ser.apply(str), format="%Y%m%d", cache=cache) tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): + ser = Series([19801222, 19801222] + [19810105] * 5) # with NaT expected = Series( [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5 @@ -115,7 +132,7 @@ def test_to_datetime_format_YYYYMMDD(self, cache): result = to_datetime(ser2, format="%Y%m%d", cache=cache) tm.assert_series_equal(result, expected) - def test_to_datetime_format_YYYYMMDD_coercion(self, cache): + def test_to_datetime_format_YYYYMMDD_ignore(self, cache): # coercion # GH 7930 ser = Series([20121231, 20141231, 99991231]) @@ -126,6 +143,10 @@ def test_to_datetime_format_YYYYMMDD_coercion(self, cache): ) tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_coercion(self, cache): + # coercion + # GH 7930 + ser = Series([20121231, 20141231, 99991231]) result = to_datetime(ser, format="%Y%m%d", errors="coerce", cache=cache) expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") tm.assert_series_equal(result, expected) @@ -243,9 +264,6 @@ def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, expected): assert result == expected def test_to_datetime_format_microsecond(self, cache): - - # these are locale dependent - lang, _ = locale.getlocale() month_abbr = calendar.month_abbr[4] val = f"01-{month_abbr}-2011 00:00:01.978" @@ -254,8 +272,9 @@ def test_to_datetime_format_microsecond(self, cache): exp = datetime.strptime(val, format) assert result == exp - def test_to_datetime_format_time(self, cache): - data = [ + @pytest.mark.parametrize( + "value, format, dt", + [ ["01/10/2010 15:20", "%m/%d/%Y %H:%M", Timestamp("2010-01-10 15:20")], ["01/10/2010 05:43", "%m/%d/%Y %I:%M", Timestamp("2010-01-10 05:43")], [ @@ -263,23 +282,37 @@ def test_to_datetime_format_time(self, cache): "%m/%d/%Y %H:%M:%S", Timestamp("2010-01-10 13:56:01"), ], - ] - locale_specific = [ - ["01/10/2010 08:14 PM", "%m/%d/%Y %I:%M %p", Timestamp("2010-01-10 20:14")], - ["01/10/2010 07:40 AM", "%m/%d/%Y %I:%M %p", Timestamp("2010-01-10 07:40")], - [ + pytest.param( + "01/10/2010 08:14 PM", + "%m/%d/%Y %I:%M %p", + Timestamp("2010-01-10 20:14"), + marks=pytest.mark.xfail( + locale.getlocale()[0] == "zh_CN", + reason="fail 
on a CI build with LC_ALL=zh_CN.utf8", + ), + ), + pytest.param( + "01/10/2010 07:40 AM", + "%m/%d/%Y %I:%M %p", + Timestamp("2010-01-10 07:40"), + marks=pytest.mark.xfail( + locale.getlocale()[0] == "zh_CN", + reason="fail on a CI build with LC_ALL=zh_CN.utf8", + ), + ), + pytest.param( "01/10/2010 09:12:56 AM", "%m/%d/%Y %I:%M:%S %p", Timestamp("2010-01-10 09:12:56"), - ], - ] - if locale.getlocale()[0] == "en_US": - # this fail on a CI build with LC_ALL=zh_CN.utf8, so en_US - # may be more specific than necessary. - data.extend(locale_specific) - - for value, format, dt in data: - assert to_datetime(value, format=format, cache=cache) == dt + marks=pytest.mark.xfail( + locale.getlocale()[0] == "zh_CN", + reason="fail on a CI build with LC_ALL=zh_CN.utf8", + ), + ), + ], + ) + def test_to_datetime_format_time(self, cache, value, format, dt): + assert to_datetime(value, format=format, cache=cache) == dt @td.skip_if_has_locale def test_to_datetime_with_non_exact(self, cache): @@ -295,20 +328,23 @@ def test_to_datetime_with_non_exact(self, cache): ) tm.assert_series_equal(result, expected) - def test_parse_nanoseconds_with_formula(self, cache): - - # GH8989 - # truncating the nanoseconds when a format was provided - for v in [ + @pytest.mark.parametrize( + "arg", + [ "2012-01-01 09:00:00.000000001", "2012-01-01 09:00:00.000001", "2012-01-01 09:00:00.001", "2012-01-01 09:00:00.001000", "2012-01-01 09:00:00.001000000", - ]: - expected = to_datetime(v, cache=cache) - result = to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f", cache=cache) - assert result == expected + ], + ) + def test_parse_nanoseconds_with_formula(self, cache, arg): + + # GH8989 + # truncating the nanoseconds when a format was provided + expected = to_datetime(arg, cache=cache) + result = to_datetime(arg, format="%Y-%m-%d %H:%M:%S.%f", cache=cache) + assert result == expected @pytest.mark.parametrize( "value,fmt,expected", @@ -537,9 +573,6 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr - result = to_datetime(arr) - assert result is arr - def test_to_datetime_pydatetime(self): actual = to_datetime(datetime(2008, 1, 15)) assert actual == datetime(2008, 1, 15) @@ -570,14 +603,15 @@ def test_to_datetime_now(self): assert pdnow2.tzinfo is None @td.skip_if_windows # `tm.set_timezone` does not work in windows - def test_to_datetime_today(self): + @pytest.mark.parametrize("tz", ["Pacific/Auckland", "US/Samoa"]) + def test_to_datetime_today(self, tz): # See GH#18666 # Test with one timezone far ahead of UTC and another far behind, so # one of these will _almost_ always be in a different day from UTC. # Unfortunately this test between 12 and 1 AM Samoa time # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. 
- with tm.set_timezone("Pacific/Auckland"): # 12-13 hours ahead of UTC + with tm.set_timezone(tz): nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) pdtoday = to_datetime("today") pdtoday2 = to_datetime(["today"])[0] @@ -595,28 +629,15 @@ def test_to_datetime_today(self): assert pdtoday.tzinfo is None assert pdtoday2.tzinfo is None - with tm.set_timezone("US/Samoa"): # 11 hours behind UTC - nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) - pdtoday = to_datetime("today") - pdtoday2 = to_datetime(["today"])[0] + @pytest.mark.parametrize("arg", ["now", "today"]) + def test_to_datetime_today_now_unicode_bytes(self, arg): + to_datetime([arg]) - # These should all be equal with infinite perf; this gives - # a generous margin of 10 seconds - assert abs(pdtoday.normalize().value - nptoday) < 1e10 - assert abs(pdtoday2.normalize().value - nptoday) < 1e10 - - assert pdtoday.tzinfo is None - assert pdtoday2.tzinfo is None - - def test_to_datetime_today_now_unicode_bytes(self): - to_datetime(["now"]) - to_datetime(["today"]) - - def test_to_datetime_dt64s(self, cache): - in_bound_dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] - - for dt in in_bound_dts: - assert to_datetime(dt, cache=cache) == Timestamp(dt) + @pytest.mark.parametrize( + "dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] + ) + def test_to_datetime_dt64s(self, cache, dt): + assert to_datetime(dt, cache=cache) == Timestamp(dt) @pytest.mark.parametrize( "dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")] @@ -681,6 +702,7 @@ def test_to_datetime_tz(self, cache): ) tm.assert_index_equal(result, expected) + def test_to_datetime_tz_mixed_raises(self, cache): # mixed tzs will raise arr = [ Timestamp("2013-01-01 13:00:00", tz="US/Pacific"), @@ -748,10 +770,17 @@ def test_to_datetime_utc_true(self, cache, init_constructor, end_constructor): expected = end_constructor(expected_data) tm.assert_equal(result, expected) + @pytest.mark.parametrize( + "scalar, expected", + [ + ["20100102 121314", Timestamp("2010-01-02 12:13:14", tz="utc")], + ["20100102 121315", Timestamp("2010-01-02 12:13:15", tz="utc")], + ], + ) + def test_to_datetime_utc_true_scalar(self, cache, scalar, expected): # Test scalar case as well - for scalar, expected in zip(data, expected_data): - result = to_datetime(scalar, format="%Y%m%d %H%M%S", utc=True, cache=cache) - assert result == expected + result = to_datetime(scalar, format="%Y%m%d %H%M%S", utc=True, cache=cache) + assert result == expected def test_to_datetime_utc_true_with_series_single_value(self, cache): # GH 15760 UTC=True with Series @@ -820,17 +849,16 @@ def test_to_datetime_tz_psycopg2(self, cache): expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]") tm.assert_index_equal(result, expected) - def test_datetime_bool(self, cache): + @pytest.mark.parametrize("arg", [True, False]) + def test_datetime_bool(self, cache, arg): # GH13176 msg = r"dtype bool cannot be converted to datetime64\[ns\]" with pytest.raises(TypeError, match=msg): - to_datetime(False) - assert to_datetime(False, errors="coerce", cache=cache) is NaT - assert to_datetime(False, errors="ignore", cache=cache) is False - with pytest.raises(TypeError, match=msg): - to_datetime(True) - assert to_datetime(True, errors="coerce", cache=cache) is NaT - assert to_datetime(True, errors="ignore", cache=cache) is True + to_datetime(arg) + assert to_datetime(arg, errors="coerce", cache=cache) is NaT + assert to_datetime(arg, errors="ignore", 
cache=cache) is arg + + def test_datetime_bool_arrays_mixed(self, cache): msg = f"{type(cache)} is not convertible to datetime" with pytest.raises(TypeError, match=msg): to_datetime([False, datetime.today()], cache=cache) @@ -843,13 +871,12 @@ def test_datetime_bool(self, cache): ), ) - def test_datetime_invalid_datatype(self): + @pytest.mark.parametrize("arg", [bool, to_datetime]) + def test_datetime_invalid_datatype(self, arg): # GH13176 msg = "is not convertible to datetime" with pytest.raises(TypeError, match=msg): - to_datetime(bool) - with pytest.raises(TypeError, match=msg): - to_datetime(to_datetime) + to_datetime(arg) @pytest.mark.parametrize("value", ["a", "00:01:99"]) @pytest.mark.parametrize("infer", [True, False]) @@ -1071,6 +1098,8 @@ def test_iso_8601_strings_with_different_offsets(self): expected = Index(expected) tm.assert_index_equal(result, expected) + def test_iso_8601_strings_with_different_offsets_utc(self): + ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT] result = to_datetime(ts_strings, utc=True) expected = DatetimeIndex( [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" @@ -1101,6 +1130,7 @@ def test_iso8601_strings_mixed_offsets_with_naive(self): ) tm.assert_index_equal(result, expected) + def test_iso8601_strings_mixed_offsets_with_naive_reversed(self): items = ["2018-11-28T00:00:00+12:00", "2018-11-28T00:00:00"] result = to_datetime(items, utc=True) expected = to_datetime(list(reversed(items)), utc=True)[::-1] @@ -1190,6 +1220,7 @@ def test_unit(self, cache): with pytest.raises(ValueError, match=msg): to_datetime([1], unit="D", format="%Y%m%d", cache=cache) + def test_unit_array_mixed_nans(self, cache): values = [11111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] result = to_datetime(values, unit="D", errors="ignore", cache=cache) expected = Index( @@ -1217,6 +1248,7 @@ def test_unit(self, cache): with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(values, unit="D", errors="raise", cache=cache) + def test_unit_array_mixed_nans_large_int(self, cache): values = [1420043460000, iNaT, NaT, np.nan, "NaT"] result = to_datetime(values, errors="ignore", unit="s", cache=cache) @@ -1234,65 +1266,65 @@ def test_unit(self, cache): def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache): # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime - - try: + msg = "non convertible value foo with the unit 's'" + with pytest.raises(ValueError, match=msg): to_datetime("foo", errors="raise", unit="s", cache=cache) - except OutOfBoundsDatetime as err: - raise AssertionError("incorrect exception raised") from err - except ValueError: - pass - else: - assert False, "Failed to raise ValueError" - - def test_unit_consistency(self, cache): + @pytest.mark.parametrize("error", ["raise", "coerce", "ignore"]) + def test_unit_consistency(self, cache, error): # consistency of conversions expected = Timestamp("1970-05-09 14:25:11") - result = to_datetime(11111111, unit="s", errors="raise", cache=cache) - assert result == expected - assert isinstance(result, Timestamp) - - result = to_datetime(11111111, unit="s", errors="coerce", cache=cache) + result = to_datetime(11111111, unit="s", errors=error, cache=cache) assert result == expected assert isinstance(result, Timestamp) - result = to_datetime(11111111, unit="s", errors="ignore", cache=cache) - assert result == expected - assert isinstance(result, Timestamp) - - def test_unit_with_numeric(self, cache): - + 
@pytest.mark.parametrize("errors", ["ignore", "raise", "coerce"]) + @pytest.mark.parametrize("dtype", ["float64", "int64"]) + def test_unit_with_numeric(self, cache, errors, dtype): # GH 13180 # coercions from floats/ints are ok expected = DatetimeIndex(["2015-06-19 05:33:20", "2015-05-27 22:33:20"]) - arr1 = [1.434692e18, 1.432766e18] - arr2 = np.array(arr1).astype("int64") - for errors in ["ignore", "raise", "coerce"]: - result = to_datetime(arr1, errors=errors, cache=cache) - tm.assert_index_equal(result, expected) - - result = to_datetime(arr2, errors=errors, cache=cache) - tm.assert_index_equal(result, expected) + arr = np.array([1.434692e18, 1.432766e18]).astype(dtype) + result = to_datetime(arr, errors=errors, cache=cache) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "exp, arr", + [ + [ + ["NaT", "2015-06-19 05:33:20", "2015-05-27 22:33:20"], + ["foo", 1.434692e18, 1.432766e18], + ], + [ + ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"], + [1.434692e18, 1.432766e18, "foo", "NaT"], + ], + ], + ) + def test_unit_with_numeric_coerce(self, cache, exp, arr): # but we want to make sure that we are coercing # if we have ints/strings - expected = DatetimeIndex(["NaT", "2015-06-19 05:33:20", "2015-05-27 22:33:20"]) - arr = ["foo", 1.434692e18, 1.432766e18] - result = to_datetime(arr, errors="coerce", cache=cache) - tm.assert_index_equal(result, expected) - - expected = DatetimeIndex( - ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"] - ) - arr = [1.434692e18, 1.432766e18, "foo", "NaT"] + expected = DatetimeIndex(exp) result = to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) - def test_unit_mixed(self, cache): + @pytest.mark.parametrize( + "exp, arr", + [ + [ + ["2013-01-01", "NaT", "NaT"], + [Timestamp("20130101"), 1.434692e18, 1.432766e18], + ], + [ + ["NaT", "NaT", "2013-01-01"], + [1.434692e18, 1.432766e18, Timestamp("20130101")], + ], + ], + ) + def test_unit_mixed(self, cache, exp, arr): # mixed integers/datetimes - expected = DatetimeIndex(["2013-01-01", "NaT", "NaT"]) - arr = [Timestamp("20130101"), 1.434692e18, 1.432766e18] + expected = DatetimeIndex(exp) result = to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) @@ -1300,14 +1332,6 @@ def test_unit_mixed(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(arr, errors="raise", cache=cache) - expected = DatetimeIndex(["NaT", "NaT", "2013-01-01"]) - arr = [1.434692e18, 1.432766e18, Timestamp("20130101")] - result = to_datetime(arr, errors="coerce", cache=cache) - tm.assert_index_equal(result, expected) - - with pytest.raises(ValueError, match=msg): - to_datetime(arr, errors="raise", cache=cache) - def test_unit_rounding(self, cache): # GH 14156 & GH 20445: argument will incur floating point errors # but no premature rounding @@ -1328,36 +1352,26 @@ def test_to_datetime_errors_ignore_utc_true(self): tm.assert_index_equal(result, expected) # TODO: this is moved from tests.series.test_timeseries, may be redundant - def test_to_datetime_unit(self): - + @pytest.mark.parametrize("dtype", [int, float]) + def test_to_datetime_unit(self, dtype): epoch = 1370745748 - s1 = Series([epoch + t for t in range(20)]) - s2 = Series([epoch + t for t in range(20)]).astype(float) - - for ser in [s1, s2]: - result = to_datetime(ser, unit="s") - expected = Series( - [ - Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) - for t in range(20) - ] - ) - tm.assert_series_equal(result, expected) - - s1 = 
Series([epoch + t for t in range(20)] + [iNaT]) - s2 = Series([epoch + t for t in range(20)] + [iNaT]).astype(float) - s3 = Series([epoch + t for t in range(20)] + [np.nan]) + ser = Series([epoch + t for t in range(20)]).astype(dtype) + result = to_datetime(ser, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + ) + tm.assert_series_equal(result, expected) - for ser in [s1, s2, s3]: - result = to_datetime(ser, unit="s") - expected = Series( - [ - Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) - for t in range(20) - ] - + [NaT] - ) - tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("null", [iNaT, np.nan]) + def test_to_datetime_unit_with_nulls(self, null): + epoch = 1370745748 + ser = Series([epoch + t for t in range(20)] + [null]) + result = to_datetime(ser, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) + tm.assert_series_equal(result, expected) def test_to_datetime_unit_fractional_seconds(self): @@ -1383,23 +1397,19 @@ def test_to_datetime_unit_na_values(self): ) tm.assert_index_equal(result, expected) - def test_to_datetime_unit_invalid(self): - msg = "non convertible value foo with the unit 'D'" + @pytest.mark.parametrize("bad_val", ["foo", 111111111]) + def test_to_datetime_unit_invalid(self, bad_val): + msg = f"{bad_val} with the unit 'D'" with pytest.raises(ValueError, match=msg): - to_datetime([1, 2, "foo"], unit="D") - msg = "cannot convert input 111111111 with the unit 'D'" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime([1, 2, 111111111], unit="D") + to_datetime([1, 2, bad_val], unit="D") - def test_to_timestamp_unit_coerce(self): + @pytest.mark.parametrize("bad_val", ["foo", 111111111]) + def test_to_timestamp_unit_coerce(self, bad_val): # coerce we can process expected = DatetimeIndex( [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1 ) - result = to_datetime([1, 2, "foo"], unit="D", errors="coerce") - tm.assert_index_equal(result, expected) - - result = to_datetime([1, 2, 111111111], unit="D", errors="coerce") + result = to_datetime([1, 2, bad_val], unit="D", errors="coerce") tm.assert_index_equal(result, expected) @@ -1434,6 +1444,7 @@ def test_dataframe(self, df, cache): result = to_datetime(df[["year", "month", "day"]].to_dict(), cache=cache) tm.assert_series_equal(result, expected) + def test_dataframe_dict_with_constructable(self, df, cache): # dict but with constructable df2 = df[["year", "month", "day"]].to_dict() df2["month"] = 2 @@ -1443,9 +1454,9 @@ def test_dataframe(self, df, cache): ) tm.assert_series_equal(result, expected2) - def test_dataframe_field_aliases_column_subset(self, df, cache): - # unit mappings - units = [ + @pytest.mark.parametrize( + "unit", + [ { "year": "years", "month": "months", @@ -1462,14 +1473,15 @@ def test_dataframe_field_aliases_column_subset(self, df, cache): "minute": "minute", "second": "second", }, - ] - - for d in units: - result = to_datetime(df[list(d.keys())].rename(columns=d), cache=cache) - expected = Series( - [Timestamp("20150204 06:58:10"), Timestamp("20160305 07:59:11")] - ) - tm.assert_series_equal(result, expected) + ], + ) + def test_dataframe_field_aliases_column_subset(self, df, cache, unit): + # unit mappings + result = to_datetime(df[list(unit.keys())].rename(columns=unit), cache=cache) + expected = Series( + [Timestamp("20150204 06:58:10"), Timestamp("20160305 07:59:11")] + ) + tm.assert_series_equal(result, expected) 
def test_dataframe_field_aliases(self, df, cache): d = { @@ -1527,21 +1539,24 @@ def test_dataframe_extra_keys_raisesm(self, df, cache): df2["foo"] = 1 to_datetime(df2, cache=cache) - def test_dataframe_missing_keys_raises(self, df, cache): - # not enough - msg = ( - r"to assemble mappings requires at least that \[year, month, " - r"day\] be specified: \[.+\] is missing" - ) - for c in [ + @pytest.mark.parametrize( + "cols", + [ ["year"], ["year", "month"], ["year", "month", "second"], ["month", "day"], ["year", "day", "second"], - ]: - with pytest.raises(ValueError, match=msg): - to_datetime(df[c], cache=cache) + ], + ) + def test_dataframe_missing_keys_raises(self, df, cache, cols): + # not enough + msg = ( + r"to assemble mappings requires at least that \[year, month, " + r"day\] be specified: \[.+\] is missing" + ) + with pytest.raises(ValueError, match=msg): + to_datetime(df[cols], cache=cache) def test_dataframe_duplicate_columns_raises(self, cache): # duplicates @@ -1558,7 +1573,7 @@ def test_dataframe_duplicate_columns_raises(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) - def test_dataframe_dtypes(self, cache): + def test_dataframe_int16(self, cache): # GH#13451 df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) @@ -1569,7 +1584,9 @@ def test_dataframe_dtypes(self, cache): ) tm.assert_series_equal(result, expected) + def test_dataframe_mixed(self, cache): # mixed dtypes + df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) df["month"] = df["month"].astype("int8") df["day"] = df["day"].astype("int8") result = to_datetime(df, cache=cache) @@ -1578,6 +1595,7 @@ def test_dataframe_dtypes(self, cache): ) tm.assert_series_equal(result, expected) + def test_dataframe_float(self, cache): # float df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) msg = "cannot assemble the datetimes: unconverted data remains: 1" @@ -1605,13 +1623,16 @@ def test_to_datetime_barely_out_of_bounds(self): with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(arr) - def test_to_datetime_iso8601(self, cache): - result = to_datetime(["2012-01-01 00:00:00"], cache=cache) - exp = Timestamp("2012-01-01 00:00:00") - assert result[0] == exp - - result = to_datetime(["20121001"], cache=cache) # bad iso 8601 - exp = Timestamp("2012-10-01") + @pytest.mark.parametrize( + "arg, exp_str", + [ + ["2012-01-01 00:00:00", "2012-01-01 00:00:00"], + ["20121001", "2012-10-01"], # bad iso 8601 + ], + ) + def test_to_datetime_iso8601(self, cache, arg, exp_str): + result = to_datetime([arg], cache=cache) + exp = Timestamp(exp_str) + assert result[0] == exp def test_to_datetime_default(self, cache): @@ -1659,6 +1680,11 @@ def test_to_datetime_with_apply(self, cache): result = td.apply(to_datetime, format="%b %y", cache=cache) tm.assert_series_equal(result, expected) + @td.skip_if_has_locale + def test_to_datetime_with_apply_with_empty_str(self, cache): + # this is only locale tested with US/None locales + # GH 5195 + # with a format and coerce a single item to_datetime fails td = Series(["May 04", "Jun 02", ""], index=[1, 2, 3]) msg = r"time data '' does not match format '%b %y' \(match\)" with pytest.raises(ValueError, match=msg): @@ -1672,15 +1698,16 @@ def test_to_datetime_with_apply(self, cache): ) tm.assert_series_equal(result, expected) - def test_to_datetime_types(self, cache): - + def test_to_datetime_empty_str(self, cache): # empty string result = to_datetime("", cache=cache) assert result is NaT + def 
test_to_datetime_empty_str_list(self, cache): result = to_datetime(["", ""], cache=cache) assert isna(result).all() + def test_to_datetime_zero(self, cache): # ints result = Timestamp(0) expected = to_datetime(0, cache=cache) @@ -1692,15 +1719,15 @@ def test_to_datetime_strings(self, cache): result = to_datetime("2012", cache=cache) assert result == expected + def test_to_datetime_strings_variation(self, cache): array = ["2012", "20120101", "20120101 12:01:01"] expected = list(to_datetime(array, cache=cache)) result = [Timestamp(date_str) for date_str in array] tm.assert_almost_equal(result, expected) + @pytest.mark.parametrize("result", [Timestamp("2012"), to_datetime("2012")]) + def test_to_datetime_strings_vs_constructor(self, result): expected = Timestamp(2012, 1, 1) - result = Timestamp("2012") - assert result == expected - result = to_datetime("2012") assert result == expected def test_to_datetime_unprocessable_input(self, cache): @@ -1775,6 +1802,7 @@ def test_string_na_nat_conversion(self, cache): assert isinstance(result2, DatetimeIndex) tm.assert_numpy_array_equal(result, result2.values) + def test_string_na_nat_conversion_malformed(self, cache): malformed = np.array(["1/100/2000", np.nan], dtype=object) # GH 10636, default is now 'raise' @@ -1790,6 +1818,7 @@ def test_string_na_nat_conversion(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(malformed, errors="raise", cache=cache) + def test_string_na_nat_conversion_with_name(self, cache): idx = ["a", "b", "c", "d", "e"] series = Series( ["1/1/2000", np.nan, "1/3/2000", np.nan, "1/5/2000"], index=idx, name="foo" @@ -1862,16 +1891,12 @@ def test_dayfirst(self, cache): tm.assert_index_equal(expected, idx5) tm.assert_index_equal(expected, idx6) - def test_dayfirst_warnings(self): + def test_dayfirst_warnings_valid_input(self): # GH 12585 warning_msg_day_first = ( "Parsing '31/12/2014' in DD/MM/YYYY format. Provide " "format or specify infer_datetime_format=True for consistent parsing." ) - warning_msg_month_first = ( - "Parsing '03/30/2011' in MM/DD/YYYY format. Provide " - "format or specify infer_datetime_format=True for consistent parsing." - ) # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] @@ -1901,9 +1926,18 @@ def test_dayfirst_warnings(self): res4 = to_datetime(arr, infer_datetime_format=True) tm.assert_index_equal(expected_consistent, res4) + def test_dayfirst_warnings_invalid_input(self): # CASE 2: invalid input # cannot consistently process with single format # warnings *always* raised + warning_msg_day_first = ( + "Parsing '31/12/2014' in DD/MM/YYYY format. Provide " + "format or specify infer_datetime_format=True for consistent parsing." + ) + warning_msg_month_first = ( + "Parsing '03/30/2011' in MM/DD/YYYY format. Provide " + "format or specify infer_datetime_format=True for consistent parsing." 
+ ) arr = ["31/12/2014", "03/30/2011"] # first in DD/MM/YYYY, second in MM/DD/YYYY @@ -1946,19 +1980,24 @@ def test_to_datetime_dta_tz(self, klass): class TestGuessDatetimeFormat: @td.skip_if_not_us_locale - def test_guess_datetime_format_for_array(self): + @pytest.mark.parametrize( + "test_array", + [ + [ + "2011-12-30 00:00:00.000000", + "2011-12-30 00:00:00.000000", + "2011-12-30 00:00:00.000000", + ], + [np.nan, np.nan, "2011-12-30 00:00:00.000000"], + ["2011-12-30 00:00:00.000000", "random_string"], + ], + ) + def test_guess_datetime_format_for_array(self, test_array): expected_format = "%Y-%m-%d %H:%M:%S.%f" - dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) - - test_arrays = [ - np.array([dt_string, dt_string, dt_string], dtype="O"), - np.array([np.nan, np.nan, dt_string], dtype="O"), - np.array([dt_string, "random_string"], dtype="O"), - ] - - for test_array in test_arrays: - assert tools._guess_datetime_format_for_array(test_array) == expected_format + assert tools._guess_datetime_format_for_array(test_array) == expected_format + @td.skip_if_not_us_locale + def test_guess_datetime_format_for_array_all_nans(self): format_for_string_of_nans = tools._guess_datetime_format_for_array( np.array([np.nan, np.nan, np.nan], dtype="O") ) @@ -1966,33 +2005,38 @@ def test_guess_datetime_format_for_array(self): class TestToDatetimeInferFormat: - def test_to_datetime_infer_datetime_format_consistent_format(self, cache): + @pytest.mark.parametrize( + "test_format", ["%m-%d-%Y", "%m/%d/%Y %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%f"] + ) + def test_to_datetime_infer_datetime_format_consistent_format( + self, cache, test_format + ): ser = Series(date_range("20000101", periods=50, freq="H")) - test_formats = ["%m-%d-%Y", "%m/%d/%Y %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%f"] + s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format)) - for test_format in test_formats: - s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format)) - - with_format = to_datetime(s_as_dt_strings, format=test_format, cache=cache) - no_infer = to_datetime( - s_as_dt_strings, infer_datetime_format=False, cache=cache - ) - yes_infer = to_datetime( - s_as_dt_strings, infer_datetime_format=True, cache=cache - ) + with_format = to_datetime(s_as_dt_strings, format=test_format, cache=cache) + no_infer = to_datetime( + s_as_dt_strings, infer_datetime_format=False, cache=cache + ) + yes_infer = to_datetime( + s_as_dt_strings, infer_datetime_format=True, cache=cache + ) - # Whether the format is explicitly passed, it is inferred, or - # it is not inferred, the results should all be the same - tm.assert_series_equal(with_format, no_infer) - tm.assert_series_equal(no_infer, yes_infer) + # Whether the format is explicitly passed, it is inferred, or + # it is not inferred, the results should all be the same + tm.assert_series_equal(with_format, no_infer) + tm.assert_series_equal(no_infer, yes_infer) - def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): - ser = Series( - np.array( - ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] - ) - ) + @pytest.mark.parametrize( + "data", + [ + ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"], + ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"], + ], + ) + def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache, data): + ser = Series(np.array(data)) # When the format is inconsistent, infer_datetime_format should just # fallback to the default parsing @@ -2001,13 +2045,6 @@ def 
test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): to_datetime(ser, infer_datetime_format=True, cache=cache), ) - ser = Series(np.array(["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"])) - - tm.assert_series_equal( - to_datetime(ser, infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), - ) - def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): ser = Series( np.array( @@ -2052,23 +2089,24 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "ts,zero_tz,is_utc", + "ts,zero_tz", [ - ("2019-02-02 08:07:13", "Z", True), - ("2019-02-02 08:07:13", "", False), - ("2019-02-02 08:07:13.012345", "Z", True), - ("2019-02-02 08:07:13.012345", "", False), + ("2019-02-02 08:07:13", "Z"), + ("2019-02-02 08:07:13", ""), + ("2019-02-02 08:07:13.012345", "Z"), + ("2019-02-02 08:07:13.012345", ""), ], ) - def test_infer_datetime_format_zero_tz(self, ts, zero_tz, is_utc): + def test_infer_datetime_format_zero_tz(self, ts, zero_tz): # GH 41047 ser = Series([ts + zero_tz]) result = to_datetime(ser, infer_datetime_format=True) - tz = pytz.utc if is_utc else None + tz = pytz.utc if zero_tz == "Z" else None expected = Series([Timestamp(ts, tz=tz)]) tm.assert_series_equal(result, expected) - def test_to_datetime_iso8601_noleading_0s(self, cache): + @pytest.mark.parametrize("format", [None, "%Y-%m-%d"]) + def test_to_datetime_iso8601_noleading_0s(self, cache, format): # GH 11871 ser = Series(["2014-1-1", "2014-2-2", "2015-3-3"]) expected = Series( @@ -2078,58 +2116,47 @@ def test_to_datetime_iso8601_noleading_0s(self, cache): Timestamp("2015-03-03"), ] ) - tm.assert_series_equal(to_datetime(ser, cache=cache), expected) - tm.assert_series_equal( - to_datetime(ser, format="%Y-%m-%d", cache=cache), expected - ) + tm.assert_series_equal(to_datetime(ser, format=format, cache=cache), expected) class TestDaysInMonth: # tests for issue #10154 - def test_day_not_in_month_coerce(self, cache): - assert isna(to_datetime("2015-02-29", errors="coerce", cache=cache)) - assert isna( - to_datetime("2015-02-29", format="%Y-%m-%d", errors="coerce", cache=cache) - ) - assert isna( - to_datetime("2015-02-32", format="%Y-%m-%d", errors="coerce", cache=cache) - ) - assert isna( - to_datetime("2015-04-31", format="%Y-%m-%d", errors="coerce", cache=cache) - ) + @pytest.mark.parametrize( + "arg, format", + [ + ["2015-02-29", None], + ["2015-02-29", "%Y-%m-%d"], + ["2015-02-32", "%Y-%m-%d"], + ["2015-04-31", "%Y-%m-%d"], + ], + ) + def test_day_not_in_month_coerce(self, cache, arg, format): + assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) def test_day_not_in_month_raise(self, cache): msg = "day is out of range for month" with pytest.raises(ValueError, match=msg): to_datetime("2015-02-29", errors="raise", cache=cache) - msg = "time data 2015-02-29 doesn't match format specified" + @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) + def test_day_not_in_month_raise_value(self, cache, arg): + msg = f"time data {arg} doesn't match format specified" with pytest.raises(ValueError, match=msg): - to_datetime("2015-02-29", errors="raise", format="%Y-%m-%d", cache=cache) + to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) - msg = "time data 2015-02-32 doesn't match format specified" - with pytest.raises(ValueError, match=msg): - to_datetime("2015-02-32", errors="raise", format="%Y-%m-%d", cache=cache) - - msg = "time 
data 2015-04-31 doesn't match format specified" - with pytest.raises(ValueError, match=msg): - to_datetime("2015-04-31", errors="raise", format="%Y-%m-%d", cache=cache) - - def test_day_not_in_month_ignore(self, cache): - assert to_datetime("2015-02-29", errors="ignore", cache=cache) == "2015-02-29" - assert ( - to_datetime("2015-02-29", errors="ignore", format="%Y-%m-%d", cache=cache) - == "2015-02-29" - ) - assert ( - to_datetime("2015-02-32", errors="ignore", format="%Y-%m-%d", cache=cache) - == "2015-02-32" - ) - assert ( - to_datetime("2015-04-31", errors="ignore", format="%Y-%m-%d", cache=cache) - == "2015-04-31" - ) + @pytest.mark.parametrize( + "expected, format", + [ + ["2015-02-29", None], + ["2015-02-29", "%Y-%m-%d"], + ["2015-02-29", "%Y-%m-%d"], + ["2015-04-31", "%Y-%m-%d"], + ], + ) + def test_day_not_in_month_ignore(self, cache, expected, format): + result = to_datetime(expected, errors="ignore", format=format, cache=cache) + assert result == expected class TestDatetimeParsingWrappers: @@ -2235,7 +2262,22 @@ def test_parsers_nat(self): assert result3 is NaT assert result4 is NaT - def test_parsers_dayfirst_yearfirst(self, cache): + @pytest.mark.parametrize( + "date_str, dayfirst, yearfirst, expected", + [ + ("10-11-12", False, False, datetime(2012, 10, 11)), + ("10-11-12", True, False, datetime(2012, 11, 10)), + ("10-11-12", False, True, datetime(2010, 11, 12)), + ("10-11-12", True, True, datetime(2010, 12, 11)), + ("20/12/21", False, False, datetime(2021, 12, 20)), + ("20/12/21", True, False, datetime(2021, 12, 20)), + ("20/12/21", False, True, datetime(2020, 12, 21)), + ("20/12/21", True, True, datetime(2020, 12, 21)), + ], + ) + def test_parsers_dayfirst_yearfirst( + self, cache, date_str, dayfirst, yearfirst, expected + ): # OK # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 # 2.5.2 10-11-12 [dayfirst=0, yearfirst=1] -> 2012-10-11 00:00:00 @@ -2277,72 +2319,51 @@ def test_parsers_dayfirst_yearfirst(self, cache): # 2.5.3 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 # str : dayfirst, yearfirst, expected - cases = { - "10-11-12": [ - (False, False, datetime(2012, 10, 11)), - (True, False, datetime(2012, 11, 10)), - (False, True, datetime(2010, 11, 12)), - (True, True, datetime(2010, 12, 11)), - ], - "20/12/21": [ - (False, False, datetime(2021, 12, 20)), - (True, False, datetime(2021, 12, 20)), - (False, True, datetime(2020, 12, 21)), - (True, True, datetime(2020, 12, 21)), - ], - } - for date_str, values in cases.items(): - for dayfirst, yearfirst, expected in values: + # compare with dateutil result + dateutil_result = parse(date_str, dayfirst=dayfirst, yearfirst=yearfirst) + assert dateutil_result == expected - # compare with dateutil result - dateutil_result = parse( - date_str, dayfirst=dayfirst, yearfirst=yearfirst - ) - assert dateutil_result == expected - - result1, _ = parsing.parse_time_string( - date_str, dayfirst=dayfirst, yearfirst=yearfirst - ) + result1, _ = parsing.parse_time_string( + date_str, dayfirst=dayfirst, yearfirst=yearfirst + ) - # we don't support dayfirst/yearfirst here: - if not dayfirst and not yearfirst: - result2 = Timestamp(date_str) - assert result2 == expected + # we don't support dayfirst/yearfirst here: + if not dayfirst and not yearfirst: + result2 = Timestamp(date_str) + assert result2 == expected - result3 = to_datetime( - date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache - ) + result3 = to_datetime( + date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache + ) - result4 = 
DatetimeIndex( - [date_str], dayfirst=dayfirst, yearfirst=yearfirst - )[0] + result4 = DatetimeIndex([date_str], dayfirst=dayfirst, yearfirst=yearfirst)[0] - assert result1 == expected - assert result3 == expected - assert result4 == expected + assert result1 == expected + assert result3 == expected + assert result4 == expected - def test_parsers_timestring(self, cache): + @pytest.mark.parametrize( + "date_str, exp_def", + [["10:15", datetime(1, 1, 1, 10, 15)], ["9:05", datetime(1, 1, 1, 9, 5)]], + ) + def test_parsers_timestring(self, date_str, exp_def): # must be the same as dateutil result - cases = { - "10:15": (parse("10:15"), datetime(1, 1, 1, 10, 15)), - "9:05": (parse("9:05"), datetime(1, 1, 1, 9, 5)), - } - - for date_str, (exp_now, exp_def) in cases.items(): - result1, _ = parsing.parse_time_string(date_str) - result2 = to_datetime(date_str) - result3 = to_datetime([date_str]) - result4 = Timestamp(date_str) - result5 = DatetimeIndex([date_str])[0] - # parse time string return time string based on default date - # others are not, and can't be changed because it is used in - # time series plot - assert result1 == exp_def - assert result2 == exp_now - assert result3 == exp_now - assert result4 == exp_now - assert result5 == exp_now + exp_now = parse(date_str) + + result1, _ = parsing.parse_time_string(date_str) + result2 = to_datetime(date_str) + result3 = to_datetime([date_str]) + result4 = Timestamp(date_str) + result5 = DatetimeIndex([date_str])[0] + # parse time string return time string based on default date + # others are not, and can't be changed because it is used in + # time series plot + assert result1 == exp_def + assert result2 == exp_now + assert result3 == exp_now + assert result4 == exp_now + assert result5 == exp_now @pytest.mark.parametrize( "dt_string, tz, dt_string_repr", @@ -2420,7 +2441,7 @@ def julian_dates(): class TestOrigin: - def test_to_basic(self, julian_dates): + def test_julian(self, julian_dates): # gh-11276, gh-11745 # for origin as julian @@ -2430,19 +2451,13 @@ def test_to_basic(self, julian_dates): ) tm.assert_series_equal(result, expected) + def test_unix(self): result = Series(to_datetime([0, 1, 2], unit="D", origin="unix")) expected = Series( [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")] ) tm.assert_series_equal(result, expected) - # default - result = Series(to_datetime([0, 1, 2], unit="D")) - expected = Series( - [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")] - ) - tm.assert_series_equal(result, expected) - def test_julian_round_trip(self): result = to_datetime(2456658, origin="julian", unit="D") assert result.to_julian_date() == 2456658 @@ -2460,15 +2475,13 @@ def test_invalid_unit(self, units, julian_dates): with pytest.raises(ValueError, match=msg): to_datetime(julian_dates, unit=units, origin="julian") - def test_invalid_origin(self): + @pytest.mark.parametrize("unit", ["ns", "D"]) + def test_invalid_origin(self, unit): # need to have a numeric specified msg = "it must be numeric with a unit specified" with pytest.raises(ValueError, match=msg): - to_datetime("2005-01-01", origin="1960-01-01") - - with pytest.raises(ValueError, match=msg): - to_datetime("2005-01-01", origin="1960-01-01", unit="D") + to_datetime("2005-01-01", origin="1960-01-01", unit=unit) def test_epoch(self, units, epochs, epoch_1960, units_from_epochs): @@ -2506,12 +2519,20 @@ def test_to_datetime_out_of_bounds_with_format_arg(self, format): with pytest.raises(OutOfBoundsDatetime, match=msg): 
to_datetime("2417-10-27 00:00:00", format=format) - def test_processing_order(self): + @pytest.mark.parametrize( + "arg, origin, expected_str", + [ + [200 * 365, "unix", "2169-11-13 00:00:00"], + [200 * 365, "1870-01-01", "2069-11-13 00:00:00"], + [300 * 365, "1870-01-01", "2169-10-20 00:00:00"], + ], + ) + def test_processing_order(self, arg, origin, expected_str): # make sure we handle out-of-bounds *before* # constructing the dates - result = to_datetime(200 * 365, unit="D") - expected = Timestamp("2169-11-13 00:00:00") + result = to_datetime(arg, unit="D", origin=origin) + expected = Timestamp(expected_str) assert result == expected result = to_datetime(200 * 365, unit="D", origin="1870-01-01") @@ -2626,7 +2647,7 @@ def test_empty_string_datetime_coerce_format(): # raise an exception in case a format is given with pytest.raises(ValueError, match="does not match format"): - result = to_datetime(td, format=format, errors="raise") + to_datetime(td, format=format, errors="raise") # don't raise an exception in case no format is given result = to_datetime(td, errors="raise") diff --git a/pandas/tests/tools/test_to_time.py b/pandas/tests/tools/test_to_time.py index bfd347fd122c36..968102ce9edded 100644 --- a/pandas/tests/tools/test_to_time.py +++ b/pandas/tests/tools/test_to_time.py @@ -1,42 +1,47 @@ from datetime import time +import locale import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import Series import pandas._testing as tm from pandas.core.tools.datetimes import to_time as to_time_alias from pandas.core.tools.times import to_time +fails_on_zh_cn = pytest.mark.xfail( + locale.getlocale()[0] == "zh_CN", + reason="fail on a CI build with LC_ALL=zh_CN.utf8", +) + class TestToTime: - @td.skip_if_has_locale - def test_parsers_time(self): - # GH#11818 - strings = [ + @pytest.mark.parametrize( + "time_string", + [ "14:15", "1415", - "2:15pm", - "0215pm", + pytest.param("2:15pm", marks=fails_on_zh_cn), + pytest.param("0215pm", marks=fails_on_zh_cn), "14:15:00", "141500", - "2:15:00pm", - "021500pm", + pytest.param("2:15:00pm", marks=fails_on_zh_cn), + pytest.param("021500pm", marks=fails_on_zh_cn), time(14, 15), - ] - expected = time(14, 15) - - for time_string in strings: - assert to_time(time_string) == expected + ], + ) + def test_parsers_time(self, time_string): + # GH#11818 + assert to_time(time_string) == time(14, 15) + def test_odd_format(self): new_string = "14.15" msg = r"Cannot convert arg \['14\.15'\] to a time" with pytest.raises(ValueError, match=msg): to_time(new_string) - assert to_time(new_string, format="%H.%M") == expected + assert to_time(new_string, format="%H.%M") == time(14, 15) + def test_arraylike(self): arg = ["14:15", "20:20"] expected_arr = [time(14, 15), time(20, 20)] assert to_time(arg) == expected_arr diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 7b35e8d55c3381..ec6fccd42dbc95 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -30,21 +30,23 @@ def test_to_timedelta_readonly(self, readonly): expected = to_timedelta([]) tm.assert_index_equal(result, expected) - def test_to_timedelta(self): - + def test_to_timedelta_null(self): result = to_timedelta(["", ""]) assert isna(result).all() + def test_to_timedelta_same_np_timedelta64(self): # pass thru result = to_timedelta(np.array([np.timedelta64(1, "s")])) expected = pd.Index(np.array([np.timedelta64(1, "s")])) tm.assert_index_equal(result, expected) + def 
test_to_timedelta_series(self): # Series expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) result = to_timedelta(Series(["1d", "1days 00:00:01"])) tm.assert_series_equal(result, expected) + def test_to_timedelta_units(self): # with units result = TimedeltaIndex( [np.timedelta64(0, "ns"), np.timedelta64(10, "s").astype("m8[ns]")] @@ -52,30 +54,21 @@ def test_to_timedelta(self): expected = to_timedelta([0, 10], unit="s") tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "dtype, unit", + [ + ["int64", "s"], + ["int64", "m"], + ["int64", "h"], + ["timedelta64[s]", "s"], + ["timedelta64[D]", "D"], + ], + ) + def test_to_timedelta_units_dtypes(self, dtype, unit): # arrays of various dtypes - arr = np.array([1] * 5, dtype="int64") - result = to_timedelta(arr, unit="s") - expected = TimedeltaIndex([np.timedelta64(1, "s")] * 5) - tm.assert_index_equal(result, expected) - - arr = np.array([1] * 5, dtype="int64") - result = to_timedelta(arr, unit="m") - expected = TimedeltaIndex([np.timedelta64(1, "m")] * 5) - tm.assert_index_equal(result, expected) - - arr = np.array([1] * 5, dtype="int64") - result = to_timedelta(arr, unit="h") - expected = TimedeltaIndex([np.timedelta64(1, "h")] * 5) - tm.assert_index_equal(result, expected) - - arr = np.array([1] * 5, dtype="timedelta64[s]") - result = to_timedelta(arr) - expected = TimedeltaIndex([np.timedelta64(1, "s")] * 5) - tm.assert_index_equal(result, expected) - - arr = np.array([1] * 5, dtype="timedelta64[D]") - result = to_timedelta(arr) - expected = TimedeltaIndex([np.timedelta64(1, "D")] * 5) + arr = np.array([1] * 5, dtype=dtype) + result = to_timedelta(arr, unit=unit) + expected = TimedeltaIndex([np.timedelta64(1, unit)] * 5) tm.assert_index_equal(result, expected) def test_to_timedelta_oob_non_nano(self): @@ -91,31 +84,30 @@ def test_to_timedelta_oob_non_nano(self): with pytest.raises(OutOfBoundsTimedelta, match=msg): TimedeltaArray._from_sequence(arr) - def test_to_timedelta_dataframe(self): + @pytest.mark.parametrize( + "arg", [np.arange(10).reshape(2, 5), pd.DataFrame(np.arange(10).reshape(2, 5))] + ) + @pytest.mark.parametrize("errors", ["ignore", "raise", "coerce"]) + def test_to_timedelta_dataframe(self, arg, errors): # GH 11776 - arr = np.arange(10).reshape(2, 5) - df = pd.DataFrame(np.arange(10).reshape(2, 5)) - for arg in (arr, df): - with pytest.raises(TypeError, match="1-d array"): - to_timedelta(arg) - for errors in ["ignore", "raise", "coerce"]: - with pytest.raises(TypeError, match="1-d array"): - to_timedelta(arg, errors=errors) + with pytest.raises(TypeError, match="1-d array"): + to_timedelta(arg, errors=errors) - def test_to_timedelta_invalid(self): + def test_to_timedelta_invalid_errors(self): # bad value for errors parameter msg = "errors must be one of" with pytest.raises(ValueError, match=msg): to_timedelta(["foo"], errors="never") + @pytest.mark.parametrize("arg", [[1, 2], 1]) + def test_to_timedelta_invalid_unit(self, arg): # these will error msg = "invalid unit abbreviation: foo" with pytest.raises(ValueError, match=msg): - to_timedelta([1, 2], unit="foo") - with pytest.raises(ValueError, match=msg): - to_timedelta(1, unit="foo") + to_timedelta(arg, unit="foo") + def test_to_timedelta_time(self): # time not supported ATM msg = ( "Value must be Timedelta, string, integer, float, timedelta or convertible" @@ -124,10 +116,12 @@ def test_to_timedelta_invalid(self): to_timedelta(time(second=1)) assert to_timedelta(time(second=1), errors="coerce") is pd.NaT + def 
test_to_timedelta_bad_value(self): msg = "Could not convert 'foo' to NumPy timedelta" with pytest.raises(ValueError, match=msg): to_timedelta(["foo", "bar"]) + def test_to_timedelta_bad_value_coerce(self): tm.assert_index_equal( TimedeltaIndex([pd.NaT, pd.NaT]), to_timedelta(["foo", "bar"], errors="coerce"), @@ -138,6 +132,7 @@ def test_to_timedelta_invalid(self): to_timedelta(["1 day", "bar", "1 min"], errors="coerce"), ) + def test_to_timedelta_invalid_errors_ignore(self): # gh-13613: these should not error because errors='ignore' invalid_data = "apple" assert invalid_data == to_timedelta(invalid_data, errors="ignore") @@ -213,11 +208,10 @@ def test_to_timedelta_on_missing_values(self): actual = to_timedelta(ser) tm.assert_series_equal(actual, expected) - actual = to_timedelta(np.nan) - assert actual.value == timedelta_NaT.astype("int64") - - actual = to_timedelta(pd.NaT) - assert actual.value == timedelta_NaT.astype("int64") + @pytest.mark.parametrize("val", [np.nan, pd.NaT]) + def test_to_timedelta_on_missing_values_scalar(self, val): + actual = to_timedelta(val) + assert actual.value == np.timedelta64("NaT").astype("int64") def test_to_timedelta_float(self): # https://github.com/pandas-dev/pandas/issues/25077 @@ -237,16 +231,13 @@ def test_to_timedelta_ignore_strings_unit(self): result = to_timedelta(arr, unit="ns", errors="ignore") tm.assert_numpy_array_equal(result, arr) - def test_to_timedelta_nullable_int64_dtype(self): + @pytest.mark.parametrize( + "expected_val, result_val", [[timedelta(days=2), 2], [None, None]] + ) + def test_to_timedelta_nullable_int64_dtype(self, expected_val, result_val): # GH 35574 - expected = Series([timedelta(days=1), timedelta(days=2)]) - result = to_timedelta(Series([1, 2], dtype="Int64"), unit="days") - - tm.assert_series_equal(result, expected) - - # IntegerArray Series with nulls - expected = Series([timedelta(days=1), None]) - result = to_timedelta(Series([1, None], dtype="Int64"), unit="days") + expected = Series([timedelta(days=1), expected_val]) + result = to_timedelta(Series([1, result_val], dtype="Int64"), unit="days") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index cbbe29fb6cf9ab..f8fde000354e08 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -28,22 +28,36 @@ import pandas.tseries.offsets as offsets -def _check_generated_range(start, periods, freq): - """ - Check the range generated from a given start, frequency, and period count. - - Parameters - ---------- - start : str - The start date. - periods : int - The number of periods. - freq : str - The frequency of the range. 
- """ +@pytest.fixture( + params=[ + (timedelta(1), "D"), + (timedelta(hours=1), "H"), + (timedelta(minutes=1), "T"), + (timedelta(seconds=1), "S"), + (np.timedelta64(1, "ns"), "N"), + (timedelta(microseconds=1), "U"), + (timedelta(microseconds=1000), "L"), + ] +) +def base_delta_code_pair(request): + return request.param + + +freqs = ( + [f"Q-{month}" for month in MONTHS] + + [f"{annual}-{month}" for annual in ["A", "BA"] for month in MONTHS] + + ["M", "BM", "BMS"] + + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + + [f"W-{day}" for day in DAYS] +) + + +@pytest.mark.parametrize("freq", freqs) +@pytest.mark.parametrize("periods", [5, 7]) +def test_infer_freq_range(periods, freq): freq = freq.upper() - gen = date_range(start, periods=periods, freq=freq) + gen = date_range("1/1/2000", periods=periods, freq=freq) index = DatetimeIndex(gen.values) if not freq.startswith("Q-"): @@ -72,41 +86,6 @@ def _check_generated_range(start, periods, freq): assert is_dec_range or is_nov_range or is_oct_range -@pytest.fixture( - params=[ - (timedelta(1), "D"), - (timedelta(hours=1), "H"), - (timedelta(minutes=1), "T"), - (timedelta(seconds=1), "S"), - (np.timedelta64(1, "ns"), "N"), - (timedelta(microseconds=1), "U"), - (timedelta(microseconds=1000), "L"), - ] -) -def base_delta_code_pair(request): - return request.param - - -@pytest.fixture(params=[1, 2, 3, 4]) -def count(request): - return request.param - - -@pytest.fixture(params=DAYS) -def day(request): - return request.param - - -@pytest.fixture(params=MONTHS) -def month(request): - return request.param - - -@pytest.fixture(params=[5, 7]) -def periods(request): - return request.param - - def test_raise_if_period_index(): index = period_range(start="1/1/1990", periods=20, freq="M") msg = "Check the `freq` attribute instead of using infer_freq" @@ -184,6 +163,7 @@ def test_annual_ambiguous(): assert rng.inferred_freq == "A-JAN" +@pytest.mark.parametrize("count", range(1, 5)) def test_infer_freq_delta(base_delta_code_pair, count): b = Timestamp(datetime.now()) base_delta, code = base_delta_code_pair @@ -214,28 +194,6 @@ def test_infer_freq_custom(base_delta_code_pair, constructor): assert frequencies.infer_freq(index) is None -def test_weekly_infer(periods, day): - _check_generated_range("1/1/2000", periods, f"W-{day}") - - -def test_week_of_month_infer(periods, day, count): - _check_generated_range("1/1/2000", periods, f"WOM-{count}{day}") - - -@pytest.mark.parametrize("freq", ["M", "BM", "BMS"]) -def test_monthly_infer(periods, freq): - _check_generated_range("1/1/2000", periods, "M") - - -def test_quarterly_infer(month, periods): - _check_generated_range("1/1/2000", periods, f"Q-{month}") - - -@pytest.mark.parametrize("annual", ["A", "BA"]) -def test_annually_infer(month, periods, annual): - _check_generated_range("1/1/2000", periods, f"{annual}-{month}") - - @pytest.mark.parametrize( "freq,expected", [("Q", "Q-DEC"), ("Q-NOV", "Q-NOV"), ("Q-OCT", "Q-OCT")] ) diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 8ed88d55935470..cefb2f86703b23 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -26,29 +26,6 @@ ) -def _check_holiday_results(holiday, start, end, expected): - """ - Check that the dates for a given holiday match in date and timezone. - - Parameters - ---------- - holiday : Holiday - The holiday to check. - start : datetime-like - The start date of range in which to collect dates for a given holiday. 
- end : datetime-like - The end date of range in which to collect dates for a given holiday. - expected : list - The list of dates we expect to get. - """ - assert list(holiday.dates(start, end)) == expected - - # Verify that timezone info is preserved. - assert list( - holiday.dates(utc.localize(Timestamp(start)), utc.localize(Timestamp(end))) - ) == [utc.localize(dt) for dt in expected] - - @pytest.mark.parametrize( "holiday,start_date,end_date,expected", [ @@ -141,46 +118,61 @@ def _check_holiday_results(holiday, start, end, expected): ], ) def test_holiday_dates(holiday, start_date, end_date, expected): - _check_holiday_results(holiday, start_date, end_date, expected) + assert list(holiday.dates(start_date, end_date)) == expected + + # Verify that timezone info is preserved. + assert list( + holiday.dates( + utc.localize(Timestamp(start_date)), utc.localize(Timestamp(end_date)) + ) + ) == [utc.localize(dt) for dt in expected] @pytest.mark.parametrize( "holiday,start,expected", [ (USMemorialDay, datetime(2015, 7, 1), []), - (USMemorialDay, "2015-05-25", "2015-05-25"), + (USMemorialDay, "2015-05-25", [Timestamp("2015-05-25")]), (USLaborDay, datetime(2015, 7, 1), []), - (USLaborDay, "2015-09-07", "2015-09-07"), + (USLaborDay, "2015-09-07", [Timestamp("2015-09-07")]), (USColumbusDay, datetime(2015, 7, 1), []), - (USColumbusDay, "2015-10-12", "2015-10-12"), + (USColumbusDay, "2015-10-12", [Timestamp("2015-10-12")]), (USThanksgivingDay, datetime(2015, 7, 1), []), - (USThanksgivingDay, "2015-11-26", "2015-11-26"), + (USThanksgivingDay, "2015-11-26", [Timestamp("2015-11-26")]), (USMartinLutherKingJr, datetime(2015, 7, 1), []), - (USMartinLutherKingJr, "2015-01-19", "2015-01-19"), + (USMartinLutherKingJr, "2015-01-19", [Timestamp("2015-01-19")]), (USPresidentsDay, datetime(2015, 7, 1), []), - (USPresidentsDay, "2015-02-16", "2015-02-16"), + (USPresidentsDay, "2015-02-16", [Timestamp("2015-02-16")]), (GoodFriday, datetime(2015, 7, 1), []), - (GoodFriday, "2015-04-03", "2015-04-03"), - (EasterMonday, "2015-04-06", "2015-04-06"), + (GoodFriday, "2015-04-03", [Timestamp("2015-04-03")]), + (EasterMonday, "2015-04-06", [Timestamp("2015-04-06")]), (EasterMonday, datetime(2015, 7, 1), []), (EasterMonday, "2015-04-05", []), - ("New Year's Day", "2015-01-01", "2015-01-01"), - ("New Year's Day", "2010-12-31", "2010-12-31"), + ("New Year's Day", "2015-01-01", [Timestamp("2015-01-01")]), + ("New Year's Day", "2010-12-31", [Timestamp("2010-12-31")]), ("New Year's Day", datetime(2015, 7, 1), []), ("New Year's Day", "2011-01-01", []), - ("Independence Day", "2015-07-03", "2015-07-03"), + ("Independence Day", "2015-07-03", [Timestamp("2015-07-03")]), ("Independence Day", datetime(2015, 7, 1), []), ("Independence Day", "2015-07-04", []), - ("Veterans Day", "2012-11-12", "2012-11-12"), + ("Veterans Day", "2012-11-12", [Timestamp("2012-11-12")]), ("Veterans Day", datetime(2015, 7, 1), []), ("Veterans Day", "2012-11-11", []), - ("Christmas Day", "2011-12-26", "2011-12-26"), + ("Christmas Day", "2011-12-26", [Timestamp("2011-12-26")]), ("Christmas Day", datetime(2015, 7, 1), []), ("Christmas Day", "2011-12-25", []), ("Juneteenth National Independence Day", "2020-06-19", []), - ("Juneteenth National Independence Day", "2021-06-18", "2021-06-18"), + ( + "Juneteenth National Independence Day", + "2021-06-18", + [Timestamp("2021-06-18")], + ), ("Juneteenth National Independence Day", "2022-06-19", []), - ("Juneteenth National Independence Day", "2022-06-20", "2022-06-20"), + ( + "Juneteenth National Independence Day", + 
"2022-06-20", + [Timestamp("2022-06-20")], + ), ], ) def test_holidays_within_dates(holiday, start, expected): @@ -193,10 +185,12 @@ def test_holidays_within_dates(holiday, start, expected): calendar = get_calendar("USFederalHolidayCalendar") holiday = calendar.rule_from_name(holiday) - if isinstance(expected, str): - expected = [Timestamp(expected)] + assert list(holiday.dates(start, start)) == expected - _check_holiday_results(holiday, start, start, expected) + # Verify that timezone info is preserved. + assert list( + holiday.dates(utc.localize(Timestamp(start)), utc.localize(Timestamp(start))) + ) == [utc.localize(dt) for dt in expected] @pytest.mark.parametrize( diff --git a/pandas/tests/tseries/offsets/test_business_day.py b/pandas/tests/tseries/offsets/test_business_day.py index c40ae611687dd0..482d697b15e98e 100644 --- a/pandas/tests/tseries/offsets/test_business_day.py +++ b/pandas/tests/tseries/offsets/test_business_day.py @@ -7,6 +7,7 @@ timedelta, ) +import numpy as np import pytest from pandas._libs.tslibs.offsets import ( @@ -14,7 +15,6 @@ BDay, BMonthEnd, ) -from pandas.compat import np_datetime64_compat from pandas import ( DatetimeIndex, @@ -36,7 +36,7 @@ class TestBusinessDay(Base): def setup_method(self, method): self.d = datetime(2008, 1, 1) - self.nd = np_datetime64_compat("2008-01-01 00:00:00Z") + self.nd = np.datetime64("2008-01-01 00:00:00") self.offset = self._offset() self.offset1 = self.offset diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 134ba79e7773d4..5dcfd0019e93fa 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -28,7 +28,6 @@ _offset_map, ) from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG -from pandas.compat import np_datetime64_compat from pandas.errors import PerformanceWarning from pandas import DatetimeIndex @@ -101,7 +100,7 @@ class TestCommon(Base): "Second": Timestamp("2011-01-01 09:00:01"), "Milli": Timestamp("2011-01-01 09:00:00.001000"), "Micro": Timestamp("2011-01-01 09:00:00.000001"), - "Nano": Timestamp(np_datetime64_compat("2011-01-01T09:00:00.000000001Z")), + "Nano": Timestamp("2011-01-01T09:00:00.000000001"), } def test_immutable(self, offset_types): @@ -252,7 +251,7 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=Fals def test_apply(self, offset_types): sdt = datetime(2011, 1, 1, 9, 0) - ndt = np_datetime64_compat("2011-01-01 09:00Z") + ndt = np.datetime64("2011-01-01 09:00") expected = self.expecteds[offset_types.__name__] expected_norm = Timestamp(expected.date()) @@ -309,7 +308,7 @@ def test_rollforward(self, offset_types): norm_expected.update(normalized) sdt = datetime(2011, 1, 1, 9, 0) - ndt = np_datetime64_compat("2011-01-01 09:00Z") + ndt = np.datetime64("2011-01-01 09:00") for dt in [sdt, ndt]: expected = expecteds[offset_types.__name__] @@ -383,7 +382,7 @@ def test_rollback(self, offset_types): norm_expected.update(normalized) sdt = datetime(2011, 1, 1, 9, 0) - ndt = np_datetime64_compat("2011-01-01 09:00Z") + ndt = np.datetime64("2011-01-01 09:00") for dt in [sdt, ndt]: expected = expecteds[offset_types.__name__] @@ -669,14 +668,6 @@ def test_rule_code(self): assert alias == (_get_offset(alias) * 5).rule_code -def test_dateoffset_misc(): - oset = offsets.DateOffset(months=2, days=4) - # it works - oset.freqstr - - assert not offsets.DateOffset(months=2) == 2 - - def test_freq_offsets(): off = BDay(1, offset=timedelta(0, 1800)) assert off.freqstr == "B+30Min" @@ 
-792,6 +783,54 @@ def test_tick_normalize_raises(tick_classes):
         cls(n=3, normalize=True)


+@pytest.mark.parametrize(
+    "offset_kwargs, expected_arg",
+    [
+        ({"nanoseconds": 1}, "1970-01-01 00:00:00.000000001"),
+        ({"nanoseconds": 5}, "1970-01-01 00:00:00.000000005"),
+        ({"nanoseconds": -1}, "1969-12-31 23:59:59.999999999"),
+        ({"microseconds": 1}, "1970-01-01 00:00:00.000001"),
+        ({"microseconds": -1}, "1969-12-31 23:59:59.999999"),
+        ({"seconds": 1}, "1970-01-01 00:00:01"),
+        ({"seconds": -1}, "1969-12-31 23:59:59"),
+        ({"minutes": 1}, "1970-01-01 00:01:00"),
+        ({"minutes": -1}, "1969-12-31 23:59:00"),
+        ({"hours": 1}, "1970-01-01 01:00:00"),
+        ({"hours": -1}, "1969-12-31 23:00:00"),
+        ({"days": 1}, "1970-01-02 00:00:00"),
+        ({"days": -1}, "1969-12-31 00:00:00"),
+        ({"weeks": 1}, "1970-01-08 00:00:00"),
+        ({"weeks": -1}, "1969-12-25 00:00:00"),
+        ({"months": 1}, "1970-02-01 00:00:00"),
+        ({"months": -1}, "1969-12-01 00:00:00"),
+        ({"years": 1}, "1971-01-01 00:00:00"),
+        ({"years": -1}, "1969-01-01 00:00:00"),
+    ],
+)
+def test_dateoffset_add_sub(offset_kwargs, expected_arg):
+    offset = DateOffset(**offset_kwargs)
+    ts = Timestamp(0)
+    result = ts + offset
+    expected = Timestamp(expected_arg)
+    assert result == expected
+    result -= offset
+    assert result == ts
+    result = offset + ts
+    assert result == expected
+
+
+def test_dateoffset_add_sub_timestamp_with_nano():
+    offset = DateOffset(minutes=2, nanoseconds=9)
+    ts = Timestamp(4)
+    result = ts + offset
+    expected = Timestamp("1970-01-01 00:02:00.000000013")
+    assert result == expected
+    result -= offset
+    assert result == ts
+    result = offset + ts
+    assert result == expected
+
+
 @pytest.mark.parametrize(
     "attribute",
     [
@@ -807,3 +846,11 @@ def test_dateoffset_immutable(attribute):
     msg = "DateOffset objects are immutable"
     with pytest.raises(AttributeError, match=msg):
         setattr(offset, attribute, 5)
+
+
+def test_dateoffset_misc():
+    oset = offsets.DateOffset(months=2, days=4)
+    # it works
+    oset.freqstr
+
+    assert not offsets.DateOffset(months=2) == 2
diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py
index ef9f2390922ff3..1b4fa9292c4031 100644
--- a/pandas/tests/tseries/offsets/test_offsets_properties.py
+++ b/pandas/tests/tseries/offsets/test_offsets_properties.py
@@ -11,7 +11,6 @@
     assume,
     given,
 )
-from hypothesis.errors import Flaky
 import pytest
 import pytz

@@ -45,7 +44,6 @@ def test_on_offset_implementations(dt, offset):
     assert offset.is_on_offset(dt) == (compare == dt)


-@pytest.mark.xfail(strict=False, raises=Flaky, reason="unreliable test timings")
 @given(YQM_OFFSET)
 def test_shift_across_dst(offset):
     # GH#18319 check that 1) timezone is correctly normalized and
diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py
index 8c2f0b09c461eb..a0fafc227e001c 100644
--- a/pandas/tests/tslibs/test_array_to_datetime.py
+++ b/pandas/tests/tslibs/test_array_to_datetime.py
@@ -12,7 +12,6 @@
     iNaT,
     tslib,
 )
-from pandas.compat import np_array_datetime64_compat

 from pandas import Timestamp
 import pandas._testing as tm
@@ -24,15 +23,15 @@
         (
             ["01-01-2013", "01-02-2013"],
             [
-                "2013-01-01T00:00:00.000000000-0000",
-                "2013-01-02T00:00:00.000000000-0000",
+                "2013-01-01T00:00:00.000000000",
+                "2013-01-02T00:00:00.000000000",
             ],
         ),
         (
             ["Mon Sep 16 2013", "Tue Sep 17 2013"],
             [
-                "2013-09-16T00:00:00.000000000-0000",
-                "2013-09-17T00:00:00.000000000-0000",
+                "2013-09-16T00:00:00.000000000",
+                "2013-09-17T00:00:00.000000000",
             ],
         ),
], @@ -41,7 +40,7 @@ def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np_array_datetime64_compat(expected, dtype="M8[ns]") + expected = np.array(expected, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -141,8 +140,8 @@ def test_coerce_outside_ns_bounds_one_valid(): arr = np.array(["1/1/1000", "1/1/2000"], dtype=object) result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = [iNaT, "2000-01-01T00:00:00.000000000-0000"] - expected = np_array_datetime64_compat(expected, dtype="M8[ns]") + expected = [iNaT, "2000-01-01T00:00:00.000000000"] + expected = np.array(expected, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -160,11 +159,9 @@ def test_coerce_of_invalid_datetimes(errors): else: # coerce. # With coercing, the invalid dates becomes iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = ["2013-01-01T00:00:00.000000000-0000", iNaT, iNaT] + expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] - tm.assert_numpy_array_equal( - result, np_array_datetime64_compat(expected, dtype="M8[ns]") - ) + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[ns]")) def test_to_datetime_barely_out_of_bounds(): @@ -186,9 +183,9 @@ class SubDatetime(datetime): @pytest.mark.parametrize( "data,expected", [ - ([SubDatetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), - ([datetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), - ([Timestamp(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ([SubDatetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000"]), + ([datetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000"]), + ([Timestamp(2000, 1, 1)], ["2000-01-01T00:00:00.000000000"]), ], ) def test_datetime_subclass(data, expected): @@ -199,5 +196,5 @@ def test_datetime_subclass(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np_array_datetime64_compat(expected, dtype="M8[ns]") + expected = np.array(expected, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/tslibs/test_fields.py b/pandas/tests/tslibs/test_fields.py index e5fe998923f8d5..9d0b3ff4fca800 100644 --- a/pandas/tests/tslibs/test_fields.py +++ b/pandas/tests/tslibs/test_fields.py @@ -1,28 +1,39 @@ import numpy as np +import pytest from pandas._libs.tslibs import fields import pandas._testing as tm -def test_fields_readonly(): - # https://github.com/vaexio/vaex/issues/357 - # fields functions shouldn't raise when we pass read-only data +@pytest.fixture +def dtindex(): dtindex = np.arange(5, dtype=np.int64) * 10 ** 9 * 3600 * 24 * 32 dtindex.flags.writeable = False + return dtindex + +def test_get_date_name_field_readonly(dtindex): + # https://github.com/vaexio/vaex/issues/357 + # fields functions shouldn't raise when we pass read-only data result = fields.get_date_name_field(dtindex, "month_name") expected = np.array(["January", "February", "March", "April", "May"], dtype=object) tm.assert_numpy_array_equal(result, expected) + +def test_get_date_field_readonly(dtindex): result = fields.get_date_field(dtindex, "Y") expected = np.array([1970, 1970, 1970, 1970, 1970], dtype=np.int32) tm.assert_numpy_array_equal(result, expected) + +def test_get_start_end_field_readonly(dtindex): result = fields.get_start_end_field(dtindex, "is_month_start", None) expected = np.array([True, False, False, False, False], dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) + +def 
test_get_timedelta_field_readonly(dtindex): # treat dtindex as timedeltas for this next one result = fields.get_timedelta_field(dtindex, "days") expected = np.arange(5, dtype=np.int32) * 32 diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index e4a46de11ceb7f..6eee756f67a2ea 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -37,39 +37,6 @@ def index(request): return request.param -def _check_equal(obj, **kwargs): - """ - Check that hashing an objects produces the same value each time. - - Parameters - ---------- - obj : object - The object to hash. - kwargs : kwargs - Keyword arguments to pass to the hashing function. - """ - a = hash_pandas_object(obj, **kwargs) - b = hash_pandas_object(obj, **kwargs) - tm.assert_series_equal(a, b) - - -def _check_not_equal_with_index(obj): - """ - Check the hash of an object with and without its index is not the same. - - Parameters - ---------- - obj : object - The object to hash. - """ - if not isinstance(obj, Index): - a = hash_pandas_object(obj, index=True) - b = hash_pandas_object(obj, index=False) - - if len(obj): - assert not (a == b).all() - - def test_consistency(): # Check that our hash doesn't change because of a mistake # in the actual code; this is the ground truth. @@ -89,12 +56,10 @@ def test_hash_array(series): tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr)) -@pytest.mark.parametrize( - "arr2", [np.array([3, 4, "All"], dtype="U"), np.array([3, 4, "All"], dtype=object)] -) -def test_hash_array_mixed(arr2): +@pytest.mark.parametrize("dtype", ["U", object]) +def test_hash_array_mixed(dtype): result1 = hash_array(np.array(["3", "4", "All"])) - result2 = hash_array(arr2) + result2 = hash_array(np.array([3, 4, "All"], dtype=dtype)) tm.assert_numpy_array_equal(result1, result2) @@ -159,32 +124,77 @@ def test_multiindex_objects(): Series(["a", None, "c"]), Series([True, False, True]), Series(dtype=object), - Index([1, 2, 3]), - Index([True, False, True]), DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), DataFrame(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), - tm.makeTimedeltaIndex(), - tm.makePeriodIndex(), Series(tm.makePeriodIndex()), Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), + ], +) +def test_hash_pandas_object(obj, index): + a = hash_pandas_object(obj, index=index) + b = hash_pandas_object(obj, index=index) + tm.assert_series_equal(a, b) + + +@pytest.mark.parametrize( + "obj", + [ + Series([1, 2, 3]), + Series([1.0, 1.5, 3.2]), + Series([1.0, 1.5, np.nan]), + Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), + Series(["a", "b", "c"]), + Series(["a", np.nan, "c"]), + Series(["a", None, "c"]), + Series([True, False, True]), + DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), + tm.makeMissingDataframe(), + tm.makeMixedDataFrame(), + tm.makeTimeDataFrame(), + tm.makeTimeSeries(), + Series(tm.makePeriodIndex()), + Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), + ], +) +def test_hash_pandas_object_diff_index_non_empty(obj): + a = hash_pandas_object(obj, index=True) + b = hash_pandas_object(obj, index=False) + assert not (a == b).all() + + +@pytest.mark.parametrize( + "obj", + [ + Index([1, 2, 3]), + Index([True, False, True]), + tm.makeTimedeltaIndex(), + tm.makePeriodIndex(), MultiIndex.from_product( [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)] ), MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]), ], ) -def 
test_hash_pandas_object(obj, index):
-    _check_equal(obj, index=index)
-    _check_not_equal_with_index(obj)
+def test_hash_pandas_index(obj, index):
+    a = hash_pandas_object(obj, index=index)
+    b = hash_pandas_object(obj, index=index)
+    tm.assert_series_equal(a, b)


-def test_hash_pandas_object2(series, index):
-    _check_equal(series, index=index)
-    _check_not_equal_with_index(series)
+def test_hash_pandas_series(series, index):
+    a = hash_pandas_object(series, index=index)
+    b = hash_pandas_object(series, index=index)
+    tm.assert_series_equal(a, b)
+
+
+def test_hash_pandas_series_diff_index(series):
+    a = hash_pandas_object(series, index=True)
+    b = hash_pandas_object(series, index=False)
+    assert not (a == b).all()


 @pytest.mark.parametrize(
@@ -193,7 +203,9 @@ def test_hash_pandas_object2(series, index):
 def test_hash_pandas_empty_object(obj, index):
     # These are by-definition the same with
     # or without the index as the data is empty.
-    _check_equal(obj, index=index)
+    a = hash_pandas_object(obj, index=index)
+    b = hash_pandas_object(obj, index=index)
+    tm.assert_series_equal(a, b)


 @pytest.mark.parametrize(
@@ -235,11 +247,10 @@ def test_categorical_with_nan_consistency():
     assert result[1] in expected


-@pytest.mark.parametrize("obj", [pd.Timestamp("20130101")])
-def test_pandas_errors(obj):
+def test_pandas_errors():
     msg = "Unexpected type for hashing"
     with pytest.raises(TypeError, match=msg):
-        hash_pandas_object(obj)
+        hash_pandas_object(pd.Timestamp("20130101"))


 def test_hash_keys():
@@ -292,12 +303,16 @@ def test_invalid_key():
 def test_already_encoded(index):
     # If already encoded, then ok.
     obj = Series(list("abc")).str.encode("utf8")
-    _check_equal(obj, index=index)
+    a = hash_pandas_object(obj, index=index)
+    b = hash_pandas_object(obj, index=index)
+    tm.assert_series_equal(a, b)


 def test_alternate_encoding(index):
     obj = Series(list("abc"))
-    _check_equal(obj, index=index, encoding="ascii")
+    a = hash_pandas_object(obj, index=index, encoding="ascii")
+    b = hash_pandas_object(obj, index=index, encoding="ascii")
+    tm.assert_series_equal(a, b)


 @pytest.mark.parametrize("l_exp", range(8))
@@ -332,20 +347,24 @@ def test_hash_collisions():
     tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0))


-def test_hash_with_tuple():
+@pytest.mark.parametrize(
+    "data, result_data",
+    [
+        [[tuple("1"), tuple("2")], [10345501319357378243, 8331063931016360761]],
+        [[(1,), (2,)], [9408946347443669104, 3278256261030523334]],
+    ],
+)
+def test_hash_with_tuple(data, result_data):
     # GH#28969 array containing a tuple raises on call to arr.astype(str)
     # apparently a numpy bug github.com/numpy/numpy/issues/9441
-    df = DataFrame({"data": [tuple("1"), tuple("2")]})
+    df = DataFrame({"data": data})
     result = hash_pandas_object(df)
-    expected = Series([10345501319357378243, 8331063931016360761], dtype=np.uint64)
+    expected = Series(result_data, dtype=np.uint64)
     tm.assert_series_equal(result, expected)

-    df2 = DataFrame({"data": [(1,), (2,)]})
-    result = hash_pandas_object(df2)
-    expected = Series([9408946347443669104, 3278256261030523334], dtype=np.uint64)
-    tm.assert_series_equal(result, expected)

+def test_hashable_tuple_args():
     # require that the elements of such tuples are themselves hashable

     df3 = DataFrame(
diff --git a/pandas/tests/window/moments/conftest.py b/pandas/tests/window/moments/conftest.py
index b192f72c8f08b2..8f7c20fe03a021 100644
--- a/pandas/tests/window/moments/conftest.py
+++ b/pandas/tests/window/moments/conftest.py
@@ -10,41 +10,50 @@
 )


-# create the data only once as we are not setting it
-def
_create_consistency_data(): - def create_series(): - return [ - Series(dtype=np.float64, name="a"), - Series([np.nan] * 5), - Series([1.0] * 5), - Series(range(5, 0, -1)), - Series(range(5)), - Series([np.nan, 1.0, np.nan, 1.0, 1.0]), - Series([np.nan, 1.0, np.nan, 2.0, 3.0]), - Series([np.nan, 1.0, np.nan, 3.0, 2.0]), - ] - - def create_dataframes(): - return [ - DataFrame(columns=["a", "a"]), - DataFrame(np.arange(15).reshape((5, 3)), columns=["a", "a", 99]), - ] + [DataFrame(s) for s in create_series()] - - def is_constant(x): - values = x.values.ravel("K") - return len(set(values[notna(values)])) == 1 - - def no_nans(x): - return x.notna().all().all() - +def create_series(): return [ - (x, is_constant(x), no_nans(x)) - for x in itertools.chain(create_dataframes(), create_dataframes()) + Series(dtype=np.float64, name="a"), + Series([np.nan] * 5), + Series([1.0] * 5), + Series(range(5, 0, -1)), + Series(range(5)), + Series([np.nan, 1.0, np.nan, 1.0, 1.0]), + Series([np.nan, 1.0, np.nan, 2.0, 3.0]), + Series([np.nan, 1.0, np.nan, 3.0, 2.0]), ] -@pytest.fixture(params=_create_consistency_data()) -def consistency_data(request): +def create_dataframes(): + return [ + DataFrame(columns=["a", "a"]), + DataFrame(np.arange(15).reshape((5, 3)), columns=["a", "a", 99]), + ] + [DataFrame(s) for s in create_series()] + + +def is_constant(x): + values = x.values.ravel("K") + return len(set(values[notna(values)])) == 1 + + +@pytest.fixture( + params=( + obj + for obj in itertools.chain(create_series(), create_dataframes()) + if is_constant(obj) + ), + scope="module", +) +def consistent_data(request): + return request.param + + +@pytest.fixture(params=create_series()) +def series_data(request): + return request.param + + +@pytest.fixture(params=itertools.chain(create_series(), create_dataframes())) +def all_data(request): """ Test: - Empty Series / DataFrame diff --git a/pandas/tests/window/moments/test_moments_consistency_ewm.py b/pandas/tests/window/moments/test_moments_consistency_ewm.py index 8feec32ba99c55..f9f09bffb14b1b 100644 --- a/pandas/tests/window/moments/test_moments_consistency_ewm.py +++ b/pandas/tests/window/moments/test_moments_consistency_ewm.py @@ -30,7 +30,7 @@ def create_mock_weights(obj, com, adjust, ignore_na): def create_mock_series_weights(s, com, adjust, ignore_na): - w = Series(np.nan, index=s.index) + w = Series(np.nan, index=s.index, name=s.name) alpha = 1.0 / (1.0 + com) if adjust: count = 0 @@ -58,63 +58,66 @@ def create_mock_series_weights(s, com, adjust, ignore_na): return w -def test_ewm_consistency_mean(consistency_data, adjust, ignore_na, min_periods): - x, is_constant, no_nans = consistency_data +def test_ewm_consistency_mean(all_data, adjust, ignore_na, min_periods): com = 3.0 - result = x.ewm( + result = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).mean() - weights = create_mock_weights(x, com=com, adjust=adjust, ignore_na=ignore_na) + weights = create_mock_weights(all_data, com=com, adjust=adjust, ignore_na=ignore_na) expected = ( - x.multiply(weights).cumsum().divide(weights.cumsum()).fillna(method="ffill") + all_data.multiply(weights) + .cumsum() + .divide(weights.cumsum()) + .fillna(method="ffill") ) expected[ - x.expanding().count() < (max(min_periods, 1) if min_periods else 1) + all_data.expanding().count() < (max(min_periods, 1) if min_periods else 1) ] = np.nan tm.assert_equal(result, expected.astype("float64")) -def test_ewm_consistency_consistent(consistency_data, adjust, ignore_na, min_periods): - x, is_constant, 
no_nans = consistency_data +def test_ewm_consistency_consistent(consistent_data, adjust, ignore_na, min_periods): com = 3.0 - if is_constant: - count_x = x.expanding().count() - mean_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean() - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).corr(x) - exp = x.max() if isinstance(x, Series) else x.max().max() + count_x = consistent_data.expanding().count() + mean_x = consistent_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean() + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = consistent_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(consistent_data) + exp = ( + consistent_data.max() + if isinstance(consistent_data, Series) + else consistent_data.max().max() + ) - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) + # check mean of constant series + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) def test_ewm_consistency_var_debiasing_factors( - consistency_data, adjust, ignore_na, min_periods + all_data, adjust, ignore_na, min_periods ): - x, is_constant, no_nans = consistency_data com = 3.0 # check variance debiasing factors - var_unbiased_x = x.ewm( + var_unbiased_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).var(bias=False) - var_biased_x = x.ewm( + var_biased_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).var(bias=True) - weights = create_mock_weights(x, com=com, adjust=adjust, ignore_na=ignore_na) + weights = create_mock_weights(all_data, com=com, adjust=adjust, ignore_na=ignore_na) cum_sum = weights.cumsum().fillna(method="ffill") cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") numerator = cum_sum * cum_sum @@ -126,16 +129,13 @@ def test_ewm_consistency_var_debiasing_factors( @pytest.mark.parametrize("bias", [True, False]) -def test_moments_consistency_var( - consistency_data, adjust, ignore_na, min_periods, bias -): - x, is_constant, no_nans = consistency_data +def test_moments_consistency_var(all_data, adjust, ignore_na, min_periods, bias): com = 3.0 - mean_x = x.ewm( + mean_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).mean() - var_x = x.ewm( + var_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).var(bias=bias) assert not (var_x < 0).any().any() @@ -143,7 +143,7 @@ def test_moments_consistency_var( if bias: # check that biased var(x) == mean(x^2) - mean(x)^2 mean_x2 = ( - (x * x) + (all_data * all_data) .ewm(com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na) .mean() ) @@ -152,35 +152,32 @@ def test_moments_consistency_var( @pytest.mark.parametrize("bias", [True, False]) def test_moments_consistency_var_constant( - consistency_data, adjust, ignore_na, min_periods, bias + consistent_data, adjust, ignore_na, min_periods, bias ): - x, is_constant, no_nans = consistency_data com 
= 3.0 - if is_constant: - count_x = x.expanding(min_periods=min_periods).count() - var_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=bias) + count_x = consistent_data.expanding(min_periods=min_periods).count() + var_x = consistent_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=bias) - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if not bias: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if not bias: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) @pytest.mark.parametrize("bias", [True, False]) -def test_ewm_consistency_std(consistency_data, adjust, ignore_na, min_periods, bias): - x, is_constant, no_nans = consistency_data +def test_ewm_consistency_std(all_data, adjust, ignore_na, min_periods, bias): com = 3.0 - var_x = x.ewm( + var_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).var(bias=bias) assert not (var_x < 0).any().any() - std_x = x.ewm( + std_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).std(bias=bias) assert not (std_x < 0).any().any() @@ -188,9 +185,9 @@ def test_ewm_consistency_std(consistency_data, adjust, ignore_na, min_periods, b # check that var(x) == std(x)^2 tm.assert_equal(var_x, std_x * std_x) - cov_x_x = x.ewm( + cov_x_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(x, bias=bias) + ).cov(all_data, bias=bias) assert not (cov_x_x < 0).any().any() # check that var(x) == cov(x, x) @@ -199,57 +196,53 @@ def test_ewm_consistency_std(consistency_data, adjust, ignore_na, min_periods, b @pytest.mark.parametrize("bias", [True, False]) def test_ewm_consistency_series_cov_corr( - consistency_data, adjust, ignore_na, min_periods, bias + series_data, adjust, ignore_na, min_periods, bias ): - x, is_constant, no_nans = consistency_data com = 3.0 - if isinstance(x, Series): - var_x_plus_y = ( - (x + x) - .ewm(com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na) - .var(bias=bias) - ) - var_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=bias) - var_y = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=bias) - cov_x_y = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(x, bias=bias) - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) - - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - corr_x_y = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).corr(x, bias=bias) - std_x = x.ewm( + var_x_plus_y = ( + (series_data + series_data) + .ewm(com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na) + .var(bias=bias) + ) + var_x = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=bias) + var_y = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=bias) + cov_x_y = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + 
).cov(series_data, bias=bias) + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + corr_x_y = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(series_data, bias=bias) + std_x = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=bias) + std_y = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=bias) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + + if bias: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_x = series_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=bias) - std_y = x.ewm( + ).mean() + mean_y = series_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=bias) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if bias: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean() - mean_y = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean() - mean_x_times_y = ( - (x * x) - .ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ) - .mean() - ) - tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) + ).mean() + mean_x_times_y = ( + (series_data * series_data) + .ewm(com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na) + .mean() + ) + tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) diff --git a/pandas/tests/window/moments/test_moments_consistency_expanding.py b/pandas/tests/window/moments/test_moments_consistency_expanding.py index 14314f80f152cd..dafc60a057c0fe 100644 --- a/pandas/tests/window/moments/test_moments_consistency_expanding.py +++ b/pandas/tests/window/moments/test_moments_consistency_expanding.py @@ -5,67 +5,68 @@ import pandas._testing as tm +def no_nans(x): + return x.notna().all().all() + + +def all_na(x): + return x.isnull().all().all() + + @pytest.mark.parametrize("f", [lambda v: Series(v).sum(), np.nansum, np.sum]) -def test_expanding_apply_consistency_sum_nans(consistency_data, min_periods, f): - x, is_constant, no_nans = consistency_data - - if f is np.nansum and min_periods == 0: - pass - elif f is np.sum and not no_nans: - pass - else: - expanding_f_result = x.expanding(min_periods=min_periods).sum() - expanding_apply_f_result = x.expanding(min_periods=min_periods).apply( - func=f, raw=True - ) - tm.assert_equal(expanding_f_result, expanding_apply_f_result) +def test_expanding_apply_consistency_sum_nans(request, all_data, min_periods, f): + if f is np.sum: + if not no_nans(all_data) and not ( + all_na(all_data) and not all_data.empty and min_periods > 0 + ): + request.node.add_marker( + pytest.mark.xfail(reason="np.sum has different behavior with NaNs") + ) + expanding_f_result = all_data.expanding(min_periods=min_periods).sum() + expanding_apply_f_result = all_data.expanding(min_periods=min_periods).apply( + func=f, raw=True + ) + tm.assert_equal(expanding_f_result, expanding_apply_f_result) @pytest.mark.parametrize("ddof", [0, 1]) -def test_moments_consistency_var(consistency_data, min_periods, ddof): - x, is_constant, no_nans = consistency_data - - var_x = x.expanding(min_periods=min_periods).var(ddof=ddof) +def 
test_moments_consistency_var(all_data, min_periods, ddof): + var_x = all_data.expanding(min_periods=min_periods).var(ddof=ddof) assert not (var_x < 0).any().any() if ddof == 0: # check that biased var(x) == mean(x^2) - mean(x)^2 - mean_x2 = (x * x).expanding(min_periods=min_periods).mean() - mean_x = x.expanding(min_periods=min_periods).mean() + mean_x2 = (all_data * all_data).expanding(min_periods=min_periods).mean() + mean_x = all_data.expanding(min_periods=min_periods).mean() tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) @pytest.mark.parametrize("ddof", [0, 1]) -def test_moments_consistency_var_constant(consistency_data, min_periods, ddof): - x, is_constant, no_nans = consistency_data +def test_moments_consistency_var_constant(consistent_data, min_periods, ddof): + count_x = consistent_data.expanding(min_periods=min_periods).count() + var_x = consistent_data.expanding(min_periods=min_periods).var(ddof=ddof) - if is_constant: - count_x = x.expanding(min_periods=min_periods).count() - var_x = x.expanding(min_periods=min_periods).var(ddof=ddof) - - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if ddof == 1: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if ddof == 1: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) @pytest.mark.parametrize("ddof", [0, 1]) -def test_expanding_consistency_var_std_cov(consistency_data, min_periods, ddof): - x, is_constant, no_nans = consistency_data - - var_x = x.expanding(min_periods=min_periods).var(ddof=ddof) +def test_expanding_consistency_var_std_cov(all_data, min_periods, ddof): + var_x = all_data.expanding(min_periods=min_periods).var(ddof=ddof) assert not (var_x < 0).any().any() - std_x = x.expanding(min_periods=min_periods).std(ddof=ddof) + std_x = all_data.expanding(min_periods=min_periods).std(ddof=ddof) assert not (std_x < 0).any().any() # check that var(x) == std(x)^2 tm.assert_equal(var_x, std_x * std_x) - cov_x_x = x.expanding(min_periods=min_periods).cov(x, ddof=ddof) + cov_x_x = all_data.expanding(min_periods=min_periods).cov(all_data, ddof=ddof) assert not (cov_x_x < 0).any().any() # check that var(x) == cov(x, x) @@ -73,73 +74,71 @@ def test_expanding_consistency_var_std_cov(consistency_data, min_periods, ddof): @pytest.mark.parametrize("ddof", [0, 1]) -def test_expanding_consistency_series_cov_corr(consistency_data, min_periods, ddof): - x, is_constant, no_nans = consistency_data - - if isinstance(x, Series): - var_x_plus_y = (x + x).expanding(min_periods=min_periods).var(ddof=ddof) - var_x = x.expanding(min_periods=min_periods).var(ddof=ddof) - var_y = x.expanding(min_periods=min_periods).var(ddof=ddof) - cov_x_y = x.expanding(min_periods=min_periods).cov(x, ddof=ddof) - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) - - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - corr_x_y = x.expanding(min_periods=min_periods).corr(x) - std_x = x.expanding(min_periods=min_periods).std(ddof=ddof) - std_y = x.expanding(min_periods=min_periods).std(ddof=ddof) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if ddof == 0: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_x = 
x.expanding(min_periods=min_periods).mean() - mean_y = x.expanding(min_periods=min_periods).mean() - mean_x_times_y = (x * x).expanding(min_periods=min_periods).mean() - tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) - - -def test_expanding_consistency_mean(consistency_data, min_periods): - x, is_constant, no_nans = consistency_data - - result = x.expanding(min_periods=min_periods).mean() - expected = ( - x.expanding(min_periods=min_periods).sum() - / x.expanding(min_periods=min_periods).count() +def test_expanding_consistency_series_cov_corr(series_data, min_periods, ddof): + var_x_plus_y = ( + (series_data + series_data).expanding(min_periods=min_periods).var(ddof=ddof) ) - tm.assert_equal(result, expected.astype("float64")) + var_x = series_data.expanding(min_periods=min_periods).var(ddof=ddof) + var_y = series_data.expanding(min_periods=min_periods).var(ddof=ddof) + cov_x_y = series_data.expanding(min_periods=min_periods).cov(series_data, ddof=ddof) + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + corr_x_y = series_data.expanding(min_periods=min_periods).corr(series_data) + std_x = series_data.expanding(min_periods=min_periods).std(ddof=ddof) + std_y = series_data.expanding(min_periods=min_periods).std(ddof=ddof) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + if ddof == 0: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_x = series_data.expanding(min_periods=min_periods).mean() + mean_y = series_data.expanding(min_periods=min_periods).mean() + mean_x_times_y = ( + (series_data * series_data).expanding(min_periods=min_periods).mean() + ) + tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) -def test_expanding_consistency_constant(consistency_data, min_periods): - x, is_constant, no_nans = consistency_data - if is_constant: - count_x = x.expanding().count() - mean_x = x.expanding(min_periods=min_periods).mean() - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = x.expanding(min_periods=min_periods).corr(x) +def test_expanding_consistency_mean(all_data, min_periods): + result = all_data.expanding(min_periods=min_periods).mean() + expected = ( + all_data.expanding(min_periods=min_periods).sum() + / all_data.expanding(min_periods=min_periods).count() + ) + tm.assert_equal(result, expected.astype("float64")) - exp = x.max() if isinstance(x, Series) else x.max().max() - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) +def test_expanding_consistency_constant(consistent_data, min_periods): + count_x = consistent_data.expanding().count() + mean_x = consistent_data.expanding(min_periods=min_periods).mean() + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = consistent_data.expanding(min_periods=min_periods).corr(consistent_data) + + exp = ( + consistent_data.max() + if isinstance(consistent_data, Series) + else consistent_data.max().max() + ) - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) + # check mean of constant series + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) -def 
test_expanding_consistency_var_debiasing_factors(consistency_data, min_periods): - x, is_constant, no_nans = consistency_data +def test_expanding_consistency_var_debiasing_factors(all_data, min_periods): # check variance debiasing factors - var_unbiased_x = x.expanding(min_periods=min_periods).var() - var_biased_x = x.expanding(min_periods=min_periods).var(ddof=0) - var_debiasing_factors_x = x.expanding().count() / ( - x.expanding().count() - 1.0 + var_unbiased_x = all_data.expanding(min_periods=min_periods).var() + var_biased_x = all_data.expanding(min_periods=min_periods).var(ddof=0) + var_debiasing_factors_x = all_data.expanding().count() / ( + all_data.expanding().count() - 1.0 ).replace(0.0, np.nan) tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index 49bc5af4e9d693..daca19b0993bf1 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -5,44 +5,52 @@ import pandas._testing as tm +def no_nans(x): + return x.notna().all().all() + + +def all_na(x): + return x.isnull().all().all() + + @pytest.mark.parametrize("f", [lambda v: Series(v).sum(), np.nansum, np.sum]) def test_rolling_apply_consistency_sum( - consistency_data, rolling_consistency_cases, center, f + request, all_data, rolling_consistency_cases, center, f ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - if f is np.nansum and min_periods == 0: - pass - elif f is np.sum and not no_nans: - pass - else: - rolling_f_result = x.rolling( - window=window, min_periods=min_periods, center=center - ).sum() - rolling_apply_f_result = x.rolling( - window=window, min_periods=min_periods, center=center - ).apply(func=f, raw=True) - tm.assert_equal(rolling_f_result, rolling_apply_f_result) + if f is np.sum: + if not no_nans(all_data) and not ( + all_na(all_data) and not all_data.empty and min_periods > 0 + ): + request.node.add_marker( + pytest.mark.xfail(reason="np.sum has different behavior with NaNs") + ) + rolling_f_result = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).sum() + rolling_apply_f_result = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).apply(func=f, raw=True) + tm.assert_equal(rolling_f_result, rolling_apply_f_result) @pytest.mark.parametrize("ddof", [0, 1]) -def test_moments_consistency_var( - consistency_data, rolling_consistency_cases, center, ddof -): - x, is_constant, no_nans = consistency_data +def test_moments_consistency_var(all_data, rolling_consistency_cases, center, ddof): window, min_periods = rolling_consistency_cases - var_x = x.rolling(window=window, min_periods=min_periods, center=center).var( + var_x = all_data.rolling(window=window, min_periods=min_periods, center=center).var( ddof=ddof ) assert not (var_x < 0).any().any() if ddof == 0: # check that biased var(x) == mean(x^2) - mean(x)^2 - mean_x = x.rolling(window=window, min_periods=min_periods, center=center).mean() + mean_x = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).mean() mean_x2 = ( - (x * x) + (all_data * all_data) .rolling(window=window, min_periods=min_periods, center=center) .mean() ) @@ -51,41 +59,38 @@ def test_moments_consistency_var( @pytest.mark.parametrize("ddof", [0, 1]) def test_moments_consistency_var_constant( - consistency_data, 
rolling_consistency_cases, center, ddof + consistent_data, rolling_consistency_cases, center, ddof ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - if is_constant: - count_x = x.rolling( - window=window, min_periods=min_periods, center=center - ).count() - var_x = x.rolling(window=window, min_periods=min_periods, center=center).var( - ddof=ddof - ) + count_x = consistent_data.rolling( + window=window, min_periods=min_periods, center=center + ).count() + var_x = consistent_data.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=ddof) - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if ddof == 1: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if ddof == 1: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) @pytest.mark.parametrize("ddof", [0, 1]) def test_rolling_consistency_var_std_cov( - consistency_data, rolling_consistency_cases, center, ddof + all_data, rolling_consistency_cases, center, ddof ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - var_x = x.rolling(window=window, min_periods=min_periods, center=center).var( + var_x = all_data.rolling(window=window, min_periods=min_periods, center=center).var( ddof=ddof ) assert not (var_x < 0).any().any() - std_x = x.rolling(window=window, min_periods=min_periods, center=center).std( + std_x = all_data.rolling(window=window, min_periods=min_periods, center=center).std( ddof=ddof ) assert not (std_x < 0).any().any() @@ -93,9 +98,9 @@ def test_rolling_consistency_var_std_cov( # check that var(x) == std(x)^2 tm.assert_equal(var_x, std_x * std_x) - cov_x_x = x.rolling(window=window, min_periods=min_periods, center=center).cov( - x, ddof=ddof - ) + cov_x_x = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).cov(all_data, ddof=ddof) assert not (cov_x_x < 0).any().any() # check that var(x) == cov(x, x) @@ -104,122 +109,128 @@ def test_rolling_consistency_var_std_cov( @pytest.mark.parametrize("ddof", [0, 1]) def test_rolling_consistency_series_cov_corr( - consistency_data, rolling_consistency_cases, center, ddof + series_data, rolling_consistency_cases, center, ddof ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - if isinstance(x, Series): - var_x_plus_y = ( - (x + x) - .rolling(window=window, min_periods=min_periods, center=center) - .var(ddof=ddof) - ) - var_x = x.rolling(window=window, min_periods=min_periods, center=center).var( - ddof=ddof - ) - var_y = x.rolling(window=window, min_periods=min_periods, center=center).var( - ddof=ddof - ) - cov_x_y = x.rolling(window=window, min_periods=min_periods, center=center).cov( - x, ddof=ddof - ) - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + var_x_plus_y = ( + (series_data + series_data) + .rolling(window=window, min_periods=min_periods, center=center) + .var(ddof=ddof) + ) + var_x = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=ddof) + var_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=ddof) + 
cov_x_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).cov(series_data, ddof=ddof) + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + corr_x_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).corr(series_data) + std_x = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).std(ddof=ddof) + std_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).std(ddof=ddof) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - corr_x_y = x.rolling( + if ddof == 0: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_x = series_data.rolling( window=window, min_periods=min_periods, center=center - ).corr(x) - std_x = x.rolling(window=window, min_periods=min_periods, center=center).std( - ddof=ddof - ) - std_y = x.rolling(window=window, min_periods=min_periods, center=center).std( - ddof=ddof + ).mean() + mean_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + mean_x_times_y = ( + (series_data * series_data) + .rolling(window=window, min_periods=min_periods, center=center) + .mean() ) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if ddof == 0: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_x = x.rolling( - window=window, min_periods=min_periods, center=center - ).mean() - mean_y = x.rolling( - window=window, min_periods=min_periods, center=center - ).mean() - mean_x_times_y = ( - (x * x) - .rolling(window=window, min_periods=min_periods, center=center) - .mean() - ) - tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) + tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) -def test_rolling_consistency_mean(consistency_data, rolling_consistency_cases, center): - x, is_constant, no_nans = consistency_data +def test_rolling_consistency_mean(all_data, rolling_consistency_cases, center): window, min_periods = rolling_consistency_cases - result = x.rolling(window=window, min_periods=min_periods, center=center).mean() + result = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).mean() expected = ( - x.rolling(window=window, min_periods=min_periods, center=center) + all_data.rolling(window=window, min_periods=min_periods, center=center) .sum() .divide( - x.rolling(window=window, min_periods=min_periods, center=center).count() + all_data.rolling( + window=window, min_periods=min_periods, center=center + ).count() ) ) tm.assert_equal(result, expected.astype("float64")) def test_rolling_consistency_constant( - consistency_data, rolling_consistency_cases, center + consistent_data, rolling_consistency_cases, center ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - if is_constant: - count_x = x.rolling( - window=window, min_periods=min_periods, center=center - ).count() - mean_x = x.rolling(window=window, min_periods=min_periods, center=center).mean() - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = x.rolling( - window=window, min_periods=min_periods, center=center - ).corr(x) + count_x = consistent_data.rolling( + window=window, min_periods=min_periods, center=center + ).count() + mean_x = consistent_data.rolling( + window=window, min_periods=min_periods, 
center=center + ).mean() + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = consistent_data.rolling( + window=window, min_periods=min_periods, center=center + ).corr(consistent_data) - exp = x.max() if isinstance(x, Series) else x.max().max() + exp = ( + consistent_data.max() + if isinstance(consistent_data, Series) + else consistent_data.max().max() + ) - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) + # check mean of constant series + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) def test_rolling_consistency_var_debiasing_factors( - consistency_data, rolling_consistency_cases, center + all_data, rolling_consistency_cases, center ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases # check variance debiasing factors - var_unbiased_x = x.rolling( + var_unbiased_x = all_data.rolling( window=window, min_periods=min_periods, center=center ).var() - var_biased_x = x.rolling(window=window, min_periods=min_periods, center=center).var( - ddof=0 - ) + var_biased_x = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=0) var_debiasing_factors_x = ( - x.rolling(window=window, min_periods=min_periods, center=center) + all_data.rolling(window=window, min_periods=min_periods, center=center) .count() .divide( ( - x.rolling(window=window, min_periods=min_periods, center=center).count() + all_data.rolling( + window=window, min_periods=min_periods, center=center + ).count() - 1.0 ).replace(0.0, np.nan) ) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index c4efcd140baae4..6ec19e4899d533 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -122,8 +122,33 @@ def test_rolling_quantile(self, interpolation): expected.index = expected_index tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("f, expected_val", [["corr", 1], ["cov", 0.5]]) + def test_rolling_corr_cov_other_same_size_as_groups(self, f, expected_val): + # GH 42915 + df = DataFrame( + {"value": range(10), "idx1": [1] * 5 + [2] * 5, "idx2": [1, 2, 3, 4, 5] * 2} + ).set_index(["idx1", "idx2"]) + other = DataFrame({"value": range(5), "idx2": [1, 2, 3, 4, 5]}).set_index( + "idx2" + ) + result = getattr(df.groupby(level=0).rolling(2), f)(other) + expected_data = ([np.nan] + [expected_val] * 4) * 2 + expected = DataFrame( + expected_data, + columns=["value"], + index=MultiIndex.from_arrays( + [ + [1] * 5 + [2] * 5, + [1] * 5 + [2] * 5, + list(range(1, 6)) * 2, + ], + names=["idx1", "idx1", "idx2"], + ), + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("f", ["corr", "cov"]) - def test_rolling_corr_cov(self, f): + def test_rolling_corr_cov_other_diff_size_as_groups(self, f): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -138,6 +163,11 @@ def func(x): expected["A"] = np.nan tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("f", ["corr", "cov"]) + def test_rolling_corr_cov_pairwise(self, f): + g = self.frame.groupby("A") + r = g.rolling(window=4) + result = getattr(r.B, f)(pairwise=True) def func(x): @@ 
-838,6 +868,33 @@ def test_as_index_false(self, by, expected_data): ) tm.assert_frame_equal(result, expected) + def test_nan_and_zero_endpoints(self): + # https://github.com/twosigma/pandas/issues/53 + size = 1000 + idx = np.repeat(0, size) + idx[-1] = 1 + + val = 5e25 + arr = np.repeat(val, size) + arr[0] = np.nan + arr[-1] = 0 + + df = DataFrame( + { + "index": idx, + "adl2": arr, + } + ).set_index("index") + result = df.groupby("index")["adl2"].rolling(window=10, min_periods=1).mean() + expected = Series( + arr, + name="adl2", + index=MultiIndex.from_arrays( + [[0] * 999 + [1], [0] * 999 + [1]], names=["index", "index"] + ), + ) + tm.assert_series_equal(result, expected) + class TestExpanding: def setup_method(self): diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 0e3a3f3fb6c18a..10b09cbc34443a 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -6,7 +6,7 @@ ) import warnings -from dateutil.relativedelta import ( # noqa +from dateutil.relativedelta import ( # noqa:F401 FR, MO, SA,