diff --git a/.circleci/config.yml b/.circleci/config.yml index b6a5a00429d9a..1c4f33925c999 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -56,7 +56,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir export PANDAS_CI=1 diff --git a/.devcontainer.json b/.devcontainer.json index 7c5d009260c64..54ddfa1a130f8 100644 --- a/.devcontainer.json +++ b/.devcontainer.json @@ -8,7 +8,6 @@ // Use 'settings' to set *default* container specific settings.json values on container create. // You can edit these settings after create using File > Preferences > Settings > Remote. "settings": { - "terminal.integrated.shell.linux": "/bin/bash", "python.pythonPath": "/usr/local/bin/python", "python.formatting.provider": "black", "python.linting.enabled": true, diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index ceeebfcd1c90c..3eb68bdd2a15c 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -14,3 +14,9 @@ runs: condarc-file: ci/.condarc cache-environment: true cache-downloads: true + + - name: Uninstall pyarrow + if: ${{ env.REMOVE_PYARROW == '1' }} + run: | + micromamba remove -y pyarrow + shell: bash -el {0} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a085d0265a1a5..68b7573f01501 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -29,6 +29,7 @@ jobs: env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] + pandas_future_infer_string: ["0"] include: - name: "Downstream Compat" env_file: actions-311-downstream_compat.yaml @@ -58,6 +59,9 @@ jobs: # It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" - name: "Future infer strings" + env_file: actions-312.yaml + pandas_future_infer_string: "1" + - name: "Future infer strings (without pyarrow)" env_file: actions-311.yaml pandas_future_infer_string: "1" - name: "Pypy" @@ -85,9 +89,10 @@ jobs: NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }} # Clipboard tests QT_QPA_PLATFORM: offscreen + REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }} cancel-in-progress: true services: @@ -231,7 +236,7 @@ jobs: . 
~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -269,7 +274,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir @@ -290,7 +295,7 @@ jobs: # In general, this will remain frozen(present, but not running) until: # - The next unreleased Python version has released beta 1 # - This version should be available on GitHub Actions. - # - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil) + # - Our required build/runtime dependencies(numpy, Cython, python-dateutil) # support that unreleased Python version. # To unfreeze, comment out the ``if: false`` condition, and make sure you update # the name of the workflow and Python version in actions/setup-python ``python-version:`` @@ -343,7 +348,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" python -m pip list diff --git a/.gitpod.yml b/.gitpod.yml index 9222639136a17..9ff349747a33e 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -14,7 +14,7 @@ tasks: cp gitpod/settings.json .vscode/settings.json git fetch --tags python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true - pre-commit install + pre-commit install --install-hooks command: | python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true echo "✨ Pre-build complete! 
You can close this terminal ✨ " diff --git a/Dockerfile b/Dockerfile index 0fcbcee92295c..dead3a494e52d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.10.8 WORKDIR /home/pandas RUN apt-get update && apt-get -y upgrade -RUN apt-get install -y build-essential +RUN apt-get install -y build-essential bash-completion # hdf5 needed for pytables installation # libgles2-mesa needed for pytest-qt @@ -12,4 +12,6 @@ RUN python -m pip install --upgrade pip COPY requirements-dev.txt /tmp RUN python -m pip install -r /tmp/requirements-dev.txt RUN git config --global --add safe.directory /home/pandas + +ENV SHELL "/bin/bash" CMD ["/bin/bash"] diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 973e31815cf63..a9a4daa2e2059 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,20 +70,15 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.MultiIndex.names SA01" \ -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ - -i "pandas.MultiIndex.sortlevel PR07,SA01" \ -i "pandas.MultiIndex.to_frame RT03" \ -i "pandas.NA SA01" \ -i "pandas.NaT SA01" \ - -i "pandas.Period.asfreq SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.freqstr SA01" \ - -i "pandas.Period.month SA01" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.Period.strftime PR01,SA01" \ -i "pandas.Period.to_timestamp SA01" \ - -i "pandas.Period.year SA01" \ -i "pandas.PeriodDtype SA01" \ -i "pandas.PeriodDtype.freq SA01" \ -i "pandas.PeriodIndex.day SA01" \ @@ -158,28 +153,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.sparse.sp_values SA01" \ -i "pandas.Series.sparse.to_coo PR07,RT03,SA01" \ -i "pandas.Series.std PR01,RT03,SA01" \ - -i "pandas.Series.str.capitalize RT03" \ - -i "pandas.Series.str.casefold RT03" \ - -i "pandas.Series.str.center RT03,SA01" \ - -i "pandas.Series.str.decode PR07,RT03,SA01" \ - -i "pandas.Series.str.encode PR07,RT03,SA01" \ - -i "pandas.Series.str.index RT03" \ - -i "pandas.Series.str.ljust RT03,SA01" \ - -i "pandas.Series.str.lower RT03" \ - -i "pandas.Series.str.lstrip RT03" \ -i "pandas.Series.str.match RT03" \ -i "pandas.Series.str.normalize RT03,SA01" \ - -i "pandas.Series.str.partition RT03" \ -i "pandas.Series.str.repeat SA01" \ -i "pandas.Series.str.replace SA01" \ - -i "pandas.Series.str.rindex RT03" \ - -i "pandas.Series.str.rjust RT03,SA01" \ - -i "pandas.Series.str.rpartition RT03" \ - -i "pandas.Series.str.rstrip RT03" \ - -i "pandas.Series.str.strip RT03" \ - -i "pandas.Series.str.swapcase RT03" \ - -i "pandas.Series.str.title RT03" \ - -i "pandas.Series.str.upper RT03" \ -i "pandas.Series.str.wrap RT03,SA01" \ -i "pandas.Series.str.zfill RT03" \ -i "pandas.Series.struct.dtypes SA01" \ @@ -229,13 +206,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.to_julian_date SA01" \ -i "pandas.Timestamp.today SA01" \ -i "pandas.Timestamp.toordinal SA01" \ - -i "pandas.Timestamp.tz_localize SA01" \ -i "pandas.Timestamp.tzinfo GL08" \ - -i "pandas.Timestamp.tzname SA01" \ - -i "pandas.Timestamp.unit SA01" \ - -i "pandas.Timestamp.utcfromtimestamp PR01,SA01" \ - -i "pandas.Timestamp.utcoffset SA01" \ - -i "pandas.Timestamp.utctimetuple SA01" \ -i "pandas.Timestamp.value GL08" \ -i "pandas.Timestamp.year GL08" \ -i "pandas.api.extensions.ExtensionArray._pad_or_backfill PR01,RT03,SA01" \ @@ 
-259,7 +230,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.extensions.ExtensionArray.view SA01" \ -i "pandas.api.interchange.from_dataframe RT03,SA01" \ -i "pandas.api.types.is_bool PR01,SA01" \ - -i "pandas.api.types.is_bool_dtype SA01" \ -i "pandas.api.types.is_categorical_dtype SA01" \ -i "pandas.api.types.is_complex PR01,SA01" \ -i "pandas.api.types.is_complex_dtype SA01" \ @@ -421,156 +391,103 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ - -i "pandas.tseries.offsets.BQuarterBegin PR02" \ - -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ - -i "pandas.tseries.offsets.BQuarterBegin.nanos GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.rule_code GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.startingMonth GL08" \ - -i "pandas.tseries.offsets.BQuarterEnd.freqstr SA01" \ -i "pandas.tseries.offsets.BQuarterEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterEnd.n GL08" \ - -i "pandas.tseries.offsets.BQuarterEnd.nanos GL08" \ -i "pandas.tseries.offsets.BQuarterEnd.normalize GL08" \ -i "pandas.tseries.offsets.BQuarterEnd.rule_code GL08" \ -i "pandas.tseries.offsets.BQuarterEnd.startingMonth GL08" \ - -i "pandas.tseries.offsets.BYearBegin.freqstr SA01" \ -i "pandas.tseries.offsets.BYearBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BYearBegin.month GL08" \ -i "pandas.tseries.offsets.BYearBegin.n GL08" \ - -i "pandas.tseries.offsets.BYearBegin.nanos GL08" \ -i "pandas.tseries.offsets.BYearBegin.normalize GL08" \ - -i "pandas.tseries.offsets.BYearBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.BYearEnd PR02" \ - -i "pandas.tseries.offsets.BYearEnd.freqstr SA01" \ -i "pandas.tseries.offsets.BYearEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.BYearEnd.month GL08" \ -i "pandas.tseries.offsets.BYearEnd.n GL08" \ - -i "pandas.tseries.offsets.BYearEnd.nanos GL08" \ -i "pandas.tseries.offsets.BYearEnd.normalize GL08" \ - -i "pandas.tseries.offsets.BYearEnd.rule_code GL08" \ -i "pandas.tseries.offsets.BusinessDay PR02,SA01" \ -i "pandas.tseries.offsets.BusinessDay.calendar GL08" \ - -i "pandas.tseries.offsets.BusinessDay.freqstr SA01" \ -i "pandas.tseries.offsets.BusinessDay.holidays GL08" \ -i "pandas.tseries.offsets.BusinessDay.is_on_offset GL08" \ -i "pandas.tseries.offsets.BusinessDay.n GL08" \ - -i "pandas.tseries.offsets.BusinessDay.nanos GL08" \ -i "pandas.tseries.offsets.BusinessDay.normalize GL08" \ - -i "pandas.tseries.offsets.BusinessDay.rule_code GL08" \ -i "pandas.tseries.offsets.BusinessDay.weekmask GL08" \ -i "pandas.tseries.offsets.BusinessHour PR02,SA01" \ -i "pandas.tseries.offsets.BusinessHour.calendar GL08" \ -i "pandas.tseries.offsets.BusinessHour.end GL08" \ - -i "pandas.tseries.offsets.BusinessHour.freqstr SA01" \ -i "pandas.tseries.offsets.BusinessHour.holidays GL08" \ -i "pandas.tseries.offsets.BusinessHour.is_on_offset GL08" \ -i "pandas.tseries.offsets.BusinessHour.n GL08" \ - -i "pandas.tseries.offsets.BusinessHour.nanos GL08" \ -i "pandas.tseries.offsets.BusinessHour.normalize GL08" \ - -i "pandas.tseries.offsets.BusinessHour.rule_code GL08" \ -i "pandas.tseries.offsets.BusinessHour.start GL08" \ -i "pandas.tseries.offsets.BusinessHour.weekmask GL08" \ - -i 
"pandas.tseries.offsets.BusinessMonthBegin.freqstr SA01" \ -i "pandas.tseries.offsets.BusinessMonthBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BusinessMonthBegin.n GL08" \ - -i "pandas.tseries.offsets.BusinessMonthBegin.nanos GL08" \ -i "pandas.tseries.offsets.BusinessMonthBegin.normalize GL08" \ - -i "pandas.tseries.offsets.BusinessMonthBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.BusinessMonthEnd.freqstr SA01" \ -i "pandas.tseries.offsets.BusinessMonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.BusinessMonthEnd.n GL08" \ - -i "pandas.tseries.offsets.BusinessMonthEnd.nanos GL08" \ -i "pandas.tseries.offsets.BusinessMonthEnd.normalize GL08" \ - -i "pandas.tseries.offsets.BusinessMonthEnd.rule_code GL08" \ -i "pandas.tseries.offsets.CBMonthBegin PR02" \ -i "pandas.tseries.offsets.CBMonthEnd PR02" \ -i "pandas.tseries.offsets.CDay PR02,SA01" \ -i "pandas.tseries.offsets.CustomBusinessDay PR02,SA01" \ -i "pandas.tseries.offsets.CustomBusinessDay.calendar GL08" \ - -i "pandas.tseries.offsets.CustomBusinessDay.freqstr SA01" \ -i "pandas.tseries.offsets.CustomBusinessDay.holidays GL08" \ -i "pandas.tseries.offsets.CustomBusinessDay.is_on_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessDay.n GL08" \ - -i "pandas.tseries.offsets.CustomBusinessDay.nanos GL08" \ -i "pandas.tseries.offsets.CustomBusinessDay.normalize GL08" \ - -i "pandas.tseries.offsets.CustomBusinessDay.rule_code GL08" \ -i "pandas.tseries.offsets.CustomBusinessDay.weekmask GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour PR02,SA01" \ -i "pandas.tseries.offsets.CustomBusinessHour.calendar GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.end GL08" \ - -i "pandas.tseries.offsets.CustomBusinessHour.freqstr SA01" \ -i "pandas.tseries.offsets.CustomBusinessHour.holidays GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.is_on_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.n GL08" \ - -i "pandas.tseries.offsets.CustomBusinessHour.nanos GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.normalize GL08" \ - -i "pandas.tseries.offsets.CustomBusinessHour.rule_code GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.start GL08" \ -i "pandas.tseries.offsets.CustomBusinessHour.weekmask GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthBegin.freqstr SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthBegin.nanos GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthBegin.rule_code GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthBegin.weekmask GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthEnd.freqstr SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset SA01" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthEnd.nanos GL08" \ -i 
"pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \ - -i "pandas.tseries.offsets.CustomBusinessMonthEnd.rule_code GL08" \ -i "pandas.tseries.offsets.CustomBusinessMonthEnd.weekmask GL08" \ - -i "pandas.tseries.offsets.DateOffset PR02" \ - -i "pandas.tseries.offsets.DateOffset.freqstr SA01" \ -i "pandas.tseries.offsets.DateOffset.is_on_offset GL08" \ -i "pandas.tseries.offsets.DateOffset.n GL08" \ - -i "pandas.tseries.offsets.DateOffset.nanos GL08" \ -i "pandas.tseries.offsets.DateOffset.normalize GL08" \ - -i "pandas.tseries.offsets.DateOffset.rule_code GL08" \ - -i "pandas.tseries.offsets.Day.freqstr SA01" \ -i "pandas.tseries.offsets.Day.is_on_offset GL08" \ -i "pandas.tseries.offsets.Day.n GL08" \ - -i "pandas.tseries.offsets.Day.nanos SA01" \ -i "pandas.tseries.offsets.Day.normalize GL08" \ - -i "pandas.tseries.offsets.Day.rule_code GL08" \ - -i "pandas.tseries.offsets.Easter PR02" \ - -i "pandas.tseries.offsets.Easter.freqstr SA01" \ -i "pandas.tseries.offsets.Easter.is_on_offset GL08" \ -i "pandas.tseries.offsets.Easter.n GL08" \ - -i "pandas.tseries.offsets.Easter.nanos GL08" \ -i "pandas.tseries.offsets.Easter.normalize GL08" \ - -i "pandas.tseries.offsets.Easter.rule_code GL08" \ - -i "pandas.tseries.offsets.FY5253 PR02" \ - -i "pandas.tseries.offsets.FY5253.freqstr SA01" \ -i "pandas.tseries.offsets.FY5253.get_rule_code_suffix GL08" \ -i "pandas.tseries.offsets.FY5253.get_year_end GL08" \ -i "pandas.tseries.offsets.FY5253.is_on_offset GL08" \ -i "pandas.tseries.offsets.FY5253.n GL08" \ - -i "pandas.tseries.offsets.FY5253.nanos GL08" \ -i "pandas.tseries.offsets.FY5253.normalize GL08" \ -i "pandas.tseries.offsets.FY5253.rule_code GL08" \ -i "pandas.tseries.offsets.FY5253.startingMonth GL08" \ -i "pandas.tseries.offsets.FY5253.variation GL08" \ -i "pandas.tseries.offsets.FY5253.weekday GL08" \ - -i "pandas.tseries.offsets.FY5253Quarter PR02" \ - -i "pandas.tseries.offsets.FY5253Quarter.freqstr SA01" \ -i "pandas.tseries.offsets.FY5253Quarter.get_rule_code_suffix GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.get_weeks GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.is_on_offset GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.n GL08" \ - -i "pandas.tseries.offsets.FY5253Quarter.nanos GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.normalize GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.qtr_with_extra_week GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.rule_code GL08" \ @@ -578,139 +495,80 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.FY5253Quarter.variation GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.weekday GL08" \ -i "pandas.tseries.offsets.FY5253Quarter.year_has_extra_week GL08" \ - -i "pandas.tseries.offsets.Hour PR02" \ - -i "pandas.tseries.offsets.Hour.freqstr SA01" \ -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \ -i "pandas.tseries.offsets.Hour.n GL08" \ - -i "pandas.tseries.offsets.Hour.nanos SA01" \ -i "pandas.tseries.offsets.Hour.normalize GL08" \ - -i "pandas.tseries.offsets.Hour.rule_code GL08" \ - -i "pandas.tseries.offsets.LastWeekOfMonth PR02,SA01" \ - -i "pandas.tseries.offsets.LastWeekOfMonth.freqstr SA01" \ + -i "pandas.tseries.offsets.LastWeekOfMonth SA01" \ -i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \ - -i "pandas.tseries.offsets.LastWeekOfMonth.nanos GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \ - -i "pandas.tseries.offsets.LastWeekOfMonth.rule_code GL08" \ -i 
"pandas.tseries.offsets.LastWeekOfMonth.week GL08" \ -i "pandas.tseries.offsets.LastWeekOfMonth.weekday GL08" \ - -i "pandas.tseries.offsets.Micro PR02" \ - -i "pandas.tseries.offsets.Micro.freqstr SA01" \ -i "pandas.tseries.offsets.Micro.is_on_offset GL08" \ -i "pandas.tseries.offsets.Micro.n GL08" \ - -i "pandas.tseries.offsets.Micro.nanos SA01" \ -i "pandas.tseries.offsets.Micro.normalize GL08" \ - -i "pandas.tseries.offsets.Micro.rule_code GL08" \ - -i "pandas.tseries.offsets.Milli PR02" \ - -i "pandas.tseries.offsets.Milli.freqstr SA01" \ -i "pandas.tseries.offsets.Milli.is_on_offset GL08" \ -i "pandas.tseries.offsets.Milli.n GL08" \ - -i "pandas.tseries.offsets.Milli.nanos SA01" \ -i "pandas.tseries.offsets.Milli.normalize GL08" \ - -i "pandas.tseries.offsets.Milli.rule_code GL08" \ - -i "pandas.tseries.offsets.Minute PR02" \ - -i "pandas.tseries.offsets.Minute.freqstr SA01" \ -i "pandas.tseries.offsets.Minute.is_on_offset GL08" \ -i "pandas.tseries.offsets.Minute.n GL08" \ - -i "pandas.tseries.offsets.Minute.nanos SA01" \ -i "pandas.tseries.offsets.Minute.normalize GL08" \ - -i "pandas.tseries.offsets.Minute.rule_code GL08" \ - -i "pandas.tseries.offsets.MonthBegin PR02" \ - -i "pandas.tseries.offsets.MonthBegin.freqstr SA01" \ -i "pandas.tseries.offsets.MonthBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.MonthBegin.n GL08" \ - -i "pandas.tseries.offsets.MonthBegin.nanos GL08" \ -i "pandas.tseries.offsets.MonthBegin.normalize GL08" \ - -i "pandas.tseries.offsets.MonthBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.MonthEnd.freqstr SA01" \ -i "pandas.tseries.offsets.MonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.MonthEnd.n GL08" \ - -i "pandas.tseries.offsets.MonthEnd.nanos GL08" \ -i "pandas.tseries.offsets.MonthEnd.normalize GL08" \ - -i "pandas.tseries.offsets.MonthEnd.rule_code GL08" \ - -i "pandas.tseries.offsets.Nano PR02" \ - -i "pandas.tseries.offsets.Nano.freqstr SA01" \ -i "pandas.tseries.offsets.Nano.is_on_offset GL08" \ - -i "pandas.tseries.offsets.Nano.n GL08" \ - -i "pandas.tseries.offsets.Nano.nanos SA01" \ -i "pandas.tseries.offsets.Nano.normalize GL08" \ - -i "pandas.tseries.offsets.Nano.rule_code GL08" \ - -i "pandas.tseries.offsets.QuarterBegin PR02" \ - -i "pandas.tseries.offsets.QuarterBegin.freqstr SA01" \ + -i "pandas.tseries.offsets.Nano.n GL08" \ -i "pandas.tseries.offsets.QuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.QuarterBegin.n GL08" \ - -i "pandas.tseries.offsets.QuarterBegin.nanos GL08" \ -i "pandas.tseries.offsets.QuarterBegin.normalize GL08" \ -i "pandas.tseries.offsets.QuarterBegin.rule_code GL08" \ -i "pandas.tseries.offsets.QuarterBegin.startingMonth GL08" \ - -i "pandas.tseries.offsets.QuarterEnd.freqstr SA01" \ -i "pandas.tseries.offsets.QuarterEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.QuarterEnd.n GL08" \ - -i "pandas.tseries.offsets.QuarterEnd.nanos GL08" \ -i "pandas.tseries.offsets.QuarterEnd.normalize GL08" \ -i "pandas.tseries.offsets.QuarterEnd.rule_code GL08" \ -i "pandas.tseries.offsets.QuarterEnd.startingMonth GL08" \ - -i "pandas.tseries.offsets.Second PR02" \ - -i "pandas.tseries.offsets.Second.freqstr SA01" \ -i "pandas.tseries.offsets.Second.is_on_offset GL08" \ -i "pandas.tseries.offsets.Second.n GL08" \ - -i "pandas.tseries.offsets.Second.nanos SA01" \ -i "pandas.tseries.offsets.Second.normalize GL08" \ - -i "pandas.tseries.offsets.Second.rule_code GL08" \ - -i "pandas.tseries.offsets.SemiMonthBegin PR02,SA01" \ + -i "pandas.tseries.offsets.SemiMonthBegin SA01" \ -i 
"pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \ - -i "pandas.tseries.offsets.SemiMonthBegin.freqstr SA01" \ -i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ - -i "pandas.tseries.offsets.SemiMonthBegin.nanos GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd SA01" \ -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \ - -i "pandas.tseries.offsets.SemiMonthEnd.freqstr SA01" \ -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \ - -i "pandas.tseries.offsets.SemiMonthEnd.nanos GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.normalize GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.rule_code GL08" \ -i "pandas.tseries.offsets.Tick GL08" \ - -i "pandas.tseries.offsets.Tick.freqstr SA01" \ -i "pandas.tseries.offsets.Tick.is_on_offset GL08" \ -i "pandas.tseries.offsets.Tick.n GL08" \ - -i "pandas.tseries.offsets.Tick.nanos SA01" \ -i "pandas.tseries.offsets.Tick.normalize GL08" \ - -i "pandas.tseries.offsets.Tick.rule_code GL08" \ - -i "pandas.tseries.offsets.Week PR02" \ - -i "pandas.tseries.offsets.Week.freqstr SA01" \ -i "pandas.tseries.offsets.Week.is_on_offset GL08" \ -i "pandas.tseries.offsets.Week.n GL08" \ - -i "pandas.tseries.offsets.Week.nanos GL08" \ -i "pandas.tseries.offsets.Week.normalize GL08" \ - -i "pandas.tseries.offsets.Week.rule_code GL08" \ -i "pandas.tseries.offsets.Week.weekday GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth PR02,SA01" \ - -i "pandas.tseries.offsets.WeekOfMonth.freqstr SA01" \ + -i "pandas.tseries.offsets.WeekOfMonth SA01" \ -i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.n GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth.nanos GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth.rule_code GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.week GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.weekday GL08" \ - -i "pandas.tseries.offsets.YearBegin.freqstr SA01" \ -i "pandas.tseries.offsets.YearBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.YearBegin.month GL08" \ -i "pandas.tseries.offsets.YearBegin.n GL08" \ - -i "pandas.tseries.offsets.YearBegin.nanos GL08" \ -i "pandas.tseries.offsets.YearBegin.normalize GL08" \ - -i "pandas.tseries.offsets.YearBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.YearEnd.freqstr SA01" \ -i "pandas.tseries.offsets.YearEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.YearEnd.month GL08" \ -i "pandas.tseries.offsets.YearEnd.n GL08" \ - -i "pandas.tseries.offsets.YearEnd.nanos GL08" \ -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ - -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \ -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 0c46f476893dd..e670356c95637 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -23,7 +23,6 @@ dependencies: # required dependencies - python-dateutil=2.8.2 - numpy=1.23.5 - - pytz=2020.1 # optional dependencies - beautifulsoup4=4.11.2 @@ -49,6 +48,7 @@ dependencies: - pyreadstat=1.2.0 - pytables=3.8.0 - python-calamine=0.1.7 + - 
pytz=2023.4 - pyxlsb=1.0.10 - s3fs=2022.11.0 - scipy=1.10.0 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 0af46752f5b3d..c33c0344e742f 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -21,7 +21,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -47,6 +46,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 1a842c7212c1f..8692b6e35ab2d 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -22,7 +22,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -48,6 +47,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 748cfa861ec32..996ce5cd9ab94 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -18,7 +18,6 @@ dependencies: # pandas dependencies - python-dateutil - - pytz - pip - pip: diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 469fb1bfb9138..434f1d4f7fed2 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -19,7 +19,6 @@ dependencies: # required dependencies - python-dateutil - numpy<2 - - pytz - pip - pip: diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 75394e2c8e109..8e7d9aba7878d 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -21,7 +21,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -47,6 +46,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index d4b43ddef3601..6c97960a62d40 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -21,7 +21,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -47,6 +46,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index b0ae9f1e48473..c157d2e65c001 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -22,6 +22,5 @@ dependencies: # required - numpy - python-dateutil - - pytz - pip: - tzdata>=2022.7 diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml index 18535d81e6985..c86534871b3d2 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -21,7 +21,6 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -47,6 +46,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/ci/meta.yaml b/ci/meta.yaml index b76bef2f630b7..9d434991b12c1 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml 
@@ -37,7 +37,6 @@ requirements: - numpy >=1.21.6 # [py<311] - numpy >=1.23.2 # [py>=311] - python-dateutil >=2.8.2 - - pytz >=2020.1 - python-tzdata >=2022.7 test: diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 28129440b86d7..277f407ae4418 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -762,8 +762,7 @@ install pandas) by typing:: your installation is probably fine and you can start contributing! Often it is worth running only a subset of tests first around your changes before running the -entire suite (tip: you can use the `pandas-coverage app `_) -to find out which tests hit the lines of code you've modified, and then run only those). +entire suite. The easiest way to do this is with:: diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 0b8c1e16dce0e..e174eea00ca60 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -142,7 +142,7 @@ backticks. The following are considered inline code: With several mistakes in the docstring. - It has a blank like after the signature ``def func():``. + It has a blank line after the signature ``def func():``. The text 'Some function' should go in the line after the opening quotes of the docstring, not in the same line. diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 86ce05fde547b..8e6cb9e9a132d 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -205,7 +205,6 @@ Package Minimum support ================================================================ ========================== `NumPy `__ 1.23.5 `python-dateutil `__ 2.8.2 -`pytz `__ 2020.1 `tzdata `__ 2022.7 ================================================================ ========================== @@ -419,3 +418,14 @@ Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= Zstandard 0.19.0 compression Zstandard compression ========================= ================== =============== ============================================================= + +Timezone +^^^^^^^^ + +Installable with ``pip install "pandas[timezone]"`` + +========================= ================== =================== ============================================================= +Dependency Minimum Version pip extra Notes +========================= ================== =================== ============================================================= +pytz 2023.4 timezone Alternative timezone library to ``zoneinfo``. +========================= ================== =================== ============================================================= diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 0845417e4910d..4299dca4774b9 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2569,7 +2569,7 @@ Ambiguous times when localizing because daylight savings time (DST) in a local time zone causes some times to occur twice within one day ("clocks fall back"). 
The following options are available: -* ``'raise'``: Raises a ``pytz.AmbiguousTimeError`` (the default behavior) +* ``'raise'``: Raises a ``ValueError`` (the default behavior) * ``'infer'``: Attempt to determine the correct offset base on the monotonicity of the timestamps * ``'NaT'``: Replaces ambiguous times with ``NaT`` * ``bool``: ``True`` represents a DST time, ``False`` represents non-DST time. An array-like of ``bool`` values is supported for a sequence of times. @@ -2604,7 +2604,7 @@ A DST transition may also shift the local time ahead by 1 hour creating nonexist local times ("clocks spring forward"). The behavior of localizing a timeseries with nonexistent times can be controlled by the ``nonexistent`` argument. The following options are available: -* ``'raise'``: Raises a ``pytz.NonExistentTimeError`` (the default behavior) +* ``'raise'``: Raises a ``ValueError`` (the default behavior) * ``'NaT'``: Replaces nonexistent times with ``NaT`` * ``'shift_forward'``: Shifts nonexistent times forward to the closest real time * ``'shift_backward'``: Shifts nonexistent times backward to the closest real time diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3de65fe6f682c..f25edd39cf7da 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -50,8 +50,10 @@ Other enhancements - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) +- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) @@ -220,6 +222,8 @@ Optional libraries below the lowest tested version may still work, but are not c +------------------------+---------------------+ | Package | New Minimum Version | +========================+=====================+ +| pytz | 2023.4 | ++------------------------+---------------------+ | fastparquet | 2023.10.0 | +------------------------+---------------------+ | adbc-driver-postgresql | 0.10.0 | @@ -229,6 +233,37 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. +.. _whatsnew_300.api_breaking.pytz: + +``pytz`` now an optional dependency +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pandas now uses :py:mod:`zoneinfo` from the standard library as the default timezone implementation when passing a timezone +string to various methods. (:issue:`34916`) + +*Old behavior:* + +.. code-block:: ipython + + In [1]: ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific") In [2]: ts.tz + <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD> + +*New behavior:* + +..
ipython:: python + + ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific") + ts.tz + +``pytz`` timezone objects are still supported when passed directly, but they will no longer be returned by default +from string inputs. Moreover, ``pytz`` is no longer a required dependency of pandas, but can be installed +with the pip extra ``pip install pandas[timezone]``. + + +Additionally, pandas no longer throws ``pytz`` exceptions for timezone operations leading to ambiguous or nonexistent +times. These cases will now raise a ``ValueError``. + .. _whatsnew_300.api_breaking.other: Other API changes @@ -618,7 +653,9 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ +- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) +- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) @@ -631,6 +668,7 @@ ExtensionArray ^^^^^^^^^^^^^^ - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) +- Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) - Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`) Styler diff --git a/environment.yml b/environment.yml index e5646af07c45c..34bc0591ca8df 100644 --- a/environment.yml +++ b/environment.yml @@ -24,7 +24,6 @@ dependencies: # required dependencies - python-dateutil - numpy<2 - - pytz # optional dependencies - beautifulsoup4>=4.11.2 @@ -50,6 +49,7 @@ dependencies: - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.7 + - pytz>=2023.4 - pyxlsb>=1.0.10 - s3fs>=2022.11.0 - scipy>=1.10.0 diff --git a/pandas/__init__.py b/pandas/__init__.py index 3ee6f6abf97bf..05547e50bbb37 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -3,7 +3,7 @@ __docformat__ = "restructuredtext" # Let users know if they're missing any of our hard dependencies -_hard_dependencies = ("numpy", "pytz", "dateutil") +_hard_dependencies = ("numpy", "dateutil") _missing_dependencies = [] for _dependency in _hard_dependencies: diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 51794ec04b29e..4ed2d4c3be692 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -426,6 +426,11 @@ def option_context(*args) -> Generator[None, None, None]: None No return value. + Yields + ------ + None + No yield value. + See Also -------- get_option : Retrieve the value of the specified option. 
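For illustration (a minimal sketch, not part of the patch): the behavior change described in the whatsnew entry above, assuming pandas built from this branch on Python 3.9+, where ``zoneinfo`` is in the standard library:

    import zoneinfo

    import pandas as pd

    # Timezone strings now resolve to zoneinfo.ZoneInfo rather than a pytz zone
    ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific")
    assert isinstance(ts.tz, zoneinfo.ZoneInfo)

    # Nonexistent (and ambiguous) local times now raise ValueError instead of
    # pytz.NonExistentTimeError / pytz.AmbiguousTimeError; 02:30 on 2015-03-29
    # does not exist in Europe/Warsaw ("clocks spring forward")
    try:
        pd.Timestamp("2015-03-29 02:30:00").tz_localize("Europe/Warsaw")
    except ValueError as err:
        print(err)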
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 0fadbbbed2c72..a635dd33f8420 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -69,6 +69,7 @@ from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timezones cimport ( get_utcoffset, is_utc, + treat_tz_as_pytz, ) from pandas._libs.tslibs.tzconversion cimport ( Localizer, @@ -747,11 +748,17 @@ cdef datetime _localize_pydatetime(datetime dt, tzinfo tz): identically, i.e. discards nanos from Timestamps. It also assumes that the `tz` input is not None. """ - try: + if treat_tz_as_pytz(tz): + import pytz + # datetime.replace with pytz may be incorrect result # TODO: try to respect `fold` attribute - return tz.localize(dt, is_dst=None) - except AttributeError: + try: + return tz.localize(dt, is_dst=None) + except (pytz.AmbiguousTimeError, pytz.NonExistentTimeError) as err: + # As of pandas 3.0, we raise ValueErrors instead of pytz exceptions + raise ValueError(str(err)) from err + else: return dt.replace(tzinfo=tz) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 6ae5a96c428c2..3cb4dda1cd273 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -602,7 +602,24 @@ class NaTType(_NaT): utctimetuple = _make_error_func( "utctimetuple", """ - Return UTC time tuple, compatible with time.localtime(). + Return UTC time tuple, compatible with `time.localtime()`. + + This method converts the Timestamp to UTC and returns a time tuple + containing 9 components: year, month, day, hour, minute, second, + weekday, day of year, and DST flag. This is particularly useful for + converting a Timestamp to a format compatible with time module functions. + + Returns + ------- + time.struct_time + A time.struct_time object representing the UTC time. + + See Also + -------- + datetime.datetime.utctimetuple : + Return UTC time tuple, compatible with time.localtime(). + Timestamp.timetuple : Return time tuple of local time. + time.struct_time : Time tuple structure used by time functions. Examples -------- @@ -619,6 +636,22 @@ class NaTType(_NaT): """ Return utc offset. + This method returns the difference between UTC and the local time + as a `timedelta` object. It is useful for understanding the time + difference between the current timezone and UTC. + + Returns + ------- + timedelta + The difference between UTC and the local time as a `timedelta` object. + + See Also + -------- + datetime.datetime.utcoffset : + Standard library method to get the UTC offset of a datetime object. + Timestamp.tzname : Return the name of the timezone. + Timestamp.dst : Return the daylight saving time (DST) adjustment. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -633,6 +666,13 @@ class NaTType(_NaT): """ Return time zone name. + This method returns the name of the Timestamp's time zone as a string. + + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -772,6 +812,21 @@ class NaTType(_NaT): Construct a timezone-aware UTC datetime from a POSIX timestamp. + This method creates a timezone-aware UTC datetime object from a + POSIX timestamp. + + Parameters + ---------- + ts : float + POSIX timestamp.
+ + See Also + -------- + Timestamp.tzname : Return time zone name. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local + time from POSIX timestamp. + Notes ----- Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp @@ -1052,9 +1107,9 @@ class NaTType(_NaT): * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1065,7 +1120,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -1153,9 +1208,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1166,7 +1221,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises @@ -1248,9 +1303,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1261,7 +1316,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises @@ -1412,9 +1467,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1427,7 +1482,7 @@ default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -1439,6 +1494,13 @@ default 'raise' TypeError If the Timestamp is tz-aware and tz is not None. + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a naive timestamp object: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 554c4f109f1c5..c48acc07b34db 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -595,6 +595,24 @@ cdef class BaseOffset: @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.offsets.Hour().rule_code + 'h' + + >>> pd.offsets.Week(5).rule_code + 'W' + """ return self._prefix @cache_readonly @@ -602,6 +620,17 @@ cdef class BaseOffset: """ Return a string representing the frequency. + See Also + -------- + tseries.offsets.BusinessDay.freqstr : + Return a string representing an offset frequency in Business Days. + tseries.offsets.BusinessHour.freqstr : + Return a string representing an offset frequency in Business Hours. + tseries.offsets.Week.freqstr : + Return a string representing an offset frequency in Weeks. + tseries.offsets.Hour.freqstr : + Return a string representing an offset frequency in Hours. + Examples -------- >>> pd.DateOffset(5).freqstr @@ -779,6 +808,26 @@ cdef class BaseOffset: @property def nanos(self): + """ + Returns an integer of the total number of nanoseconds for fixed frequencies. + + Raises + ------ + ValueError + If the frequency is non-fixed. + + See Also + -------- + tseries.offsets.Hour.nanos : + Returns an integer of the total number of nanoseconds. + tseries.offsets.Day.nanos : + Returns an integer of the total number of nanoseconds. + + Examples + -------- + >>> pd.offsets.Week(n=1).nanos + Traceback (most recent call last): + ValueError: Week: weekday=None is a non-fixed frequency + """ raise ValueError(f"{self} is a non-fixed frequency") # ------------------------------------------------------------------ @@ -986,12 +1035,14 @@ cdef class Tick(SingleConstructorOffset): @property def nanos(self) -> int64_t: """ - Return an integer of the total number of nanoseconds. + Returns an integer of the total number of nanoseconds. - Raises - ------ - ValueError - If the frequency is non-fixed. + See Also + -------- + tseries.offsets.Hour.nanos : + Returns an integer of the total number of nanoseconds. + tseries.offsets.Day.nanos : + Returns an integer of the total number of nanoseconds.
Examples -------- @@ -1147,7 +1198,7 @@ cdef class Hour(Tick): """ Offset ``n`` hours. - Parameters + Attributes ---------- n : int, default 1 The number of hours represented. @@ -1183,7 +1234,7 @@ cdef class Minute(Tick): """ Offset ``n`` minutes. - Parameters + Attributes ---------- n : int, default 1 The number of minutes represented. @@ -1219,7 +1270,7 @@ cdef class Second(Tick): """ Offset ``n`` seconds. - Parameters + Attributes ---------- n : int, default 1 The number of seconds represented. @@ -1255,7 +1306,7 @@ cdef class Milli(Tick): """ Offset ``n`` milliseconds. - Parameters + Attributes ---------- n : int, default 1 The number of milliseconds represented. @@ -1292,7 +1343,7 @@ cdef class Micro(Tick): """ Offset ``n`` microseconds. - Parameters + Attributes ---------- n : int, default 1 The number of microseconds represented. @@ -1329,7 +1380,7 @@ cdef class Nano(Tick): """ Offset ``n`` nanoseconds. - Parameters + Attributes ---------- n : int, default 1 The number of nanoseconds represented. @@ -1616,7 +1667,7 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): Besides, adding a DateOffsets specified by the singular form of the date component can be used to replace certain component of the timestamp. - Parameters + Attributes ---------- n : int, default 1 The number of time periods the offset represents. @@ -2426,6 +2477,24 @@ cdef class WeekOfMonthMixin(SingleConstructorOffset): @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.offsets.Week(5).rule_code + 'W' + + >>> pd.offsets.WeekOfMonth(n=1, week=0, weekday=0).rule_code + 'WOM-1MON' + """ weekday = int_to_weekday.get(self.weekday, "") if self.week == -1: # LastWeekOfMonth @@ -2472,6 +2541,24 @@ cdef class YearOffset(SingleConstructorOffset): @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.tseries.offsets.YearBegin(n=1, month=2).rule_code + 'YS-FEB' + + >>> pd.tseries.offsets.YearEnd(n=1, month=6).rule_code + 'YE-JUN' + """ month = MONTH_ALIASES[self.month] return f"{self._prefix}-{month}" @@ -2506,7 +2593,7 @@ cdef class BYearEnd(YearOffset): """ DateOffset increments between the last business day of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. @@ -2804,7 +2891,7 @@ cdef class BQuarterBegin(QuarterOffset): startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ... startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2886,7 +2973,7 @@ cdef class QuarterBegin(QuarterOffset): startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ... startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2984,7 +3071,7 @@ cdef class MonthBegin(MonthOffset): MonthBegin goes to the next date which is a start of the month. 
- Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -3272,7 +3359,7 @@ cdef class SemiMonthBegin(SemiMonthOffset): """ Two DateOffset's per month repeating on the first day of the month & day_of_month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -3304,7 +3391,7 @@ cdef class Week(SingleConstructorOffset): """ Weekly offset. - Parameters + Attributes ---------- n : int, default 1 The number of weeks represented. @@ -3458,6 +3545,24 @@ cdef class Week(SingleConstructorOffset): @property def rule_code(self) -> str: + """ + Return a string representing the base frequency. + + See Also + -------- + tseries.offsets.Hour.rule_code : + Returns a string representing the base frequency of 'h'. + tseries.offsets.Day.rule_code : + Returns a string representing the base frequency of 'D'. + + Examples + -------- + >>> pd.offsets.Hour().rule_code + 'h' + + >>> pd.offsets.Week(5).rule_code + 'W' + """ suffix = "" if self.weekday is not None: weekday = int_to_weekday[self.weekday] @@ -3477,7 +3582,7 @@ cdef class WeekOfMonth(WeekOfMonthMixin): """ Describes monthly dates like "the Tuesday of the 2nd week of each month". - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -3554,7 +3659,7 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin): """ For example "the last Tuesday of each month". - Parameters + Attributes ---------- n : int, default 1 The number of months represented. @@ -3694,7 +3799,7 @@ cdef class FY5253(FY5253Mixin): """ X is a specific day of the week. Y is a certain month of the year - Parameters + Attributes ---------- n : int The number of fiscal years represented. @@ -3897,7 +4002,7 @@ cdef class FY5253Quarter(FY5253Mixin): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... - Parameters + Attributes ---------- n : int The number of business quarters represented. @@ -4132,7 +4237,7 @@ cdef class Easter(SingleConstructorOffset): Right now uses the revised method which is valid in years 1583-4099. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c6ba97fe9f1a2..4f5dfc75a20bf 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1913,20 +1913,58 @@ cdef class _Period(PeriodMixin): Parameters ---------- freq : str, BaseOffset - The desired frequency. If passing a `str`, it needs to be a - valid :ref:`period alias <timeseries.period_aliases>`. + The target frequency to convert the Period object to. + If a string is provided, + it must be a valid :ref:`period alias <timeseries.period_aliases>`. + how : {'E', 'S', 'end', 'start'}, default 'end' - Start or end of the timespan. + Specifies whether to align the period to the start or end of the interval: + - 'E' or 'end': Align to the end of the interval. + - 'S' or 'start': Align to the start of the interval. Returns ------- - resampled : Period + Period : Period object with the specified frequency, aligned to the start or end of the interval as specified by ``how``. + + See Also + -------- + Period.end_time : Return the end Timestamp. + Period.start_time : Return the start Timestamp. + Period.dayofyear : Return the day of the year. + Period.dayofweek : Return the day of the week.
        Examples
        --------
-        >>> period = pd.Period('2023-1-1', freq='D')
+        Convert a daily period to an hourly period, aligning to the end of the day:
+
+        >>> period = pd.Period('2023-01-01', freq='D')
         >>> period.asfreq('h')
         Period('2023-01-01 23:00', 'h')
+
+        Convert a monthly period to a daily period, aligning to the start of the month:
+
+        >>> period = pd.Period('2023-01', freq='M')
+        >>> period.asfreq('D', how='start')
+        Period('2023-01-01', 'D')
+
+        Convert a yearly period to a monthly period, aligning to the last month:
+
+        >>> period = pd.Period('2023', freq='Y')
+        >>> period.asfreq('M', how='end')
+        Period('2023-12', 'M')
+
+        Convert a monthly period to an hourly period,
+        aligning to the first day of the month:
+
+        >>> period = pd.Period('2023-01', freq='M')
+        >>> period.asfreq('h', how='start')
+        Period('2023-01-01 00:00', 'h')
+
+        Convert a weekly period to a daily period, aligning to the last day of the week:
+
+        >>> period = pd.Period('2023-08-01', freq='W')
+        >>> period.asfreq('D', how='end')
+        Period('2023-08-06', 'D')
         """
         freq = self._maybe_convert_freq(freq)
         how = validate_end_alias(how)
@@ -2000,11 +2038,44 @@ cdef class _Period(PeriodMixin):
         """
         Return the year this Period falls on.

+        Returns
+        -------
+        int
+
+        See Also
+        --------
+        Period.month : Get the month of the year for the given Period.
+        Period.day : Return the day of the month the Period falls on.
+
+        Notes
+        -----
+        The year is based on the `ordinal` and `base` attributes of the Period.
+
         Examples
         --------
-        >>> period = pd.Period('2022-01', 'M')
+        Create a Period object for January 2023 and get the year:
+
+        >>> period = pd.Period('2023-01', 'M')
         >>> period.year
-        2022
+        2023
+
+        Create a Period object for 1 January 2023 and get the year:
+
+        >>> period = pd.Period('2023', 'D')
+        >>> period.year
+        2023
+
+        Get the year for a period representing a quarter:
+
+        >>> period = pd.Period('2023Q2', 'Q')
+        >>> period.year
+        2023
+
+        Handle the case where the Period is missing (NaT), which results in `NaN`:
+
+        >>> period = pd.Period('nan', 'M')
+        >>> period.year
+        nan
         """
         base = self._dtype._dtype_code
         return pyear(self.ordinal, base)
@@ -2014,11 +2085,45 @@ cdef class _Period(PeriodMixin):
         """
         Return the month this Period falls on.

+        Returns
+        -------
+        int
+
+        See Also
+        --------
+        Period.week : Get the week of the year for the given Period.
+        Period.year : Return the year this Period falls on.
+        Period.day : Return the day of the month this Period falls on.
+
+        Notes
+        -----
+        The month is based on the `ordinal` and `base` attributes of the Period.
+
        Examples
        --------
+        Create a Period object for January 2022 and get the month:
+
        >>> period = pd.Period('2022-01', 'M')
        >>> period.month
        1
+
+        For a yearly Period, the month of the period's end (December) is returned:
+
+        >>> period = pd.Period('2022', 'Y')
+        >>> period.month
+        12
+
+        When only a year is given with a monthly frequency, the month defaults
+        to January:
+
+        >>> period = pd.Period('2022', 'M')
+        >>> period.month
+        1
+
+        Handle the case where the Period is missing (NaT), which results in `NaN`:
+
+        >>> period = pd.Period('nan', 'M')
+        >>> period.month
+        nan
        """
        base = self._dtype._dtype_code
        return pmonth(self.ordinal, base)
diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
index 43279051e2a30..ccb1a1d6870f7 100644
--- a/pandas/_libs/tslibs/strptime.pyx
+++ b/pandas/_libs/tslibs/strptime.pyx
@@ -16,6 +16,7 @@ FUNCTIONS:
 strptime -- Calculates the time struct represented by the passed-in string
 """
 from datetime import timezone
+import zoneinfo

 from cpython.datetime cimport (
     PyDate_Check,
@@ -38,7 +39,6 @@ from _thread import allocate_lock as _thread_allocate_lock
 import re

 import numpy as np
-import pytz

 cimport numpy as cnp
 from numpy cimport (
@@ -747,7 +747,7 @@ cdef tzinfo _parse_with_format(
             week_of_year_start = 0
         elif parse_code == 17:
             # e.g. val='2011-12-30T00:00:00.000000UTC'; fmt='%Y-%m-%dT%H:%M:%S.%f%Z'
-            tz = pytz.timezone(found_dict["Z"])
+            tz = zoneinfo.ZoneInfo(found_dict["Z"])
         elif parse_code == 19:
             # e.g. val='March 1, 2018 12:00:00+0400'; fmt='%B %d, %Y %H:%M:%S%z'
             tz = parse_timezone_directive(found_dict["z"])
@@ -837,7 +837,7 @@ class TimeRE(_TimeRE):
         if key == "Z":
             # lazy computation
             if self._Z is None:
-                self._Z = self.__seqToRE(pytz.all_timezones, "Z")
+                self._Z = self.__seqToRE(zoneinfo.available_timezones(), "Z")
             # Note: handling Z is the key difference vs using the stdlib
             # _strptime.TimeRE. test_to_datetime_parse_tzname_or_tzoffset with
             # fmt='%Y-%m-%d %H:%M:%S %Z' fails with the stdlib version.
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 7cb9c852ea1e3..1cbb24084a62b 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -254,6 +254,28 @@ cdef class _Timestamp(ABCTimestamp):
         """
         The abbreviation associated with self._creso.

+        This property returns a string abbreviation of the time unit of the
+        Timestamp's resolution, i.e. the smallest time unit that can be
+        represented by this Timestamp object.
+
+        Returns
+        -------
+        str
+            A string abbreviation of the Timestamp's resolution unit:
+            - 's' for second
+            - 'ms' for millisecond
+            - 'us' for microsecond
+            - 'ns' for nanosecond
+
+        See Also
+        --------
+        Timestamp.resolution : Return resolution of the Timestamp.
+        Timedelta : A duration expressing the difference between two dates or times.
+
         Examples
         --------
         >>> pd.Timestamp("2020-01-01 12:34:56").unit
@@ -1590,6 +1612,21 @@ class Timestamp(_Timestamp):

         Construct a timezone-aware UTC datetime from a POSIX timestamp.

+        This method constructs a timezone-aware UTC Timestamp from a POSIX
+        timestamp, interpreting the value as seconds since the epoch in UTC.
+
+        Parameters
+        ----------
+        ts : float
+            POSIX timestamp.
+
+        See Also
+        --------
+        Timestamp.tzname : Return time zone name.
+        Timestamp.utcnow : Return a new Timestamp representing UTC day and time.
+        Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local
+            time from POSIX timestamp.
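With ``pytz.timezone`` lookups replaced by ``zoneinfo.ZoneInfo`` in the strptime path above, a ``%Z`` name should now resolve to a stdlib zone. A sketch reusing the value/format from the source comment; the exact tzinfo repr is an assumption and tzdata must be available:

    >>> import pandas as pd
    >>> ts = pd.to_datetime('2011-12-30T00:00:00.000000UTC',
    ...                     format='%Y-%m-%dT%H:%M:%S.%f%Z')
    >>> ts.tzinfo
    zoneinfo.ZoneInfo(key='UTC')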
+
        Notes
        -----
        Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp
@@ -1765,6 +1802,13 @@ class Timestamp(_Timestamp):
         """
         Return time zone name.

+        This method returns the name of the Timestamp's time zone as a string.
+
+        See Also
+        --------
+        Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+        Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+
         Examples
         --------
         >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -1779,6 +1823,22 @@ class Timestamp(_Timestamp):
         """
         Return utc offset.

+        This method returns the difference between UTC and the local time
+        as a `timedelta` object. It is useful for understanding the time
+        difference between the Timestamp's timezone and UTC.
+
+        Returns
+        -------
+        timedelta
+            The difference between UTC and the local time as a `timedelta` object.
+
+        See Also
+        --------
+        datetime.datetime.utcoffset :
+            Standard library method to get the UTC offset of a datetime object.
+        Timestamp.tzname : Return the name of the timezone.
+        Timestamp.dst : Return the daylight saving time (DST) adjustment.
+
         Examples
         --------
         >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -1791,7 +1851,24 @@ class Timestamp(_Timestamp):

     def utctimetuple(self):
         """
-        Return UTC time tuple, compatible with time.localtime().
+        Return UTC time tuple, compatible with `time.localtime()`.
+
+        This method converts the Timestamp to UTC and returns a time tuple
+        containing 9 components: year, month, day, hour, minute, second,
+        weekday, day of year, and DST flag. This is particularly useful for
+        converting a Timestamp to a format compatible with time module functions.
+
+        Returns
+        -------
+        time.struct_time
+            A time.struct_time object representing the UTC time.
+
+        See Also
+        --------
+        datetime.datetime.utctimetuple :
+            Return UTC time tuple, compatible with time.localtime().
+        Timestamp.timetuple : Return time tuple of local time.
+        time.struct_time : Time tuple structure used by time functions.

         Examples
         --------
@@ -2134,9 +2211,9 @@ class Timestamp(_Timestamp):
               * bool contains flags to determine if time is dst or not (note
                 that this flag is only applicable for ambiguous fall dst dates).
               * 'NaT' will return NaT for an ambiguous time.
-              * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+              * 'raise' will raise a ValueError for an ambiguous time.

-        nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+        nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
             A nonexistent time does not exist in a particular timezone
             where clocks moved forward due to DST.
@@ -2147,7 +2224,7 @@ timedelta}, default 'raise'
               closest existing time.
             * 'NaT' will return NaT where there are nonexistent times.
             * timedelta objects will shift nonexistent times by the timedelta.
-            * 'raise' will raise an NonExistentTimeError if there are
+            * 'raise' will raise a ValueError if there are
               nonexistent times.

         Returns
@@ -2237,9 +2314,9 @@ timedelta}, default 'raise'
               * bool contains flags to determine if time is dst or not (note
                 that this flag is only applicable for ambiguous fall dst dates).
               * 'NaT' will return NaT for an ambiguous time.
-              * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+              * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2250,7 +2327,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises @@ -2332,9 +2409,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ + nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2345,7 +2422,7 @@ timedelta}, default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Raises @@ -2471,9 +2548,9 @@ timedelta}, default 'raise' * bool contains flags to determine if time is dst or not (note that this flag is only applicable for ambiguous fall dst dates). * 'NaT' will return NaT for an ambiguous time. - * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + * 'raise' will raise a ValueError for an ambiguous time. - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -2486,7 +2563,7 @@ default 'raise' closest existing time. * 'NaT' will return NaT where there are nonexistent times. * timedelta objects will shift nonexistent times by the timedelta. - * 'raise' will raise an NonExistentTimeError if there are + * 'raise' will raise a ValueError if there are nonexistent times. Returns @@ -2498,6 +2575,13 @@ default 'raise' TypeError If the Timestamp is tz-aware and tz is not None. + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. 
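Since the pytz exception types are dropped, ambiguous and nonexistent local times now surface as plain ``ValueError``. A sketch of the nonexistent case; the message text follows the wording added in tzconversion.pyx further below, and the exact rendering of the stamp is an assumption:

    >>> import pandas as pd
    >>> pd.Timestamp('2015-03-29 02:30:00').tz_localize('Europe/Warsaw')
    Traceback (most recent call last):
      ...
    ValueError: 2015-03-29 02:30:00 is a nonexistent time due to daylight savings time. Try using the 'nonexistent' argument.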
+ Examples -------- Create a naive timestamp object: diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 6292b6ce0fd1d..36b644ffc826d 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -2,17 +2,10 @@ from datetime import ( timedelta, timezone, ) +import zoneinfo from pandas.compat._optional import import_optional_dependency -try: - # py39+ - import zoneinfo - from zoneinfo import ZoneInfo -except ImportError: - zoneinfo = None - ZoneInfo = None - from cpython.datetime cimport ( datetime, timedelta, @@ -28,8 +21,8 @@ from dateutil.tz import ( tzutc as _dateutil_tzutc, ) import numpy as np -import pytz -from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo + +pytz = import_optional_dependency("pytz", errors="ignore") cimport numpy as cnp from numpy cimport int64_t @@ -45,10 +38,11 @@ from pandas._libs.tslibs.util cimport ( cdef int64_t NPY_NAT = get_nat() cdef tzinfo utc_stdlib = timezone.utc -cdef tzinfo utc_pytz = pytz.utc +cdef tzinfo utc_pytz = pytz.UTC if pytz else None cdef tzinfo utc_dateutil_str = dateutil_gettz("UTC") # NB: *not* the same as tzutc() cdef tzinfo utc_zoneinfo = None +cdef type ZoneInfo = zoneinfo.ZoneInfo # ---------------------------------------------------------------------- @@ -56,13 +50,13 @@ cdef tzinfo utc_zoneinfo = None cdef bint is_utc_zoneinfo(tzinfo tz): # Workaround for cases with missing tzdata # https://github.com/pandas-dev/pandas/pull/46425#discussion_r830633025 - if tz is None or zoneinfo is None: + if tz is None: return False global utc_zoneinfo if utc_zoneinfo is None: try: - utc_zoneinfo = ZoneInfo("UTC") + utc_zoneinfo = zoneinfo.ZoneInfo("UTC") except zoneinfo.ZoneInfoNotFoundError: return False # Warn if tzdata is too old, even if there is a system tzdata to alert @@ -74,17 +68,15 @@ cdef bint is_utc_zoneinfo(tzinfo tz): cpdef inline bint is_utc(tzinfo tz): return ( - tz is utc_pytz - or tz is utc_stdlib + tz is utc_stdlib or isinstance(tz, _dateutil_tzutc) or tz is utc_dateutil_str or is_utc_zoneinfo(tz) + or (utc_pytz is not None and tz is utc_pytz) ) cdef bint is_zoneinfo(tzinfo tz): - if ZoneInfo is None: - return False return isinstance(tz, ZoneInfo) @@ -166,7 +158,7 @@ cpdef inline tzinfo maybe_get_tz(object tz): elif tz == "UTC" or tz == "utc": tz = utc_stdlib else: - tz = pytz.timezone(tz) + tz = zoneinfo.ZoneInfo(tz) elif is_integer_object(tz): tz = timezone(timedelta(seconds=tz)) elif isinstance(tz, tzinfo): @@ -205,7 +197,7 @@ cdef object tz_cache_key(tzinfo tz): the same tz file). Also, pytz objects are not always hashable so we use str(tz) instead. 
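One user-visible effect of routing string zones through ``zoneinfo`` in ``maybe_get_tz``: a timezone given by name should now come back as a stdlib ``ZoneInfo`` rather than a pytz zone. A sketch, assuming tzdata is installed:

    >>> import pandas as pd
    >>> pd.Timestamp('2024-01-01', tz='US/Pacific').tz
    zoneinfo.ZoneInfo(key='US/Pacific')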
""" - if isinstance(tz, _pytz_BaseTzInfo): + if pytz is not None and isinstance(tz, pytz.tzinfo.BaseTzInfo): return tz.zone elif isinstance(tz, _dateutil_tzfile): if ".tar.gz" in tz._filename: @@ -239,7 +231,7 @@ cpdef inline bint is_fixed_offset(tzinfo tz): return 1 else: return 0 - elif treat_tz_as_pytz(tz): + elif treat_tz_as_pytz(tz) and pytz is not None: if (len(tz._transition_info) == 0 and len(tz._utc_transition_times) == 0): return 1 diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index e3facd3d9599b..c100f315e9a19 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -15,7 +15,6 @@ from cython cimport Py_ssize_t import_datetime() import numpy as np -import pytz cimport numpy as cnp from numpy cimport ( @@ -196,8 +195,8 @@ def tz_localize_to_utc( NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns, ): """ - Localize tzinfo-naive i8 to given time zone (using pytz). If - there are ambiguities in the values, raise AmbiguousTimeError. + Localize tzinfo-naive i8 to given time zone. If + there are ambiguities in the values, raise ValueError. Parameters ---------- @@ -368,7 +367,7 @@ timedelta-like} result[i] = NPY_NAT else: stamp = _render_tstamp(val, creso=creso) - raise pytz.AmbiguousTimeError( + raise ValueError( f"Cannot infer dst time from {stamp}, try using the " "'ambiguous' argument" ) @@ -428,7 +427,10 @@ timedelta-like} result[i] = NPY_NAT else: stamp = _render_tstamp(val, creso=creso) - raise pytz.NonExistentTimeError(stamp) + raise ValueError( + f"{stamp} is a nonexistent time due to daylight savings time. " + "Try using the 'nonexistent' argument." + ) return result.base # .base to get underlying ndarray @@ -631,7 +633,7 @@ cdef ndarray[int64_t] _get_dst_hours( if trans_idx.size == 1: # see test_tz_localize_to_utc_ambiguous_infer stamp = _render_tstamp(vals[trans_idx[0]], creso=creso) - raise pytz.AmbiguousTimeError( + raise ValueError( f"Cannot infer dst time from {stamp} as there " "are no repeated times" ) @@ -653,14 +655,16 @@ cdef ndarray[int64_t] _get_dst_hours( if grp.size == 1 or np.all(delta > 0): # see test_tz_localize_to_utc_ambiguous_infer stamp = _render_tstamp(vals[grp[0]], creso=creso) - raise pytz.AmbiguousTimeError(stamp) + raise ValueError( + f"{stamp} is an ambiguous time and cannot be inferred." + ) # Find the index for the switch and pull from a for dst and b # for standard switch_idxs = (delta <= 0).nonzero()[0] if switch_idxs.size > 1: # see test_tz_localize_to_utc_ambiguous_infer - raise pytz.AmbiguousTimeError( + raise ValueError( f"There are {switch_idxs.size} dst switches when " "there should only be 1." 
) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 288559d386a71..756c209661fbb 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -33,6 +33,7 @@ pa_version_under14p1, pa_version_under16p0, pa_version_under17p0, + pa_version_under18p0, ) if TYPE_CHECKING: @@ -157,6 +158,7 @@ def is_ci_environment() -> bool: "pa_version_under14p1", "pa_version_under16p0", "pa_version_under17p0", + "pa_version_under18p0", "HAS_PYARROW", "IS64", "ISMUSL", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 06082e71af32a..6b90389a62056 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -43,6 +43,7 @@ "pyreadstat": "1.2.0", "pytest": "7.3.2", "python-calamine": "0.1.7", + "pytz": "2023.4", "pyxlsb": "1.0.10", "s3fs": "2022.11.0", "scipy": "1.10.0", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index ebfc0d69d9655..bd009b544f31e 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,6 +17,7 @@ pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") + pa_version_under18p0 = _palv < Version("18.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -28,4 +29,5 @@ pa_version_under15p0 = True pa_version_under16p0 = True pa_version_under17p0 = True + pa_version_under18p0 = True HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 7c485515f0784..d11213f1164bc 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -32,7 +32,10 @@ import gc import operator import os -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) import uuid from dateutil.tz import ( @@ -43,11 +46,8 @@ from hypothesis import strategies as st import numpy as np import pytest -from pytz import ( - FixedOffset, - utc, -) +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -92,12 +92,7 @@ del pa has_pyarrow = True -import zoneinfo - -try: - zoneinfo.ZoneInfo("UTC") -except zoneinfo.ZoneInfoNotFoundError: - zoneinfo = None # type: ignore[assignment] +pytz = import_optional_dependency("pytz", errors="ignore") # ---------------------------------------------------------------- @@ -1199,19 +1194,19 @@ def deco(*args): "UTC-02:15", tzutc(), tzlocal(), - FixedOffset(300), - FixedOffset(0), - FixedOffset(-300), timezone.utc, timezone(timedelta(hours=1)), timezone(timedelta(hours=-1), name="foo"), ] -if zoneinfo is not None: +if pytz is not None: TIMEZONES.extend( - [ - zoneinfo.ZoneInfo("US/Pacific"), # type: ignore[list-item] - zoneinfo.ZoneInfo("UTC"), # type: ignore[list-item] - ] + ( + pytz.FixedOffset(300), + pytz.FixedOffset(0), + pytz.FixedOffset(-300), + pytz.timezone("US/Pacific"), + pytz.timezone("UTC"), + ) ) TIMEZONE_IDS = [repr(i) for i in TIMEZONES] @@ -1234,9 +1229,10 @@ def tz_aware_fixture(request): return request.param -_UTCS = ["utc", "dateutil/UTC", utc, tzutc(), timezone.utc] -if zoneinfo is not None: - _UTCS.append(zoneinfo.ZoneInfo("UTC")) +_UTCS = ["utc", "dateutil/UTC", tzutc(), timezone.utc] + +if pytz is not None: + _UTCS.append(pytz.utc) @pytest.fixture(params=_UTCS) @@ -2046,12 +2042,12 @@ def using_infer_string() -> bool: return pd.options.future.infer_string is True -warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] -if zoneinfo is not None: - warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] +_warsaws: 
list[Any] = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] +if pytz is not None: + _warsaws.append(pytz.timezone("Europe/Warsaw")) -@pytest.fixture(params=warsaws) +@pytest.fixture(params=_warsaws) def warsaw(request) -> str: """ tzinfo for Europe/Warsaw using pytz, dateutil, or zoneinfo. diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 948836bf6a51d..56f8adda93251 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1529,9 +1529,7 @@ def safe_sort( order2 = sorter.argsort() if verify: mask = (codes < -len(values)) | (codes >= len(values)) - codes[mask] = 0 - else: - mask = None + codes[mask] = -1 new_codes = take_nd(order2, codes, fill_value=-1) else: reverse_indexer = np.empty(len(sorter), dtype=int) @@ -1540,14 +1538,6 @@ def safe_sort( # may deal with them here without performance loss using `mode='wrap'` new_codes = reverse_indexer.take(codes, mode="wrap") - if use_na_sentinel: - mask = codes == -1 - if verify: - mask = mask | (codes < -len(values)) | (codes >= len(values)) - - if use_na_sentinel and mask is not None: - np.putmask(new_codes, mask, -1) - return ordered, ensure_platform_int(new_codes) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 5c933294fb944..b2f78182b9bf0 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -94,9 +94,9 @@ def quantile_with_mask( flat = np.array([fill_value] * len(qs)) result = np.repeat(flat, len(values)).reshape(len(values), len(qs)) else: - result = _nanpercentile( + result = _nanquantile( values, - qs * 100.0, + qs, na_value=fill_value, mask=mask, interpolation=interpolation, @@ -108,7 +108,7 @@ def quantile_with_mask( return result -def _nanpercentile_1d( +def _nanquantile_1d( values: np.ndarray, mask: npt.NDArray[np.bool_], qs: npt.NDArray[np.float64], @@ -116,7 +116,7 @@ def _nanpercentile_1d( interpolation: str, ) -> Scalar | np.ndarray: """ - Wrapper for np.percentile that skips missing values, specialized to + Wrapper for np.quantile that skips missing values, specialized to 1-dimensional case. Parameters @@ -142,7 +142,7 @@ def _nanpercentile_1d( # equiv: 'np.array([na_value] * len(qs))' but much faster return np.full(len(qs), na_value) - return np.percentile( + return np.quantile( values, qs, # error: No overload variant of "percentile" matches argument @@ -152,7 +152,7 @@ def _nanpercentile_1d( ) -def _nanpercentile( +def _nanquantile( values: np.ndarray, qs: npt.NDArray[np.float64], *, @@ -161,7 +161,7 @@ def _nanpercentile( interpolation: str, ): """ - Wrapper for np.percentile that skips missing values. + Wrapper for np.quantile that skips missing values. 
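As an aside on the rename: it tracks the underlying switch from ``np.percentile`` to ``np.quantile``, which takes fractions in [0, 1] instead of percentages (hence the ``qs * 100.0`` scaling dropped above). The two NumPy functions are otherwise equivalent:

    >>> import numpy as np
    >>> a = np.array([1.0, 2.0, 3.0, 4.0])
    >>> float(np.quantile(a, 0.25)) == float(np.percentile(a, 25))
    True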
Parameters ---------- @@ -180,7 +180,7 @@ def _nanpercentile( if values.dtype.kind in "mM": # need to cast to integer to avoid rounding errors in numpy - result = _nanpercentile( + result = _nanquantile( values.view("i8"), qs=qs, na_value=na_value.view("i8"), @@ -196,7 +196,7 @@ def _nanpercentile( # Caller is responsible for ensuring mask shape match assert mask.shape == values.shape result = [ - _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation) + _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation) for (val, m) in zip(list(values), list(mask)) ] if values.dtype.kind == "f": @@ -215,7 +215,7 @@ def _nanpercentile( result = result.astype(values.dtype, copy=False) return result else: - return np.percentile( + return np.quantile( values, qs, axis=1, diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d07bfeda50e1d..e95fa441e18fb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -709,7 +709,13 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: if isinstance( other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): - result = pc_func(self._pa_array, self._box_pa(other)) + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except pa.ArrowNotImplementedError: + # TODO: could this be wrong if other is object dtype? + # in which case we need to operate pointwise? + result = ops.invalid_comparison(self, other, op) + result = pa.array(result, type=pa.bool_()) elif is_scalar(other): try: result = pc_func(self._pa_array, self._box_pa(other)) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ad0bde3abbdd4..fbe1677b95b33 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -19,6 +19,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.config import get_option from pandas._libs import ( @@ -1759,6 +1760,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: dtype='object') """ result = self._format_native_types(date_format=date_format, na_rep=np.nan) + if using_string_dtype(): + from pandas import StringDtype + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result.astype(object, copy=False) @@ -1781,7 +1786,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: a non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous + - 'raise' will raise a ValueError if there are ambiguous times. nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' @@ -1794,7 +1799,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: closest existing time - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are + - 'raise' will raise a ValueError if there are nonexistent times. 
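A sketch of how the ``strftime`` change above is expected to surface once the ``future.infer_string`` option is enabled; an isinstance check is used rather than a dtype repr (which may vary), and the Index-level behaviour also relies on the indexes/datetimes.py change later in this patch:

    >>> import pandas as pd
    >>> pd.set_option('future.infer_string', True)
    >>> idx = pd.date_range('2023-01-01', periods=2)
    >>> isinstance(idx.strftime('%Y-%m-%d').dtype, pd.StringDtype)
    True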
Returns diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index dddfc440109d3..201c449185057 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -15,6 +15,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.config import get_option from pandas._libs import ( @@ -158,15 +159,8 @@ def f(self): # these return a boolean by-definition return result - if field in self._object_ops: - result = fields.get_date_name_field(values, field, reso=self._creso) - result = self._maybe_mask_results(result, fill_value=None) - - else: - result = fields.get_date_field(values, field, reso=self._creso) - result = self._maybe_mask_results( - result, fill_value=None, convert="float64" - ) + result = fields.get_date_field(values, field, reso=self._creso) + result = self._maybe_mask_results(result, fill_value=None, convert="float64") return result @@ -243,7 +237,6 @@ def _scalar_type(self) -> type[Timestamp]: "is_year_end", "is_leap_year", ] - _object_ops: list[str] = ["freq", "tz"] _field_ops: list[str] = [ "year", "month", @@ -264,7 +257,7 @@ def _scalar_type(self) -> type[Timestamp]: ] _other_ops: list[str] = ["date", "time", "timetz"] _datetimelike_ops: list[str] = ( - _field_ops + _object_ops + _bool_ops + _other_ops + ["unit"] + _field_ops + _bool_ops + _other_ops + ["unit", "freq", "tz"] ) _datetimelike_methods: list[str] = [ "to_period", @@ -972,7 +965,7 @@ def tz_localize( non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous + - 'raise' will raise a ValueError if there are ambiguous times. nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ @@ -986,7 +979,7 @@ def tz_localize( closest existing time - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are + - 'raise' will raise a ValueError if there are nonexistent times. 
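For reference, the non-raising alternatives documented above keep working as before; a sketch of ``nonexistent='shift_forward'`` across a spring-forward gap:

    >>> import pandas as pd
    >>> pd.Timestamp('2015-03-29 02:30:00').tz_localize(
    ...     'Europe/Warsaw', nonexistent='shift_forward')
    Timestamp('2015-03-29 03:00:00+0200', tz='Europe/Warsaw')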
Returns @@ -1340,6 +1333,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: values, "month_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result def day_name(self, locale=None) -> npt.NDArray[np.object_]: @@ -1401,6 +1401,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: values, "day_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + # TODO: no tests that check for dtype of result as of 2024-08-15 + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result @property diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 07eb91e0cb13b..03712f75db0c7 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -557,7 +557,3 @@ def _wrap_ndarray_result(self, result: np.ndarray): return TimedeltaArray._simple_new(result, dtype=result.dtype) return type(self)(result) - - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8a4fd9fc1b34d..823084c3e9982 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -140,12 +140,16 @@ def __init__( # infer defaults if storage is None: if na_value is not libmissing.NA: - if HAS_PYARROW: - storage = "pyarrow" - else: - storage = "python" + storage = get_option("mode.string_storage") + if storage == "auto": + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" else: storage = get_option("mode.string_storage") + if storage == "auto": + storage = "python" if storage == "pyarrow_numpy": # TODO raise a deprecation warning @@ -346,6 +350,55 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if self.dtype.na_value is np.nan: + return self._str_map_nan_semantics(f, na_value=na_value, dtype=dtype) + + from pandas.arrays import BooleanArray + + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray | BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + elif dtype == np.dtype("bool"): + # GH#55736 + na_value = bool(na_value) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Argument 1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(cast(type, dtype)), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + def _str_map_str_or_object( self, dtype, @@ -353,7 +406,6 @@ def _str_map_str_or_object( arr: np.ndarray, f, mask: npt.NDArray[np.bool_], - 
convert: bool, ): # _str_map helper for case where dtype is either string dtype or object if is_string_dtype(dtype) and not is_object_dtype(dtype): @@ -377,6 +429,45 @@ def _str_map_str_or_object( # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) + def _str_map_nan_semantics(self, f, na_value=None, dtype: Dtype | None = None): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + na_value = True + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and mask.any(): + # TODO: we could alternatively do this check before map_infer_mask + # and adjust the dtype/na_value we pass there. Which is more + # performant? + if is_integer_dtype(dtype): + result = result.astype("float64") + else: + result = result.astype("object") + result[mask] = np.nan + return result + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -655,6 +746,12 @@ def _reduce( axis: AxisInt | None = 0, **kwargs, ): + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + if name in ["min", "max"]: result = getattr(self, name)(skipna=skipna, axis=axis) if keepdims: @@ -663,6 +760,12 @@ def _reduce( raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + if self.dtype.na_value is np.nan and result is libmissing.NA: + # the masked_reductions use pd.NA -> convert to np.nan + return np.nan + return super()._wrap_reduction_result(axis, result) + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) result = masked_reductions.min( @@ -680,8 +783,11 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts - result = value_counts(self._ndarray, sort=False, dropna=dropna).astype("Int64") + result = value_counts(self._ndarray, sort=False, dropna=dropna) result.index = result.index.astype(self.dtype) + + if self.dtype.na_value is libmissing.NA: + result = result.astype("Int64") return result def memory_usage(self, deep: bool = False) -> int: @@ -732,104 +838,15 @@ def _cmp_method(self, other, op): # logical result = np.zeros(len(self._ndarray), dtype="bool") result[valid] = op(self._ndarray[valid], other) - return BooleanArray(result, mask) - - _arith_method = _cmp_method - - # ------------------------------------------------------------------------ - # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "NumpyExtensionArray" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - - def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if 
na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - convert = convert and not np.all(mask) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - na_value_is_na = isna(na_value) - if na_value_is_na: - if is_integer_dtype(dtype): - na_value = 0 + res_arr = BooleanArray(result, mask) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return res_arr.to_numpy(np.bool_, na_value=True) else: - na_value = True + return res_arr.to_numpy(np.bool_, na_value=False) + return res_arr - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - if na_value_is_na and mask.any(): - if is_integer_dtype(dtype): - result = result.astype("float64") - else: - result = result.astype("object") - result[mask] = np.nan - return result - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if self.dtype.na_value is np.nan: - return self._str_map_nan_semantics( - f, na_value=na_value, dtype=dtype, convert=convert - ) - - from pandas.arrays import BooleanArray - - if dtype is None: - dtype = StringDtype(storage="python") - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - elif dtype == np.dtype("bool"): - na_value = bool(na_value) - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(cast(type, dtype)), - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + _arith_method = _cmp_method class StringArrayNumpySemantics(StringArray): @@ -861,38 +878,3 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: # need to override NumpyExtensionArray._from_backing_data to ensure # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) - - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if name == "any": - return nanops.nanany(self._ndarray, skipna=skipna) - else: - return nanops.nanall(self._ndarray, skipna=skipna) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) - - def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: - # the masked_reductions use pd.NA - if result is libmissing.NA: - return np.nan - return super()._wrap_reduction_result(axis, result) - - def _cmp_method(self, other, op): - result = super()._cmp_method(other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas.core.algorithms import value_counts_internal as value_counts - - result = value_counts(self._ndarray, sort=False, dropna=dropna) - result.index = result.index.astype(self.dtype) - return result - - # 
------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4893883d3ad12..67114815341b6 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,12 +1,10 @@ from __future__ import annotations -from functools import partial import operator import re from typing import ( TYPE_CHECKING, Union, - cast, ) import numpy as np @@ -23,8 +21,6 @@ ) from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, is_scalar, pandas_dtype, ) @@ -39,7 +35,6 @@ BaseStringArray, StringDtype, ) -from pandas.core.ops import invalid_comparison from pandas.core.strings.object_array import ObjectStringArrayMixin if not pa_version_under10p1: @@ -133,18 +128,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr def __init__(self, values) -> None: _chk_pyarrow_available() - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type + if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( + pa.types.is_string(values.type) + or ( + pa.types.is_dictionary(values.type) + and ( + pa.types.is_string(values.type.value_type) + or pa.types.is_large_string(values.type.value_type) + ) + ) ): values = pc.cast(values, pa.large_string()) super().__init__(values) self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): + if not pa.types.is_large_string(self._pa_array.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of " "large_string type" @@ -216,12 +215,17 @@ def dtype(self) -> StringDtype: # type: ignore[override] return self._dtype def insert(self, loc: int, item) -> ArrowStringArray: + if self.dtype.na_value is np.nan and item is np.nan: + item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - @classmethod - def _result_converter(cls, values, na=None): + def _result_converter(self, values, na=None): + if self.dtype.na_value is np.nan: + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -275,102 +279,7 @@ def astype(self, dtype, copy: bool = True): # ------------------------------------------------------------------------ # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "ObjectStringArrayMixin" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - - def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - if is_integer_dtype(dtype): - na_value = np.nan - else: - na_value = False - - dtype = np.dtype(cast(type, dtype)) - if mask.any(): - # numpy int/bool dtypes cannot hold NaNs so we must convert to - # float64 for int (to match maybe_convert_objects) or - # object for bool (again to match maybe_convert_objects) - if 
is_integer_dtype(dtype): - dtype = np.dtype("float64") - else: - dtype = np.dtype(object) - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=dtype, - ) - return result - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if self.dtype.na_value is np.nan: - return self._str_map_nan_semantics( - f, na_value=na_value, dtype=dtype, convert=convert - ) - - # TODO: de-duplicate with StringArray method. This method is moreless copy and - # paste. - - from pandas.arrays import ( - BooleanArray, - IntegerArray, - ) - - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(cast(type, dtype)), - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + _str_map = BaseStringArray._str_map def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -587,11 +496,30 @@ def _str_get_dummies(self, sep: str = "|"): return dummies.astype(np.int64, copy=False), labels def _convert_int_dtype(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + return Int64Dtype().__from_arrow__(result) def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if not skipna: + nas = pc.is_null(self._pa_array) + arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) if name in ("argmin", "argmax") and isinstance(result, pa.Array): return self._convert_int_dtype(result) @@ -622,70 +550,31 @@ def _rank( ) ) - -class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow" - _na_value = np.nan - - @classmethod - def _result_converter(cls, values, na=None): - if not isna(na): - values = values.fill_null(bool(na)) - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) - - def __getattribute__(self, item): - # ArrowStringArray and we both inherit from ArrowExtensionArray, which - # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item not in ( - "_pa_array", - "__dict__", - ): - return partial(getattr(ArrowStringArrayMixin, item), self) - return super().__getattribute__(item) - - def _convert_int_dtype(self, result): - if isinstance(result, pa.Array): - result = 
result.to_numpy(zero_copy_only=False) - else: - result = result.to_numpy() - if result.dtype == np.int32: - result = result.astype(np.int64) + def value_counts(self, dropna: bool = True) -> Series: + result = super().value_counts(dropna=dropna) + if self.dtype.na_value is np.nan: + res_values = result._values.to_numpy() + return result._constructor( + res_values, index=result.index, name=result.name, copy=False + ) return result def _cmp_method(self, other, op): - try: - result = super()._cmp_method(other, op) - except pa.ArrowNotImplementedError: - return invalid_comparison(self, other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas import Series - - result = super().value_counts(dropna) - return Series( - result._values.to_numpy(), index=result.index, name=result.name, copy=False - ) - - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if not skipna: - nas = pc.is_null(self._pa_array) - arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) + result = super()._cmp_method(other, op) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) else: - arr = pc.not_equal(self._pa_array, "") - return ArrowExtensionArray(arr)._reduce( - name, skipna=skipna, keepdims=keepdims, **kwargs - ) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + return result.to_numpy(np.bool_, na_value=False) + return result - def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: - if item is np.nan: - item = libmissing.NA - return super().insert(loc, item) # type: ignore[return-value] + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _na_value = np.nan + _str_get = ArrowStringArrayMixin._str_get + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_pad = ArrowStringArrayMixin._str_pad + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 83cc2871f5459..b2cfbe7338c0d 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -152,9 +152,8 @@ def _scalar_type(self) -> type[Timedelta]: # define my properties & methods for delegation _other_ops: list[str] = [] _bool_ops: list[str] = [] - _object_ops: list[str] = ["freq"] _field_ops: list[str] = ["days", "seconds", "microseconds", "nanoseconds"] - _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + ["unit"] + _datetimelike_ops: list[str] = _field_ops + _bool_ops + ["unit", "freq"] _datetimelike_methods: list[str] = [ "to_pytimedelta", "total_seconds", diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index e62cda0dfe8d0..e4eefb570fd95 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -452,13 +452,12 @@ def is_terminal() -> bool: string_storage_doc = """ : string - The default storage for StringDtype. This option is ignored if - ``future.infer_string`` is set to True. + The default storage for StringDtype. 
""" def is_valid_string_storage(value: Any) -> None: - legal_values = ["python", "pyarrow"] + legal_values = ["auto", "python", "pyarrow"] if value not in legal_values: msg = "Value must be one of python|pyarrow" if value == "pyarrow_numpy": @@ -473,7 +472,7 @@ def is_valid_string_storage(value: Any) -> None: with cf.config_prefix("mode"): cf.register_option( "string_storage", - "python", + "auto", string_storage_doc, # validator=is_one_of_factory(["python", "pyarrow"]), validator=is_valid_string_storage, diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 162f6a4d30f3f..3394bf091e228 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1014,10 +1014,8 @@ def convert_dtypes( Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). + * ``"numpy_nullable"``: returns nullable-dtype * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. .. versionadded:: 2.0 @@ -1025,6 +1023,8 @@ def convert_dtypes( ------- np.dtype, or ExtensionDtype """ + from pandas.core.arrays.string_ import StringDtype + inferred_dtype: str | DtypeObj if ( @@ -1103,6 +1103,13 @@ def convert_dtypes( # If we couldn't do anything else, then we retain the dtype inferred_dtype = input_array.dtype + elif ( + convert_string + and isinstance(input_array.dtype, StringDtype) + and input_array.dtype.na_value is np.nan + ): + inferred_dtype = pandas_dtype_func("string") + else: inferred_dtype = input_array.dtype diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 64b5278424192..bcf1ade9b0320 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1274,6 +1274,10 @@ def is_bool_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a boolean dtype. + This function verifies whether a given object is a boolean data type. The input + can be an array or a dtype object. Accepted array types include instances + of ``np.array``, ``pd.Series``, ``pd.Index``, and similar array-like structures. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -1284,6 +1288,10 @@ def is_bool_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a boolean dtype. + See Also + -------- + api.types.is_bool : Check if an object is a boolean. + Notes ----- An ExtensionArray is considered boolean when the ``_is_boolean`` diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 3aeab96e03163..c0587d36bcb5a 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -18,9 +18,9 @@ cast, ) import warnings +import zoneinfo import numpy as np -import pytz from pandas._config.config import get_option @@ -789,7 +789,7 @@ def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None: tz = timezones.maybe_get_tz(tz) tz = timezones.tz_standardize(tz) elif tz is not None: - raise pytz.UnknownTimeZoneError(tz) + raise zoneinfo.ZoneInfoNotFoundError(tz) if tz is None: raise TypeError("A 'tz' is required.") @@ -882,7 +882,7 @@ def construct_from_string(cls, string: str_type) -> DatetimeTZDtype: return cls(unit=d["unit"], tz=d["tz"]) except (KeyError, TypeError, ValueError) as err: # KeyError if maybe_get_tz tries and fails to get a - # pytz timezone (actually pytz.UnknownTimeZoneError). + # zoneinfo timezone (actually zoneinfo.ZoneInfoNotFoundError). 
# TypeError if we pass a nonsense tz; # ValueError if we pass a unit other than "ns" raise TypeError(msg) from err diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b8039746d9952..1e6608b0d87f3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6406,7 +6406,7 @@ def dropna( thresh : int, optional Require that many non-NA values. Cannot be combined with how. - subset : column label or sequence of labels, optional + subset : column label or iterable of labels, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. inplace : bool, default False @@ -6536,7 +6536,7 @@ def dropna( @overload def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = ..., + subset: Hashable | Iterable[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: Literal[True], @@ -6546,7 +6546,7 @@ def drop_duplicates( @overload def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = ..., + subset: Hashable | Iterable[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: Literal[False] = ..., @@ -6556,7 +6556,7 @@ def drop_duplicates( @overload def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = ..., + subset: Hashable | Iterable[Hashable] | None = ..., *, keep: DropKeep = ..., inplace: bool = ..., @@ -6565,7 +6565,7 @@ def drop_duplicates( def drop_duplicates( self, - subset: Hashable | Sequence[Hashable] | None = None, + subset: Hashable | Iterable[Hashable] | None = None, *, keep: DropKeep = "first", inplace: bool = False, @@ -6579,7 +6579,7 @@ def drop_duplicates( Parameters ---------- - subset : column label or sequence of labels, optional + subset : column label or iterable of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', ``False``}, default 'first' @@ -6669,7 +6669,7 @@ def drop_duplicates( def duplicated( self, - subset: Hashable | Sequence[Hashable] | None = None, + subset: Hashable | Iterable[Hashable] | None = None, keep: DropKeep = "first", ) -> Series: """ @@ -6679,7 +6679,7 @@ def duplicated( Parameters ---------- - subset : column label or sequence of labels, optional + subset : column label or iterable of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : {'first', 'last', False}, default 'first' @@ -6771,10 +6771,7 @@ def f(vals) -> tuple[np.ndarray, int]: return labels.astype("i8"), len(shape) if subset is None: - # https://github.com/pandas-dev/pandas/issues/28770 - # Incompatible types in assignment (expression has type "Index", variable - # has type "Sequence[Any]") - subset = self.columns # type: ignore[assignment] + subset = self.columns elif ( not np.iterable(subset) or isinstance(subset, str) @@ -6795,7 +6792,7 @@ def f(vals) -> tuple[np.ndarray, int]: if len(subset) == 1 and self.columns.is_unique: # GH#45236 This is faster than get_group_index below - result = self[subset[0]].duplicated(keep) + result = self[next(iter(subset))].duplicated(keep) result.name = None else: vals = (col.values for name, col in self.items() if name in subset) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8a6fc69d47cc3..0f0078fc3398b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6670,10 +6670,10 @@ def convert_dtypes( Back-end data type applied to the resultant :class:`DataFrame` or :class:`Series` (still experimental). 
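Before the behaviour list that follows, a usage sketch of the ``dtype_backend`` keyword with the default backend:

    >>> import pandas as pd
    >>> df = pd.DataFrame({'a': [1, 2, None]})
    >>> df.convert_dtypes(dtype_backend='numpy_nullable').dtypes
    a    Int64
    dtype: object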
Behaviour is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - or :class:`Series` (default). + * ``"numpy_nullable"``: returns nullable-dtype-backed + :class:`DataFrame` or :class:`Series`. * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame or Series. + :class:`DataFrame` or :class:`Series`. .. versionadded:: 2.0 @@ -10570,7 +10570,7 @@ def tz_localize( a non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous + - 'raise' will raise a ValueError if there are ambiguous times. nonexistent : str, default 'raise' A nonexistent time does not exist in a particular timezone @@ -10582,7 +10582,7 @@ closest existing time - 'NaT' will return NaT where there are nonexistent times - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are + - 'raise' will raise a ValueError if there are nonexistent times. Returns diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 00a929724ed4c..3b3cda8f7cd33 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -6,7 +6,6 @@ import warnings import numpy as np -import pytz from pandas._libs import ( NaT, @@ -162,7 +161,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): non-DST time (note that this flag is only applicable for ambiguous times) - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. + - 'raise' will raise a ValueError if there are ambiguous times. dayfirst : bool, default False If True, parse dates in `data` with the day first order.
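
Reviewer note: the docstring updates above track the pytz removal; ambiguous and nonexistent local times are now documented to raise the stdlib ``ValueError``. A sketch of the documented behaviour (02:30 occurs twice on this date in this zone, so localizing it is ambiguous)::

    import pandas as pd

    ts = pd.Timestamp("2018-10-28 02:30:00")
    try:
        ts.tz_localize("Europe/Berlin")  # ambiguous='raise' is the default
    except ValueError as err:
        print(err)
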
yearfirst : bool, default False @@ -264,7 +263,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]: @doc(DatetimeArray.strftime) def strftime(self, date_format) -> Index: arr = self._data.strftime(date_format) - return Index(arr, name=self.name, dtype=object) + return Index(arr, name=self.name, dtype=arr.dtype) @doc(DatetimeArray.tz_convert) def tz_convert(self, tz) -> Self: @@ -591,7 +590,7 @@ def get_loc(self, key): elif isinstance(key, str): try: parsed, reso = self._parse_with_reso(key) - except (ValueError, pytz.NonExistentTimeError) as err: + except ValueError as err: raise KeyError(key) from err self._disallow_mismatched_indexing(parsed) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 48d5e59250f35..2eeacfb769be4 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -74,7 +74,7 @@ def fget(self): return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result def fset(self, value) -> None: @@ -101,7 +101,7 @@ def method(self, *args, **kwargs): # type: ignore[misc] return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result # error: "property" has no attribute "__name__" diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0900121ab717f..c3d4ad721c830 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1636,6 +1636,17 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: doc=""" Names of levels in MultiIndex. + This attribute provides access to the names of the levels in a `MultiIndex`. + The names are stored as a `FrozenList`, which is an immutable list-like + container. Each name corresponds to a level in the `MultiIndex`, and can be + used to identify or manipulate the levels individually. + + See Also + -------- + MultiIndex.set_names : Set Index or MultiIndex name. + MultiIndex.rename : Rename specific levels in a MultiIndex. + Index.names : Get names on index. + Examples -------- >>> mi = pd.MultiIndex.from_arrays( @@ -2681,8 +2692,15 @@ def sortlevel( """ Sort MultiIndex at the requested level. - The result will respect the original ordering of the associated - factor at that level. + This method is useful when dealing with MultiIndex objects, allowing for + sorting at a specific level of the index. The function preserves the + relative ordering of data within the same level while sorting + the overall MultiIndex. The method provides flexibility with the `ascending` + parameter to define the sort order and with the `sort_remaining` parameter to + control whether the remaining levels should also be sorted. Sorting a + MultiIndex can be crucial when performing operations that require ordered + indices, such as grouping or merging datasets. The `na_position` argument is + important in handling missing values consistently across different levels. Parameters ---------- @@ -2692,7 +2710,9 @@ def sortlevel( ascending : bool, default True False to sort in descending order. Can also be a list to specify a directed ordering. 
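
Reviewer note: the ``dtype=arr.dtype`` / ``dtype=result.dtype`` changes above stop coercing wrapped results to ``object``; the test updates later in this diff expect ``strftime`` to come back as the NaN-variant string dtype once ``future.infer_string`` is active. Roughly::

    import pandas as pd

    idx = pd.date_range("2024-01-01", periods=2)
    out = idx.strftime("%Y-%m-%d")
    # Previously always object dtype; now the Index keeps whatever dtype the
    # underlying array produced (a string dtype under future.infer_string).
    print(out.dtype)
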
- sort_remaining : sort by the remaining levels after level + sort_remaining : bool, default True + If True, sorts by the remaining levels after sorting by the specified + `level`. na_position : {'first' or 'last'}, default 'first' Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. @@ -2706,6 +2726,13 @@ indexer : np.ndarray[np.intp] Indices of output values in original index. + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Index.sort_values : Sort Index values. + DataFrame.sort_index : Sort DataFrame by the index. + Series.sort_index : Sort Series by the index. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 149bef6258bfa..dfb96162f0ac1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -512,7 +512,11 @@ def convert(self) -> list[Block]: convert_non_numeric=True, ) refs = None - if res_values is values: + if ( + res_values is values + or isinstance(res_values, NumpyExtensionArray) + and res_values._ndarray is values + ): refs = self.refs res_values = ensure_block_shape(res_values, self.ndim) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 6836ba3f65691..c005a1ce26e4b 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -379,6 +379,11 @@ def concat( 0 1 2 1 3 4 """ + if ignore_index and keys is not None: + raise ValueError( + f"Cannot set {ignore_index=} and specify keys. Either should be used." + ) + if copy is not lib.no_default: warnings.warn( "The copy keyword is deprecated and will be removed in a future " diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 18517199f073c..b3f946f289891 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -358,7 +358,16 @@ def qcut( x_idx = _preprocess_for_cut(x) x_idx, _ = _coerce_to_type(x_idx) - quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q + if is_integer(q): + quantiles = np.linspace(0, 1, q + 1) + # Round up rather than to nearest if not representable in base 2 + np.putmask( + quantiles, + q * quantiles != np.arange(q + 1), + np.nextafter(quantiles, 1), + ) + else: + quantiles = q bins = x_idx.to_series().dropna().quantile(quantiles) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 8e6183c43480f..1014c9559afaf 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -969,6 +969,8 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False): Returns ------- DataFrame/MultiIndex or Series/Index of objects + Returns appropriate type based on `expand` parameter with strings + split based on the `pat` parameter. See Also -------- @@ -1749,6 +1751,18 @@ def pad( Returns ------- Series/Index of objects. + A Series or Index where the strings are modified by :meth:`str.%(method)s`. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. + Series.str.center : Fills both sides of strings with an arbitrary + character. + Series.str.zfill : Pad strings in the Series/Index by prepending '0' + character. Examples -------- @@ -2024,11 +2038,19 @@ def decode(self, encoding, errors: str = "strict"): Parameters ---------- encoding : str + Specifies the encoding to be used.
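
Reviewer note: the new ``concat`` guard above replaces a silently ignored keyword combination with an explicit error. For example::

    import pandas as pd

    df = pd.DataFrame({"a": [1]})
    try:
        pd.concat([df, df], ignore_index=True, keys=["x", "y"])
    except ValueError as err:
        print(err)  # Cannot set ignore_index=True and specify keys. ...
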
errors : str, optional + Specifies the error handling scheme. + Possible values are those supported by :meth:`bytes.decode`. Returns ------- Series or Index + A Series or Index with decoded strings. + + See Also + -------- + Series.str.encode : Encodes strings into bytes in a Series/Index. Examples -------- @@ -2063,11 +2085,19 @@ def encode(self, encoding, errors: str = "strict"): Parameters ---------- encoding : str + Specifies the encoding to be used. errors : str, optional + Specifies the error handling scheme. + Possible values are those supported by :meth:`str.encode`. Returns ------- Series/Index of objects + A Series or Index with strings encoded into bytes. + + See Also + -------- + Series.str.decode : Decodes bytes into strings in a Series/Index. Examples -------- @@ -2099,6 +2129,7 @@ def encode(self, encoding, errors: str = "strict"): Returns ------- Series or Index of object + Series or Index with the strings being stripped from the %(side)s. See Also -------- @@ -3092,6 +3123,8 @@ def normalize(self, form): Returns ------- Series or Index of object + Returns a Series or an Index of the %(side)s indexes + in each string of the input. See Also -------- @@ -3207,7 +3240,8 @@ def len(self): Returns ------- - Series or Index of object + Series or Index of objects + A Series or Index where the strings are modified by :meth:`str.%(method)s`. See Also -------- diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 290a28ab60ae1..100afa956bd24 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -37,8 +37,6 @@ class ObjectStringArrayMixin(BaseStringArrayMethods): String Methods operating on object-dtype ndarrays. """ - _str_na_value = np.nan - def __len__(self) -> int: # For typing, _str_map relies on the object being sized. raise NotImplementedError @@ -56,7 +54,7 @@ def _str_map( na_value : Scalar, optional The value to set for NA values. Might also be used for the fill value if the callable `f` raises an exception. - This defaults to ``self._str_na_value`` which is ``np.nan`` + This defaults to ``self.dtype.na_value`` which is ``np.nan`` for object-dtype and Categorical and ``pd.NA`` for StringArray. dtype : Dtype, optional The dtype of the result array. @@ -66,7 +64,7 @@ def _str_map( if dtype is None: dtype = np.dtype("object") if na_value is None: - na_value = self._str_na_value + na_value = self.dtype.na_value # type: ignore[attr-defined] if not len(self): return np.array([], dtype=dtype) @@ -272,7 +270,7 @@ def f(x): return x.get(i) elif len(x) > i >= -len(x): return x[i] - return self._str_na_value + return self.dtype.na_value # type: ignore[attr-defined] return self._str_map(f) @@ -466,7 +464,7 @@ def _str_removesuffix(self, suffix: str): def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): regex = re.compile(pat, flags=flags) - na_value = self._str_na_value + na_value = self.dtype.na_value # type: ignore[attr-defined] if not expand: diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 26e73794af298..982851d0557c3 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -99,8 +99,8 @@ def to_numeric( is to not use nullable data types. 
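
Reviewer note: the fleshed-out ``decode``/``encode`` docstrings above describe an inverse pair. For example::

    import pandas as pd

    s = pd.Series(["café", "naïve"])
    encoded = s.str.encode("utf-8")    # Series of bytes objects
    decoded = encoded.str.decode("utf-8")
    assert list(decoded) == list(s)    # the pair round-trips
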
If specified, the behavior is as follows: - * ``"numpy_nullable"``: returns with nullable-dtype-backed - * ``"pyarrow"``: returns with pyarrow-backed nullable :class:`ArrowDtype` + * ``"numpy_nullable"``: returns nullable-dtype-backed object + * ``"pyarrow"``: returns pyarrow-backed nullable object .. versionadded:: 2.0 diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 5a0a8c321e629..2ed241f0b9bca 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -38,14 +38,15 @@ def read_clipboard( A string or regex delimiter. The default of ``'\\s+'`` denotes one or more whitespace characters. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f83f9cb1c8d74..ef52107c283e9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -267,14 +267,15 @@ Rows at the end to skip (0-indexed). {storage_options} -dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' +dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -1728,14 +1729,15 @@ def parse( comment string and the end of the current line is ignored. skipfooter : int, default 0 Rows at the end to skip (0-indexed). - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 **kwds : dict, optional diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 3df3e77a851a3..aaae9857b4fae 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -92,14 +92,15 @@ def read_feather( Whether to parallelize reading using multiple threads.
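
Reviewer note: the reworded ``to_numeric`` bullets above only change the container dtype, not the parsing itself. For example::

    import pandas as pd

    s = pd.Series(["1", "2", None])
    pd.to_numeric(s)                                  # float64, NaN for missing
    pd.to_numeric(s, dtype_backend="numpy_nullable")  # Int64, pd.NA for missing
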
{storage_options} - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 diff --git a/pandas/io/html.py b/pandas/io/html.py index 4b8bc48130fab..c9897f628fdc9 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1131,14 +1131,15 @@ def read_html( .. versionadded:: 1.5.0 - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b29ead1d14b1d..d077b9e0c4568 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -652,14 +652,15 @@ def read_json( {storage_options} - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index d966e38fa11a5..9d250ee5c08ce 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -144,11 +144,11 @@ def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]: field["freq"] = dtype.freq.freqstr elif isinstance(dtype, DatetimeTZDtype): if timezones.is_utc(dtype.tz): - # timezone.utc has no "zone" attr field["tz"] = "UTC" else: - # error: "tzinfo" has no attribute "zone" - field["tz"] = dtype.tz.zone # type: ignore[attr-defined] + zone = timezones.get_timezone(dtype.tz) + if isinstance(zone, str): + field["tz"] = zone elif isinstance(dtype, ExtensionDtype): field["extDtype"] = dtype.name return field diff --git a/pandas/io/orc.py b/pandas/io/orc.py index b297164d5d108..f179dafc919e5 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -61,14 +61,15 @@ def read_orc( Output always follows the ordering of the file and not the columns list. 
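
Reviewer note: the ``_table_schema`` change above derives the ``tz`` field via ``timezones.get_timezone`` so that pytz- and zoneinfo-built timezones both serialize to their string name. Observable through the public helper::

    import pandas as pd
    from pandas.io.json import build_table_schema

    df = pd.DataFrame({"ts": pd.date_range("2024-01-01", periods=1, tz="US/Eastern")})
    schema = build_table_schema(df)
    # the "ts" field carries {"tz": "US/Eastern"} regardless of the tz backend
    print(schema["fields"][-1])
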
This mirrors the original behaviour of :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 77a9cc3fca644..24415299e799b 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -542,14 +542,15 @@ def read_parquet( .. versionadded:: 1.3.0 - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 0cca1ebdb8c8f..6e933f94cf0ba 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -268,6 +268,18 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). nrows : int, optional Number of rows of file to read. Useful for reading pieces of large files. + Refers to the number of data rows in the returned DataFrame, excluding: + + * The header row containing column names. + * Rows before the header row, if ``header=1`` or larger. + + Example usage: + + * To read the first 999,999 (non-header) rows: + ``read_csv(..., nrows=999999)`` + + * To read rows 1,000,000 through 1,999,999: + ``read_csv(..., skiprows=1000000, nrows=999999)`` na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific per-column ``NA`` values. By default the following values are interpreted as @@ -438,14 +450,14 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): {storage_options} -dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' +dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame` .. 
versionadded:: 2.0 diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 313ffa79cbd09..e597463aee453 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -36,14 +36,15 @@ def read_spss( Return a subset of the columns. If None, return all columns. convert_categoricals : bool, default is True Convert categorical columns into pd.Categorical. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed + nullable :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 4fd7de7a28855..99dd06568fa01 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -306,14 +306,15 @@ def read_sql_table( chunksize : int, default None If specified, returns an iterator where `chunksize` is the number of rows to include in each chunk. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -443,14 +444,15 @@ def read_sql_query( {'a': np.float64, 'b': np.int32, 'c': 'Int64'}. .. versionadded:: 1.3.0 - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -586,14 +588,15 @@ def read_sql( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. 
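
Reviewer note: the same ``dtype_backend`` rewording is applied to every reader touched by this diff; concretely, for the SQL entry points::

    import sqlite3

    import pandas as pd

    con = sqlite3.connect(":memory:")
    pd.DataFrame({"a": [1, None]}).to_sql("t", con, index=False)

    pd.read_sql("SELECT * FROM t", con)  # default: float64 with NaN
    # opting in to the nullable backend yields masked dtypes holding pd.NA
    pd.read_sql("SELECT * FROM t", con, dtype_backend="numpy_nullable")
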
+ * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 dtype : Type name or dict of columns @@ -1683,14 +1686,15 @@ def read_table( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 @@ -2148,14 +2152,15 @@ def read_table( schema of the SQL database object. chunksize : int, default None Raises NotImplementedError - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {'numpy_nullable', 'pyarrow'} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. versionadded:: 2.0 diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 8c7381a926e72..0fcf27af42fde 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -959,14 +959,15 @@ def read_xml( {storage_options} - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}} Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. + * ``"pyarrow"``: returns pyarrow-backed nullable + :class:`ArrowDtype` :class:`DataFrame` .. 
versionadded:: 2.0 diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index fb7d785a94bc4..9a7e563332a42 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -546,7 +546,7 @@ def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes: new_ax.set_yscale("log") elif self.logy == "sym" or self.loglog == "sym": new_ax.set_yscale("symlog") - return new_ax # type: ignore[return-value] + return new_ax @final @cache_readonly diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index b0475b64a844e..3be3562d23cd6 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -6,6 +6,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -1245,6 +1247,9 @@ def test_agg_multiple_mixed(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_agg_multiple_mixed_raises(): # GH 20909 mdf = DataFrame( diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 3137d3ff50954..ba970e328ae40 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -12,6 +12,9 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW from pandas.errors import SpecificationError from pandas import ( @@ -209,6 +212,10 @@ def transform(row): data.apply(transform, axis=1) +# we should raise a proper TypeError instead of propagating the pyarrow error +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) @pytest.mark.parametrize( "df, func, expected", tm.get_cython_table_params( @@ -229,6 +236,10 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_str df.agg(func, axis=axis) +# we should raise a proper TypeError instead of propagating the pyarrow error +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) @pytest.mark.parametrize( "series, func, expected", chain( diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 6ac0b49f0e4e7..6bbe5100e8826 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -104,6 +104,7 @@ def test_numba_nonunique_unsupported(apply_axis): def test_numba_unsupported_dtypes(apply_axis): + pytest.importorskip("pyarrow") f = lambda x: x df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]}) df["c"] = df["c"].astype("double[pyarrow]") diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 4b5156d0007bb..899ea1910d055 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,6 +8,9 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -315,6 +318,9 @@ def test_add(self): expected = pd.Index(["1a", "1b", "1c"]) tm.assert_index_equal("1" + index, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_sub_fail(self, using_infer_string): index = pd.Index([str(i) for i in range(10)]) diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py 
b/pandas/tests/arrays/boolean/test_arithmetic.py index 0c4fcf149eb20..4dbd8eb9f5ca7 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -3,6 +3,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd import pandas._testing as tm @@ -90,6 +94,9 @@ def test_op_int8(left_array, right_array, opname): # ----------------------------------------------------------------------------- +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): # invalid ops diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index dca33dffa3996..52fd80cd196e0 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -6,7 +6,10 @@ from pandas._config import using_string_dtype -from pandas.compat import PYPY +from pandas.compat import ( + HAS_PYARROW, + PYPY, +) from pandas import ( Categorical, @@ -296,7 +299,9 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) def test_memory_usage(self): cat = Categorical([1, 2, 3]) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 6752a503016f8..d7eb6800e5d07 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -8,6 +8,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -442,7 +444,9 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail(using_string_dtype(), reason="Can't be NumPy strings") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings" + ) def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index db04862e4ea07..e485c7f79b475 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -102,9 +104,10 @@ def test_groupby_reductions(op, expected): ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], ) -def test_mixed_reductions(op, expected, using_infer_string): - if op in ["any", "all"] and using_infer_string: - expected = expected.astype("bool") +def test_mixed_reductions(request, op, expected, using_infer_string): + if op in ["any", "all"] and using_infer_string and HAS_PYARROW: + # TODO(infer_string) inconsistent result type + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) df = DataFrame( { "A": ["a", "b", "b"], diff --git a/pandas/tests/arrays/string_/test_string_arrow.py 
b/pandas/tests/arrays/string_/test_string_arrow.py index 7d4aae0f7bb4e..cfba32c62f206 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -27,11 +26,10 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python" and HAS_PYARROW: - # string storage with na_value=NaN always uses pyarrow if available - # -> does not yet honor the option - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) +def test_config(string_storage, using_infer_string): + # with the default string_storage setting + # always "python" at the moment + assert StringDtype().storage == "python" with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage @@ -88,19 +86,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): ArrowStringArray(arr) -@pytest.mark.xfail( - reason="dict conversion does not seem to be implemented for large string in arrow" -) +@pytest.mark.parametrize("string_type", ["string", "large_string"]) @pytest.mark.parametrize("chunked", [True, False]) -def test_constructor_valid_string_type_value_dictionary(chunked): +def test_constructor_valid_string_type_value_dictionary(string_type, chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() + arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode() if chunked: arr = pa.chunked_array(arr) arr = ArrowStringArray(arr) - assert pa.types.is_string(arr._pa_array.type.value_type) + # dictionary type gets converted to dense large string array + assert pa.types.is_large_string(arr._pa_array.type) def test_constructor_from_list(): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 5834b268be2be..59ff4f3122e8f 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -891,20 +891,24 @@ def test_concat_same_type_different_freq(self, unit): tm.assert_datetime_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y %b") expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = DatetimeIndex(["2019-01-01", NaT])._data result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) class TestTimedeltaArray(SharedTests): @@ -1161,20 +1165,24 @@ def test_array_interface(self, arr1d): expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y") expected = np.array([per.strftime("%Y") for per in arr], dtype=object) - 
tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = PeriodArray(PeriodIndex(["2019-01-01", NaT], dtype="period[D]")) result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index dd6bf3c7521f8..13a3ff048c79e 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -20,6 +24,7 @@ SparseArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics @@ -218,7 +223,9 @@ def test_iter_box_period(self): ) def test_values_consistent(arr, expected_type, dtype, using_infer_string): if using_infer_string and dtype == "object": - expected_type = ArrowStringArrayNumpySemantics + expected_type = ( + ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArrayNumpySemantics + ) l_values = Series(arr)._values r_values = pd.Index(arr)._values assert type(l_values) is expected_type @@ -355,6 +362,9 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): tm.assert_numpy_array_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize( "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 8724f62de1534..de56d5e4a07ee 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -5,6 +5,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -197,7 +198,7 @@ def test_astype_arrow_timestamp(): assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -213,7 +214,7 @@ def test_convert_dtypes_infer_objects(): tm.assert_series_equal(ser, ser_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes(): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index d2e2d43b0a42b..dd4dd154f74b0 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -3,6 +3,8 @@ from 
pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, Index, @@ -14,7 +16,7 @@ from pandas.tests.copy_view.util import get_array -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames(): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -33,7 +35,7 @@ def test_concat_frames(): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames_updating_input(): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -153,7 +155,7 @@ def test_concat_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ @@ -249,7 +251,7 @@ def test_merge_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_join_on_key(): df_index = Index(["a", "b", "c"], name="key") @@ -277,7 +279,7 @@ def test_join_on_key(): tm.assert_frame_equal(df2, df2_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_join_multiple_dataframes_on_key(): df_index = Index(["a", "b", "c"], name="key") diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index f80e9b7dcf838..fc57178b897b9 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -3,6 +3,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( NA, DataFrame, @@ -121,7 +123,7 @@ def test_interpolate_cannot_with_object_dtype(): df.interpolate() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 3716df8fbf855..92e1ba750fae2 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -3,6 +3,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -714,7 +716,7 @@ def test_head_tail(method): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_infer_objects(): df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) df_orig = df.copy() @@ -730,6 +732,9 @@ def test_infer_objects(): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_infer_objects_no_reference(): df = DataFrame( { @@ -899,7 +904,7 @@ def test_sort_values_inplace(obj, kwargs): 
tm.assert_equal(view, obj_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(decimals): df = DataFrame({"a": [1, 2], "b": "c"}) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index c1120ccfea635..58c979fb05089 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -3,6 +3,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( Categorical, DataFrame, @@ -59,7 +61,7 @@ def test_replace_regex_inplace_refs(): tm.assert_frame_equal(view, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_replace_regex_inplace(): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") @@ -257,7 +259,7 @@ def test_replace_empty_list(): assert not df2._mgr._has_no_reference(0) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(value): df = DataFrame({"a": ["a", "b", "c"]}) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 4bf97b1fd8494..2c2dff7a957fe 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -802,13 +803,17 @@ def test_pandas_dtype_ea_not_instance(): def test_pandas_dtype_string_dtypes(string_storage): - # TODO(infer_string) remove skip if "python" is supported - pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype("str") + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") - # TODO(infer_string) hardcoded to pyarrow until python is supported - assert result == pd.StringDtype("pyarrow", na_value=np.nan) + assert result == pd.StringDtype(string_storage, na_value=np.nan) with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index fad2560265d21..ff9f3cbed64a2 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -7,6 +7,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -140,6 +142,12 @@ class BaseArithmeticOpsTests(BaseOpsUtil): series_array_exc: type[Exception] | None = TypeError divmod_exc: type[Exception] | None = TypeError + # TODO(infer_string) need to remove import of pyarrow + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar if all_arithmetic_operators == "__rmod__" and 
is_string_dtype(data.dtype): @@ -149,6 +157,11 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): ser = pd.Series(data) self.check_opname(ser, op_name, ser.iloc[0]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -158,12 +171,22 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators ser = pd.Series(data) self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser))) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_divmod(self, data): ser = pd.Series(data) self._check_divmod_op(ser, divmod, 1) @@ -179,6 +202,7 @@ def test_divmod_series_array(self, data, data_for_twos): other = pd.Series(other) self._check_divmod_op(other, ops.rdivmod, ser) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_add_series_with_extension_array(self, data): # Check adding an ExtensionArray to a Series of the same dtype matches # the behavior of adding the arrays directly and then wrapping in a diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 8f8af607585df..c3d4b83f731a3 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -140,6 +140,7 @@ def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): # frame & scalar op_name = all_arithmetic_operators @@ -151,6 +152,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): ) super().test_arith_frame_with_scalar(data, op_name) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): op_name = all_arithmetic_operators if op_name == "__rmod__": diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 79cfb736941d6..1b251a5118681 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -19,6 +19,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import NumpyEADtype import pandas as pd @@ -255,6 +257,7 @@ def test_insert_invalid(self, data, invalid_scalar): frame_scalar_exc = None series_array_exc = None + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod(self, data): divmod_exc = None if data.dtype.kind == "O": @@ -262,6 +265,7 @@ def test_divmod(self, data): self.divmod_exc = divmod_exc super().test_divmod(data) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod_series_array(self, data): ser = pd.Series(data) exc = None @@ -270,6 
+274,7 @@ def test_divmod_series_array(self, data): self.divmod_exc = exc self._check_divmod_op(ser, divmod, data) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators series_scalar_exc = None @@ -283,6 +288,7 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) self.series_scalar_exc = series_scalar_exc super().test_arith_series_with_scalar(data, all_arithmetic_operators) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_array(self, data, all_arithmetic_operators): opname = all_arithmetic_operators series_array_exc = None @@ -291,6 +297,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): self.series_array_exc = series_array_exc super().test_arith_series_with_array(data, all_arithmetic_operators) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators frame_scalar_exc = None diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 826ac2be3339b..8ce4e8725d632 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -12,6 +12,7 @@ from pandas._config import using_string_dtype from pandas._libs import iNaT +from pandas.compat import HAS_PYARROW from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_integer @@ -1148,7 +1149,9 @@ def test_loc_setitem_datetimelike_with_inference(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 1d7b3e12b2e86..32a827c25c77a 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -6,6 +6,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -938,6 +940,9 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): obj.mask(mask, null) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) @given(data=OPTIONAL_ONE_OF_ALL) def test_where_inplace_casting(data): # GH 22051 @@ -1018,6 +1023,9 @@ def test_where_producing_ea_cond_for_np_dtype(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 59779234b46d9..e7f6e5d625d3e 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,21 +3,15 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm class TestConvertDtypes: - # TODO convert_dtypes should not use NaN variant of string dtype, 
but always NA - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes( - self, convert_integer, expected, string_storage, using_infer_string - ): + def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -182,7 +176,6 @@ def test_convert_dtypes_pyarrow_timestamp(self): result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_avoid_block_splitting(self): # GH#55341 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) @@ -197,7 +190,6 @@ def test_convert_dtypes_avoid_block_splitting(self): tm.assert_frame_equal(result, expected) assert result._mgr.nblocks == 2 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index 419fb75cb3669..7feb3b6fd816d 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -476,3 +476,41 @@ def test_drop_duplicates_non_boolean_ignore_index(arg): msg = '^For argument "ignore_index" expected type bool, received type .*.$' with pytest.raises(ValueError, match=msg): df.drop_duplicates(ignore_index=arg) + + +def test_drop_duplicates_set(): + # GH#59237 + df = DataFrame( + { + "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates({"AAA"}) + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates({"AAA"}, keep="last") + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates({"AAA"}, keep=False) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates({"AAA", "B"}) + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates({"AAA", "B"}, keep="last") + expected = df.loc[[0, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates({"AAA", "B"}, keep=False) + expected = df.loc[[0]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index a4319f8a8ae7f..aad43b7a77ac7 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype from pandas.compat import ( + HAS_PYARROW, IS64, PYPY, ) @@ -520,7 +521,7 @@ def test_info_int_columns(): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 79aabbcc83bbf..4e8e267523439 
100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -6,10 +6,13 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.algos import ( Infinity, NegInfinity, ) +from pandas.compat import HAS_PYARROW from pandas import ( DataFrame, @@ -464,9 +467,18 @@ def test_rank_inf_nans_na_option( ], ) def test_rank_object_first( - self, frame_or_series, na_option, ascending, expected, using_infer_string + self, + request, + frame_or_series, + na_option, + ascending, + expected, + using_infer_string, ): obj = frame_or_series(["foo", "foo", None, "foo"]) + if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series): + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) if using_infer_string and isinstance(obj, Series): diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 4136d641ef67f..7670b53f23173 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd import pandas._testing as tm @@ -132,6 +136,9 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) def test_data_frame_value_counts_dropna_false(nulls_fixture): # GH 41334 df = pd.DataFrame( diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index e8ef0592ac432..f8219e68a72da 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -8,6 +8,8 @@ from pandas._config import using_string_dtype from pandas._config.config import option_context +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -113,7 +115,9 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) - @pytest.mark.xfail(using_string_dtype(), reason="surrogates not allowed") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="surrogates not allowed" + ) def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 734bfc8b30053..e41a3b27e592c 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -13,6 +13,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -1542,7 +1544,9 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a210af94561f9..0176a36fe78d7 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,6 +24,7 
@@ from pandas._config import using_string_dtype from pandas._libs import lib +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError @@ -299,7 +300,7 @@ def test_constructor_dtype_nocast_view_2d_array(self): df2 = DataFrame(df.values, dtype=df[0].dtype) assert df2._mgr.blocks[0].values.flags.c_contiguous - @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index ad54cfaf9d927..6788721e8a72e 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -4,6 +4,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + from pandas import ( CategoricalIndex, DataFrame, @@ -96,6 +100,9 @@ def test_logical_ops_int_frame(self): res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_logical_ops_invalid(self, using_infer_string): # GH#5808 diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 4c355ed92b6c3..1d667d35db253 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -226,6 +226,7 @@ def float_frame_with_na(): class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "opname", @@ -431,6 +432,7 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis): expected[expected.isna()] = None tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 @@ -532,7 +534,7 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) with pytest.raises( - TypeError, match="unsupported operand type|does not support" + TypeError, match="unsupported operand type|does not support|Cannot perform" ): df.mean() result = df[["A", "C"]].mean() @@ -690,6 +692,7 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted @@ -979,7 +982,7 @@ def test_sum_mixed_datetime(self): def test_mean_corner(self, float_frame, float_string_frame): # unit test when we have object data - msg = "Could not convert|does not support" + msg = "Could not convert|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) @@ -1093,6 +1096,7 @@ def test_idxmin_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) 
@pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmin_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -1143,6 +1147,7 @@ def test_idxmax_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmax_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -1964,7 +1969,7 @@ def test_minmax_extensionarray(method, numeric_only): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with pytest.raises(TypeError, match="does not support operation"): + with pytest.raises(TypeError, match="does not support operation|Cannot perform"): df.sum() diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 1887fa61ad081..5bbe047078c6e 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -5,6 +5,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -42,6 +43,11 @@ def test_neg_object(self, df, expected): tm.assert_frame_equal(-df, expected) tm.assert_series_equal(-df["a"], expected["a"]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) @pytest.mark.parametrize( "df_data", [ @@ -130,7 +136,9 @@ def test_pos_object(self, df_data): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) @pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning") def test_pos_object_raises(self): # GH#21380 diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 14d3dbd6fa496..18802ebd002fc 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -9,6 +9,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas import ( @@ -500,6 +501,9 @@ def test_dropna_combinations( tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, expected_data, expected_index", [ diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 791f279bffc94..11b874d0b1608 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -8,6 +8,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import SpecificationError import pandas.util._test_decorators as td @@ -1407,6 +1408,10 @@ def g(group): tm.assert_series_equal(result, expected) +# TODO harmonize error messages +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) def test_set_group_name(df, grouper, using_infer_string): def f(group): diff --git 
a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d42aa06d6bbfe..02071acf378dd 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -3,6 +3,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -12,6 +13,9 @@ from pandas.tests.groupby import get_groupby_method_args +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -55,6 +59,9 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -131,6 +138,9 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, idx, expected", [ @@ -205,6 +215,9 @@ def test_groupby_dataframe_slice_then_transform(dropna, index): tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -286,6 +299,9 @@ def test_groupby_dropna_datetime_like_data( tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, data, selected_data, levels", [ @@ -371,6 +387,9 @@ def test_groupby_dropna_with_multiindex_input(input_index, keys, series): tm.assert_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_groupby_nan_included(): # GH 35646 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py index c6697fd169e8a..78a79ac7d1546 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py @@ -9,7 +9,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas import ( DatetimeIndex, @@ -69,10 +68,10 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] index = DatetimeIndex(times) tz = "US/Eastern" - with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): + with pytest.raises(ValueError, match="|".join(times)): index.tz_localize(tz=tz) - with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): + with pytest.raises(ValueError, match="|".join(times)): index.tz_localize(tz=tz, nonexistent="raise") result = index.tz_localize(tz=tz, nonexistent="NaT") @@ -85,7 +84,7 @@ def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + with pytest.raises(ValueError, 
match="Cannot infer dst time"): dr.tz_localize(tz) def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): @@ -117,7 +116,7 @@ def test_dti_tz_localize_ambiguous_infer3(self, tz): def test_dti_tz_localize_ambiguous_times(self, tz): # March 13, 2011, spring forward, skip from 2 AM to 3 AM dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=offsets.Hour()) - with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:30:00"): + with pytest.raises(ValueError, match="2011-03-13 02:30:00"): dr.tz_localize(tz) # after dst transition, it works @@ -127,7 +126,7 @@ def test_dti_tz_localize_ambiguous_times(self, tz): # November 6, 2011, fall back, repeat 2 AM hour dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + with pytest.raises(ValueError, match="Cannot infer dst time"): dr.tz_localize(tz) # UTC is OK @@ -163,11 +162,11 @@ def test_dti_tz_localize(self, prefix): tm.assert_numpy_array_equal(dti3.values, dti_utc.values) dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="ms") - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + with pytest.raises(ValueError, match="Cannot infer dst time"): dti.tz_localize(tzstr) dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="ms") - with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): + with pytest.raises(ValueError, match="2011-03-13 02:00:00"): dti.tz_localize(tzstr) def test_dti_tz_localize_utc_conversion(self, tz): @@ -184,7 +183,7 @@ def test_dti_tz_localize_utc_conversion(self, tz): # DST ambiguity, this should fail rng = date_range("3/11/2012", "3/12/2012", freq="30min") # Is this really how it should fail?? 
- with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"): + with pytest.raises(ValueError, match="2012-03-11 02:00:00"): rng.tz_localize(tz) def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index aba440ceeb56b..8da88b97f9ea8 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -14,7 +14,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( astype_overflowsafe, @@ -750,7 +749,7 @@ def test_disallow_setting_tz(self): [ None, "America/Los_Angeles", - pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), Timestamp("2000", tz="America/Los_Angeles").tz, ], ) @@ -765,8 +764,8 @@ def test_constructor_start_end_with_tz(self, tz): freq="D", ) tm.assert_index_equal(result, expected) - # Especially assert that the timezone is consistent for pytz - assert pytz.timezone("America/Los_Angeles") is result.tz + # Especially assert that the timezone is consistent for zoneinfo + assert zoneinfo.ZoneInfo("America/Los_Angeles") is result.tz @pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"]) def test_constructor_with_non_normalized_pytz(self, tz): @@ -984,6 +983,7 @@ def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request): # GH#47471 check that we get the same raising behavior in the DTI # constructor and Timestamp constructor if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") tz = pytz.timezone(tz.removeprefix("pytz/")) dtstr = "2013-11-03 01:59:59.999999" item = dtstr @@ -1000,7 +1000,7 @@ def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request): mark = pytest.mark.xfail(reason="We implicitly get fold=0.") request.applymarker(mark) - with pytest.raises(pytz.AmbiguousTimeError, match=dtstr): + with pytest.raises(ValueError, match=dtstr): box_cls(item, tz=tz) @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index b37b5cf74b347..e09883e95ecec 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -11,7 +11,6 @@ import numpy as np import pytest -import pytz from pandas._libs.tslibs import timezones from pandas._libs.tslibs.offsets import ( @@ -881,7 +880,7 @@ def test_date_range_ambiguous_endpoint(self, tz): # construction with an ambiguous end-point # GH#11626 - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + with pytest.raises(ValueError, match="Cannot infer dst time"): date_range( "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="h" ) @@ -905,7 +904,7 @@ def test_date_range_nonexistent_endpoint(self, tz, option, expected): # construction with a nonexistent end-point - with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"): + with pytest.raises(ValueError, match="2019-03-10 02:00:00"): date_range( "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="h" ) @@ -1259,6 +1258,24 @@ def test_range_with_timezone_and_custombusinessday(self, start, period, expected expected = DatetimeIndex(expected).as_unit("ns") tm.assert_index_equal(result, expected) + def 
test_data_range_custombusinessday_partial_time(self, unit): + # GH#57456 + offset = offsets.CustomBusinessDay(weekmask="Sun Mon Tue") + start = datetime(2024, 2, 6, 23) + # end datetime is partial and not in the offset + end = datetime(2024, 2, 14, 14) + result = date_range(start, end, freq=offset, unit=unit) + expected = DatetimeIndex( + [ + "2024-02-06 23:00:00", + "2024-02-11 23:00:00", + "2024-02-12 23:00:00", + "2024-02-13 23:00:00", + ], + dtype=f"M8[{unit}]", + ) + tm.assert_index_equal(result, expected) + class TestDateRangeNonNano: def test_date_range_reso_validation(self): diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index e4b8a909add0d..8d9340818b511 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -184,11 +184,8 @@ def test_dti_tz_nat(self, tzstr): assert isna(idx[1]) assert idx[0].tzinfo is not None - @pytest.mark.parametrize("tzstr", ["pytz/US/Eastern", "dateutil/US/Eastern"]) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_utc_box_timestamp_and_localize(self, tzstr): - if tzstr.startswith("pytz/"): - pytest.importorskip("pytz") - tzstr = tzstr.removeprefix("pytz/") tz = timezones.maybe_get_tz(tzstr) rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") @@ -203,11 +200,10 @@ def test_utc_box_timestamp_and_localize(self, tzstr): # right tzinfo rng = date_range("3/13/2012", "3/14/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tzstr) - # test not valid for dateutil timezones. - # assert 'EDT' in repr(rng_eastern[0].tzinfo) - assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr( - rng_eastern[0].tzinfo - ) + if "dateutil" in tzstr: + assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr( + rng_eastern[0].tzinfo + ) @pytest.mark.parametrize( "tz", [zoneinfo.ZoneInfo("US/Central"), gettz("US/Central")] diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 1eeeebd6b8ca9..e3428d1060dbe 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,10 +3,13 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.missing import ( NA, is_matching_na, ) +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -29,6 +32,9 @@ def test_get_indexer_strings(self, method, expected): tm.assert_numpy_array_equal(actual, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_get_indexer_strings_raises(self, using_infer_string): index = Index(["b", "c"]) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0911f2aec74d6..7ec66100b7291 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -8,7 +8,12 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas._config import using_string_dtype + +from pandas.compat import ( + HAS_PYARROW, + IS64, +) from pandas.errors import InvalidIndexError import pandas.util._test_decorators as td @@ -71,6 +76,9 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_constructor_copy(self, using_infer_string): index = 
Index(list("abc"), name="name") arr = np.array(index) @@ -335,6 +343,11 @@ def test_constructor_empty_special(self, empty, klass): def test_view_with_args(self, index): index.view("i8") + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) @pytest.mark.parametrize( "index", [ @@ -817,6 +830,11 @@ def test_isin(self, values, index, expected): expected = np.array(expected, dtype=bool) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_isin_nan_common_object( self, nulls_fixture, nulls_fixture2, using_infer_string ): diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 6d01ba6adc87a..9993a21d93f12 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -9,6 +9,7 @@ from pandas._config import using_string_dtype from pandas._libs.tslibs import Timestamp +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ( is_integer_dtype, @@ -245,6 +246,11 @@ def test_repr_max_seq_item_setting(self, simple_index): repr(idx) assert "..." not in str(idx) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 247501f1504e7..e007b8c4e97ac 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -16,6 +16,7 @@ from pandas._config import using_string_dtype from pandas._libs import index as libindex +from pandas.compat import HAS_PYARROW from pandas.errors import IndexingError import pandas as pd @@ -1388,6 +1389,9 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 65a52bc8e0794..b831ec3bb2c6a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -30,10 +30,6 @@ read_csv, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ @@ -692,43 +688,33 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_string_dtype(), reason="infer_string takes precedence", strict=False - ) def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - pa = pytest.importorskip("pyarrow") + df = DataFrame( + { + "a": np.array(["a", "b"], dtype=np.object_), + "b": np.array(["x", pd.NA], dtype=np.object_), + } + ) + df.to_excel(tmp_excel, sheet_name="test", index=False) with pd.option_context("mode.string_storage", string_storage): - df = DataFrame( - { - "a": 
np.array(["a", "b"], dtype=np.object_), - "b": np.array(["x", pd.NA], dtype=np.object_), - } - ) - df.to_excel(tmp_excel, sheet_name="test", index=False) result = pd.read_excel( tmp_excel, sheet_name="test", dtype_backend="numpy_nullable" ) - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": Series(["x", None], dtype=pd.StringDtype(string_storage)), + } + ) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index e1cdfb8bfa7e3..44266ae9a62a5 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -282,7 +282,6 @@ def test_excel_multindex_roundtrip( ) tm.assert_frame_equal(df, act, check_names=check_names) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_read_excel_parse_dates(self, tmp_excel): # see gh-11544, gh-12051 df = DataFrame( diff --git a/pandas/tests/io/formats/style/test_bar.py b/pandas/tests/io/formats/style/test_bar.py index b0e4712e8bb3d..d28c7c566d851 100644 --- a/pandas/tests/io/formats/style/test_bar.py +++ b/pandas/tests/io/formats/style/test_bar.py @@ -347,6 +347,7 @@ def test_styler_bar_with_NA_values(): def test_style_bar_with_pyarrow_NA_values(): + pytest.importorskip("pyarrow") data = """name,age,test1,test2,teacher Adam,15,95.0,80,Ashby Bob,16,81.0,82,Ashby diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1bc227369a968..3d07c0219691e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -28,11 +28,6 @@ read_json, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.json import ujson_dumps @@ -2143,12 +2138,10 @@ def test_json_uint64(self): result = df.to_json(orient="split") assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_json_dtype_backend( self, string_storage, dtype_backend, orient, using_infer_string ): # GH#50750 - pa = pytest.importorskip("pyarrow") df = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2162,30 +2155,18 @@ def test_read_json_dtype_backend( } ) - if using_infer_string: - string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None])) - elif string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", 
"b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - - else: - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - out = df.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): result = read_json( StringIO(out), dtype_backend=dtype_backend, orient=orient ) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2194,12 +2175,13 @@ def test_read_json_dtype_backend( "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") from pandas.arrays import ArrowExtensionArray expected = DataFrame( @@ -2212,7 +2194,9 @@ def test_read_json_dtype_backend( if orient == "values": expected.columns = list(range(8)) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.parametrize("orient", ["split", "records", "index"]) def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 6d5f870f07206..90f77a7024235 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -4,6 +4,7 @@ import pytest +from pandas.compat import HAS_PYARROW from pandas.compat._optional import VERSIONS from pandas import ( @@ -117,7 +118,15 @@ def csv1(datapath): _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)] +_pyarrow_parsers_only = [ + pytest.param( + _pyarrowParser, + marks=[ + pytest.mark.single_cpu, + pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"), + ], + ) +] _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] @@ -181,7 +190,16 @@ def _get_all_parser_float_precision_combinations(): parser = parser.values[0] for precision in parser.float_precision_choices: # Re-wrap in pytest.param for pyarrow - mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else () + mark = ( + [ + pytest.mark.single_cpu, + pytest.mark.skipif( + not HAS_PYARROW, reason="pyarrow is not installed" + ), + ] + if parser.engine == "pyarrow" + else () + ) param = pytest.param((parser(), precision), marks=mark) params.append(param) ids.append(f"{parser_id}-{precision}") diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 3f410a13c8f80..07f29518b7881 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -19,11 +19,7 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - IntegerArray, - StringArray, -) +from pandas.core.arrays import IntegerArray pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to 
DataFrame:DeprecationWarning" @@ -463,11 +459,8 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 - pa = pytest.importorskip("pyarrow") - with pd.option_context("mode.string_storage", string_storage): parser = all_parsers @@ -477,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage): """ result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)), + }, + ) + tm.assert_frame_equal(result, expected) def test_dtype_backend_ea_dtype_specified(all_parsers): @@ -507,7 +492,6 @@ def test_dtype_backend_ea_dtype_specified(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index b7b4a77c9e048..6243185294894 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -13,8 +13,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import EmptyDataError import pandas as pd @@ -23,10 +21,6 @@ DatetimeIndex, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.common import urlopen from pandas.io.parsers import ( @@ -941,39 +935,30 @@ def test_widths_and_usecols(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(string_storage, dtype_backend): # GH#50289 - if string_storage == "python": - arr = StringArray(np.array(["a", "b"], dtype=np.object_)) - arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - arr = ArrowExtensionArray(pa.array(["a", "b"])) - arr_na = ArrowExtensionArray(pa.array([None, "a"])) - else: - pa = pytest.importorskip("pyarrow") - arr = ArrowStringArray(pa.array(["a", "b"])) - arr_na = ArrowStringArray(pa.array([None, "a"])) - data = """a b c d e f g h i 1 2.5 True a 3 4.5 False b True 6 7.5 a""" with pd.option_context("mode.string_storage", string_storage): result = read_fwf(StringIO(data), dtype_backend=dtype_backend) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": pd.Series([1, 3], dtype="Int64"), "b": pd.Series([2.5, 4.5], dtype="Float64"), "c": pd.Series([True, False], dtype="boolean"), - "d": arr, + "d": pd.Series(["a", "b"], dtype=string_dtype), "e": pd.Series([pd.NA, True], dtype="boolean"), "f": pd.Series([pd.NA, 
6], dtype="Int64"), "g": pd.Series([pd.NA, 7.5], dtype="Float64"), - "h": arr_na, + "h": pd.Series([None, "a"], dtype=string_dtype), "i": pd.Series([pd.NA, pd.NA], dtype="Int64"), } ) @@ -989,7 +974,9 @@ def test_dtype_backend(string_storage, dtype_backend): ) expected["i"] = ArrowExtensionArray(pa.array([None, None])) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) def test_invalid_dtype_backend(): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 923b880004c26..541cc39606047 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -19,10 +19,6 @@ read_clipboard, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.clipboard import ( CheckedCall, @@ -358,23 +354,15 @@ def test_read_clipboard_dtype_backend( self, clipboard, string_storage, dtype_backend, engine ): # GH#50502 - if string_storage == "pyarrow" or dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - - if string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow" and engine != "c": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + if engine == "c" and string_storage == "pyarrow": + # TODO avoid this exception? + string_dtype = pd.ArrowDtype(pa.large_string()) + else: + string_dtype = pd.ArrowDtype(pa.string()) else: - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) + string_dtype = pd.StringDtype(string_storage) text = """a,b,c,d,e,f,g,h,i x,1,4.0,x,2,4.0,,True,False @@ -386,10 +374,10 @@ def test_read_clipboard_dtype_backend( expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 5aa8f1c69fe44..6dd4368f09cc8 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -9,10 +9,6 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.feather_format import read_feather, to_feather # isort:skip @@ -184,25 +180,17 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - - else: - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = 
ArrowStringArray(pa.array(["a", "b", None])) - with tm.ensure_clean() as path: to_feather(df, path) with pd.option_context("mode.string_storage", string_storage): result = read_feather(path, dtype_backend=dtype_backend) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = pd.DataFrame( { "a": pd.Series([1, np.nan, 3], dtype="Int64"), @@ -211,8 +199,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"), "e": pd.Series([True, False, pd.NA], dtype="boolean"), "f": pd.Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": pd.Series(["a", "b", "c"], dtype=string_dtype), + "h": pd.Series(["a", "b", None], dtype=string_dtype), } ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 164646aedf464..73e9933e3681b 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -13,8 +13,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -31,17 +29,9 @@ to_datetime, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.common import file_path_to_url -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture( params=[ @@ -156,7 +146,7 @@ def test_to_html_compat(self, flavor_read_html): df = ( DataFrame( np.random.default_rng(2).random((4, 3)), - columns=pd.Index(list("abc"), dtype=object), + columns=pd.Index(list("abc")), ) .map("{:.3f}".format) .astype(float) @@ -182,24 +172,16 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0] + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -208,8 +190,8 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) @@ -225,7 +207,9 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # 
string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.network @pytest.mark.single_cpu diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 561c718ea5851..ec087eab0cf14 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -17,6 +17,7 @@ pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, + pa_version_under18p0, ) import pandas as pd @@ -955,11 +956,16 @@ def test_timestamp_nanoseconds(self, pa): def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): pytest.importorskip("pyarrow", "11.0.0") - if timezone_aware_date_list.tzinfo != datetime.timezone.utc: + if ( + timezone_aware_date_list.tzinfo != datetime.timezone.utc + and pa_version_under18p0 + ): request.applymarker( pytest.mark.xfail( - reason="temporary skip this test until it is properly resolved: " - "https://github.com/pandas-dev/pandas/issues/37286" + reason=( + "pyarrow returns pytz.FixedOffset while pandas " + "constructs datetime.timezone https://github.com/pandas-dev/pandas/issues/37286" + ) ) ) idx = 5 * [timezone_aware_date_list] @@ -1172,6 +1178,10 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) + @pytest.mark.xfail( + Version(np.__version__) >= Version("2.0.0"), + reason="fastparquet uses np.float_ in numpy2", + ) def test_bool_with_none(self, fp): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a21893f66722a..980c88f070b89 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -39,10 +39,6 @@ to_timedelta, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.util.version import Version from pandas.io import sql @@ -3661,24 +3657,13 @@ def dtype_backend_data() -> DataFrame: @pytest.fixture def dtype_backend_expected(): - def func(storage, dtype_backend, conn_name) -> DataFrame: - string_array: StringArray | ArrowStringArray - string_array_na: StringArray | ArrowStringArray - if storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": + def func(string_storage, dtype_backend, conn_name) -> DataFrame: + string_dtype: pd.StringDtype | pd.ArrowDtype + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + string_dtype = pd.StringDtype(string_storage) df = DataFrame( { @@ -3688,8 +3673,8 @@ def func(storage, dtype_backend, conn_name) -> DataFrame: "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, pd.NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": 
Series(["a", "b", None], dtype=string_dtype), } ) if dtype_backend == "pyarrow": diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 036a5d6265dd7..5c07a56c9fb3f 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,8 +14,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import WASM from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -31,11 +29,6 @@ Series, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -1992,7 +1985,6 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): tm.assert_frame_equal(df_lxml, df_etree) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_xml_nullable_dtypes( parser, string_storage, dtype_backend, using_infer_string ): @@ -2023,36 +2015,21 @@ def test_read_xml_nullable_dtypes( """ - if using_infer_string: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"])) - string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None])) - - elif string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + with pd.option_context("mode.string_storage", string_storage): + result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) - elif dtype_backend == "pyarrow": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) - - with pd.option_context("mode.string_storage", string_storage): - result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) + string_dtype = pd.StringDtype(string_storage) expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), @@ -2073,7 +2050,9 @@ def test_read_xml_nullable_dtypes( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) def test_invalid_dtype_backend(): diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index b381c4fce8430..b39f953da1ee6 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -45,6 +45,7 @@ _check_visible, get_y_axis, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -2465,8 +2466,14 @@ def 
test_group_subplot_invalid_column_name(self): d = {"a": np.arange(10), "b": np.arange(10)} df = DataFrame(d) - with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): - df.plot(subplots=[("a", "bad_name")]) + if Version(np.__version__) < Version("2.0.0"): + with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): + df.plot(subplots=[("a", "bad_name")]) + else: + with pytest.raises( + ValueError, match=r"Column label\(s\) \[np\.str\_\('bad_name'\)\]" + ): + df.plot(subplots=[("a", "bad_name")]) def test_group_subplot_duplicated_column(self): d = {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 66799732be064..26fecef6ed0e6 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -9,6 +9,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( Categorical, @@ -1204,6 +1206,9 @@ def test_idxminmax_object_dtype(self, using_infer_string): with pytest.raises(TypeError, match=msg): ser3.idxmin(skipna=False) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_idxminmax_object_frame(self): # GH#4279 df = DataFrame([["zimm", 2.5], ["biff", 1.0], ["bid", 12.0]]) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index b2caa1fadd1a5..8af224f1ad64f 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -939,3 +939,14 @@ def test_concat_with_series_and_frame_returns_rangeindex_columns(): result = concat([ser, df]) expected = DataFrame([0, 1, 2], index=[0, 0, 1]) tm.assert_frame_equal(result, expected, check_column_type=True) + + +def test_concat_with_moot_ignore_index_and_keys(): + df1 = DataFrame([[0]]) + df2 = DataFrame([[42]]) + + ignore_index = True + keys = ["df1", "df2"] + msg = f"Cannot set {ignore_index=} and specify keys. Either should be used." + with pytest.raises(ValueError, match=msg): + concat([df1, df2], keys=keys, ignore_index=ignore_index) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ad704d87a491b..cbee85f4aede9 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2998,3 +2998,15 @@ def test_merge_datetime_and_timedelta(how): ) with pytest.raises(ValueError, match=re.escape(msg)): right.merge(left, on="key", how=how) + + +def test_merge_on_all_nan_column(): + # GH#59421 + left = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6]}) + right = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "zz": [4, 5, 6]}) + result = left.merge(right, on=["x", "y"], how="outer") + # Should not trigger array bounds error with bounds checking or asan enabled. 
+ expected = DataFrame( + {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 5f769db7f8acf..b2e9f26e1c407 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -307,3 +307,15 @@ def test_qcut_nullable_integer(q, any_numeric_ea_dtype): expected = qcut(arr.astype(float), q) tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize("scale", [1.0, 1 / 3, 17.0]) +@pytest.mark.parametrize("q", [3, 7, 9]) +@pytest.mark.parametrize("precision", [1, 3, 16]) +def test_qcut_contains(scale, q, precision): + # GH-59355 + arr = (scale * np.arange(q + 1)).round(precision) + result = qcut(arr, q, precision=precision) + + for value, bucket in zip(arr, result): + assert value in bucket diff --git a/pandas/tests/scalar/timestamp/methods/test_round.py b/pandas/tests/scalar/timestamp/methods/test_round.py index 2fb0e1a8d3397..944aa55727217 100644 --- a/pandas/tests/scalar/timestamp/methods/test_round.py +++ b/pandas/tests/scalar/timestamp/methods/test_round.py @@ -4,7 +4,6 @@ ) import numpy as np import pytest -import pytz from pandas._libs import lib from pandas._libs.tslibs import ( @@ -182,7 +181,7 @@ def test_round_dst_border_ambiguous(self, method, unit): assert result is NaT msg = "Cannot infer dst time" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): + with pytest.raises(ValueError, match=msg): getattr(ts, method)("h", ambiguous="raise") @pytest.mark.parametrize( @@ -205,7 +204,7 @@ def test_round_dst_border_nonexistent(self, method, ts_str, freq, unit): assert result is NaT msg = "2018-03-11 02:00:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): + with pytest.raises(ValueError, match=msg): getattr(ts, method)(freq, nonexistent="raise") @pytest.mark.parametrize( diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py index 90dc8d77608cb..cb7ac5fa6f1da 100644 --- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py +++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py @@ -4,11 +4,6 @@ from dateutil.tz import gettz import pytest -import pytz -from pytz.exceptions import ( - AmbiguousTimeError, - NonExistentTimeError, -) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.errors import OutOfBoundsDatetime @@ -54,13 +49,14 @@ def test_tz_localize_ambiguous_bool(self, unit, tz): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") tz = pytz.timezone(tz.removeprefix("pytz/")) ts = Timestamp("2015-11-01 01:00:03").as_unit(unit) expected0 = Timestamp("2015-11-01 01:00:03-0500", tz=tz) expected1 = Timestamp("2015-11-01 01:00:03-0600", tz=tz) msg = "Cannot infer dst time from 2015-11-01 01:00:03" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): + with pytest.raises(ValueError, match=msg): ts.tz_localize(tz) result = ts.tz_localize(tz, ambiguous=True) @@ -105,10 +101,10 @@ def test_tz_localize_ambiguous(self): def test_tz_localize_nonexistent(self, stamp, tz): # GH#13057 ts = Timestamp(stamp) - with pytest.raises(NonExistentTimeError, match=stamp): + with pytest.raises(ValueError, match=stamp): ts.tz_localize(tz) # GH 22644 - with pytest.raises(NonExistentTimeError, match=stamp): + with pytest.raises(ValueError, match=stamp): ts.tz_localize(tz, 
nonexistent="raise") assert ts.tz_localize(tz, nonexistent="NaT") is NaT @@ -154,7 +150,7 @@ def test_tz_localize_ambiguous_raise(self): # GH#13057 ts = Timestamp("2015-11-1 01:00") msg = "Cannot infer dst time from 2015-11-01 01:00:00," - with pytest.raises(AmbiguousTimeError, match=msg): + with pytest.raises(ValueError, match=msg): ts.tz_localize("US/Pacific", ambiguous="raise") def test_tz_localize_nonexistent_invalid_arg(self, warsaw): @@ -330,7 +326,7 @@ def test_timestamp_tz_localize_nonexistent_raise(self, warsaw, unit): tz = warsaw ts = Timestamp("2015-03-29 02:20:00").as_unit(unit) msg = "2015-03-29 02:20:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): + with pytest.raises(ValueError, match=msg): ts.tz_localize(tz, nonexistent="raise") msg = ( "The nonexistent argument must be one of 'raise', 'NaT', " diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 39f302c3357de..2c97c4a32e0aa 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -15,7 +15,6 @@ ) import numpy as np import pytest -import pytz from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.errors import OutOfBoundsDatetime @@ -747,7 +746,7 @@ def test_constructor_tz_or_tzinfo(self): tz="UTC", ), Timestamp(2000, 1, 2, 3, 4, 5, 6, None, nanosecond=1), - Timestamp(2000, 1, 2, 3, 4, 5, 6, tz=pytz.UTC, nanosecond=1), + Timestamp(2000, 1, 2, 3, 4, 5, 6, tz=timezone.utc, nanosecond=1), ], ) def test_constructor_nanosecond(self, result): @@ -904,7 +903,7 @@ def test_raise_tz_and_tzinfo_in_datetime_input(self, box): Timestamp(box(**kwargs), tz="US/Pacific") msg = "Cannot pass a datetime or Timestamp" with pytest.raises(ValueError, match=msg): - Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) + Timestamp(box(**kwargs), tzinfo=zoneinfo.ZoneInfo("US/Pacific")) def test_dont_convert_dateutil_utc_to_default_utc(self): result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) @@ -948,7 +947,7 @@ def test_timestamp_constructor_near_dst_boundary(self): assert result == expected msg = "Cannot infer dst time from 2015-10-25 02:00:00" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): + with pytest.raises(ValueError, match=msg): Timestamp("2015-10-25 02:00", tz=tz) result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") @@ -956,7 +955,7 @@ def test_timestamp_constructor_near_dst_boundary(self): assert result == expected msg = "2017-03-26 02:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): + with pytest.raises(ValueError, match=msg): Timestamp("2017-03-26 02:00", tz="Europe/Paris") # GH#11708 @@ -975,7 +974,7 @@ def test_timestamp_constructor_near_dst_boundary(self): assert result == expected msg = "2017-03-26 02:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): + with pytest.raises(ValueError, match=msg): Timestamp("2017-03-26 02:00", tz="Europe/Paris") result = Timestamp("2017-03-26 02:00:00+0100", tz="Europe/Paris") diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 3e1ece6b7f59e..9b9a8ea3600ae 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -9,7 +9,6 @@ import numpy as np import pytest -import pytz from pandas._config import using_string_dtype @@ -28,6 +27,7 @@ Period, PeriodIndex, Series, + StringDtype, TimedeltaIndex, date_range, period_range, @@ -352,7 +352,7 @@ def 
test_dt_round_tz_ambiguous(self, method): tm.assert_series_equal(result, expected) # raise - with tm.external_error_raised(pytz.AmbiguousTimeError): + with tm.external_error_raised(ValueError): getattr(df1.date.dt, method)("h", ambiguous="raise") @pytest.mark.parametrize( @@ -374,7 +374,7 @@ def test_dt_round_tz_nonexistent(self, method, ts_str, freq): expected = Series([pd.NaT]).dt.tz_localize(result.dt.tz) tm.assert_series_equal(result, expected) - with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): + with pytest.raises(ValueError, match="2018-03-11 02:00:00"): getattr(ser.dt, method)(freq, nonexistent="raise") @pytest.mark.parametrize("freq", ["ns", "us", "1000us"]) @@ -514,7 +514,6 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime(self): # GH 10086 ser = Series(date_range("20130101", periods=5)) @@ -585,10 +584,9 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("str") + expected = expected.astype(StringDtype(na_value=np.nan)) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_microsecond_resolution(self): ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) result = ser.dt.strftime("%Y-%m-%d %H:%M:%S") @@ -621,7 +619,6 @@ def test_strftime_period_minutes(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data", [ @@ -644,7 +641,7 @@ def test_strftime_all_nat(self, data): ser = Series(data) with tm.assert_produces_warning(None): result = ser.dt.strftime("%Y-%m-%d") - expected = Series([np.nan], dtype=object) + expected = Series([np.nan], dtype="str") tm.assert_series_equal(result, expected) def test_valid_dt_with_missing_values(self): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 742091d761d62..71ba2dab671ef 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError @@ -822,6 +823,11 @@ def test_mask_key(self, obj, key, expected, raises, val, indexer_sli): else: indexer_sli(obj)[mask] = val + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_series_where(self, obj, key, expected, raises, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 4a8af259b4134..90c4056a39e84 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib import pandas as pd @@ -12,7 +10,6 @@ class TestSeriesConvertDtypes: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, maindtype, expected_default, expected_other", 
[ @@ -223,9 +220,9 @@ def test_convert_dtypes( and params[0] and not params[1] ): - # If we would convert with convert strings then infer_objects converts - # with the option - expected_dtype = "string[pyarrow_numpy]" + # If convert_string=False and infer_objects=True, we end up with the + # default string dtype instead of preserving object for string data + expected_dtype = pd.StringDtype(na_value=np.nan) expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_tz_localize.py b/pandas/tests/series/methods/test_tz_localize.py index 45620a721f442..53288e8a1f8e7 100644 --- a/pandas/tests/series/methods/test_tz_localize.py +++ b/pandas/tests/series/methods/test_tz_localize.py @@ -1,7 +1,6 @@ from datetime import timezone import pytest -import pytz from pandas._libs.tslibs import timezones @@ -28,7 +27,7 @@ def test_series_tz_localize_ambiguous_bool(self): expected0 = Series([expected0]) expected1 = Series([expected1]) - with tm.external_error_raised(pytz.AmbiguousTimeError): + with tm.external_error_raised(ValueError): ser.dt.tz_localize("US/Central") result = ser.dt.tz_localize("US/Central", ambiguous=True) @@ -79,11 +78,11 @@ def test_tz_localize_nonexistent(self, warsaw, method, exp, unit): df = ser.to_frame() if method == "raise": - with tm.external_error_raised(pytz.NonExistentTimeError): + with tm.external_error_raised(ValueError): dti.tz_localize(tz, nonexistent=method) - with tm.external_error_raised(pytz.NonExistentTimeError): + with tm.external_error_raised(ValueError): ser.tz_localize(tz, nonexistent=method) - with tm.external_error_raised(pytz.NonExistentTimeError): + with tm.external_error_raised(ValueError): df.tz_localize(tz, nonexistent=method) elif exp == "invalid": diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index a63ffbbd3a5a1..79a55eb357f87 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -4,6 +4,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -160,6 +164,9 @@ def test_attrs(self): result = s + 1 assert result.attrs == {"version": 1} + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_inspect_getmembers(self): # GH38782 pytest.importorskip("jinja2") diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 939bf888fd61b..baed3ba936699 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -6,7 +6,10 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( + ArrowDtype, DataFrame, Index, Series, @@ -143,6 +146,9 @@ def test_logical_operators_int_dtype_with_bool(self): expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_logical_operators_int_dtype_with_object(self, using_infer_string): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") @@ -518,18 +524,38 @@ def test_int_dtype_different_index_not_bool(self): result = ser1 ^ ser2 tm.assert_series_equal(result, expected) + # TODO: this belongs in comparison tests def test_pyarrow_numpy_string_invalid(self): # GH#56008 - pytest.importorskip("pyarrow") + pa = 
pytest.importorskip("pyarrow") ser = Series([False, True]) ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") result = ser == ser2 - expected = Series(False, index=ser.index) - tm.assert_series_equal(result, expected) + expected_eq = Series(False, index=ser.index) + tm.assert_series_equal(result, expected_eq) result = ser != ser2 - expected = Series(True, index=ser.index) - tm.assert_series_equal(result, expected) + expected_ne = Series(True, index=ser.index) + tm.assert_series_equal(result, expected_ne) with pytest.raises(TypeError, match="Invalid comparison"): ser > ser2 + + # GH#59505 + ser3 = ser2.astype("string[pyarrow]") + result3_eq = ser3 == ser + tm.assert_series_equal(result3_eq, expected_eq.astype("bool[pyarrow]")) + result3_ne = ser3 != ser + tm.assert_series_equal(result3_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser3 + + ser4 = ser2.astype(ArrowDtype(pa.string())) + result4_eq = ser4 == ser + tm.assert_series_equal(result4_eq, expected_eq.astype("bool[pyarrow]")) + result4_ne = ser4 != ser + tm.assert_series_equal(result4_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser4 diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 0bc3092d30b43..7bbb902e14a36 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import Series import pandas._testing as tm @@ -162,6 +166,9 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_mean_with_convertible_string_raises(using_infer_string): # GH#44008 ser = Series(["1", "2"]) @@ -181,6 +188,9 @@ def test_mean_with_convertible_string_raises(using_infer_string): df.mean() +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_mean_dont_convert_j_to_complex(): # GH#36703 df = pd.DataFrame([{"db": "J", "numeric": 123}]) @@ -199,6 +209,9 @@ def test_mean_dont_convert_j_to_complex(): np.mean(df["db"].astype("string").array) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_median_with_convertible_string_raises(): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index ee26fdae74960..18df76ddd8ed8 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -218,7 +218,7 @@ def test_missing_required_dependency(): subprocess.check_output(call, stderr=subprocess.STDOUT) output = exc.value.stdout.decode() - for name in ["numpy", "pytz", "dateutil"]: + for name in ["numpy", "dateutil"]: assert name in output diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 2a225bda953cf..869d41efa6c28 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -408,6 +408,13 @@ def test_codes_out_of_bound(self): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) + @pytest.mark.parametrize("codes", [[-1, -1], [2, -1], [2, 2]]) + def 
test_codes_empty_array_out_of_bound(self, codes): + empty_values = np.array([]) + expected_codes = -np.ones_like(codes, dtype=np.intp) + _, result_codes = safe_sort(empty_values, codes) + tm.assert_numpy_array_equal(result_codes, expected_codes) + def test_mixed_integer(self): values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) result = safe_sort(values) diff --git a/pandas/tests/tseries/offsets/test_dst.py b/pandas/tests/tseries/offsets/test_dst.py index dfdc69c0fe18e..e75958843040d 100644 --- a/pandas/tests/tseries/offsets/test_dst.py +++ b/pandas/tests/tseries/offsets/test_dst.py @@ -108,13 +108,13 @@ def _test_offset( "second": "2013-11-03 01:59:01.999999", "microsecond": "2013-11-03 01:59:59.000001", }[offset_name] - with pytest.raises(pytz.AmbiguousTimeError, match=err_msg): + with pytest.raises(ValueError, match=err_msg): tstart + offset # While we're here, let's check that we get the same behavior in a # vectorized path dti = DatetimeIndex([tstart]) warn_msg = "Non-vectorized DateOffset" - with pytest.raises(pytz.AmbiguousTimeError, match=err_msg): + with pytest.raises(ValueError, match=err_msg): with tm.assert_produces_warning(performance_warning, match=warn_msg): dti + offset return @@ -256,10 +256,10 @@ def test_all_offset_classes(self, tup): ], ) def test_nontick_offset_with_ambiguous_time_error(original_dt, target_dt, offset, tz): - # .apply for non-Tick offsets throws AmbiguousTimeError when the target dt + # .apply for non-Tick offsets throws ValueError when the target dt # is dst-ambiguous - localized_dt = original_dt.tz_localize(pytz.timezone(tz)) + localized_dt = original_dt.tz_localize(tz) msg = f"Cannot infer dst time from {target_dt}, try using the 'ambiguous' argument" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): + with pytest.raises(ValueError, match=msg): localized_dt + offset diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 99a6a583dd3e9..943434e515828 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -13,7 +13,6 @@ given, ) import pytest -import pytz import pandas as pd from pandas._testing._hypothesis import ( @@ -34,11 +33,11 @@ def test_on_offset_implementations(dt, offset): # (dt + offset) - offset == dt try: compare = (dt + offset) - offset - except (pytz.NonExistentTimeError, pytz.AmbiguousTimeError): + except ValueError: # When dt + offset does not exist or is DST-ambiguous, assume(False) to # indicate to hypothesis that this is not a valid test case # DST-ambiguous example (GH41906): - # dt = datetime.datetime(1900, 1, 1, tzinfo=pytz.timezone('Africa/Kinshasa')) + # dt = datetime.datetime(1900, 1, 1, tzinfo=ZoneInfo('Africa/Kinshasa')) # offset = MonthBegin(66) assume(False) diff --git a/pandas/tests/tslibs/test_tzconversion.py b/pandas/tests/tslibs/test_tzconversion.py index c1a56ffb71b02..f32829b4e0b21 100644 --- a/pandas/tests/tslibs/test_tzconversion.py +++ b/pandas/tests/tslibs/test_tzconversion.py @@ -1,6 +1,7 @@ +import zoneinfo + import numpy as np import pytest -import pytz from pandas._libs.tslibs.tzconversion import tz_localize_to_utc @@ -11,13 +12,15 @@ def test_tz_localize_to_utc_ambiguous_infer(self): val = 1_320_541_200_000_000_000 vals = np.array([val, val - 1, val], dtype=np.int64) - with pytest.raises(pytz.AmbiguousTimeError, match="2011-11-06 01:00:00"): - tz_localize_to_utc(vals, pytz.timezone("US/Eastern"), ambiguous="infer") + with 
pytest.raises(ValueError, match="2011-11-06 01:00:00"): + tz_localize_to_utc(vals, zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer") - with pytest.raises(pytz.AmbiguousTimeError, match="are no repeated times"): - tz_localize_to_utc(vals[:1], pytz.timezone("US/Eastern"), ambiguous="infer") + with pytest.raises(ValueError, match="are no repeated times"): + tz_localize_to_utc( + vals[:1], zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer" + ) vals[1] += 1 msg = "There are 2 dst switches when there should only be 1" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - tz_localize_to_utc(vals, pytz.timezone("US/Eastern"), ambiguous="infer") + with pytest.raises(ValueError, match=msg): + tz_localize_to_utc(vals, zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer") diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index af3194b5085c4..17b92427f0d5d 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -6,7 +6,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( + HAS_PYARROW, IS64, is_platform_arm, is_platform_power, @@ -1326,6 +1329,9 @@ def test_rolling_corr_timedelta_index(index, window): tm.assert_almost_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_groupby_rolling_nan_included(): # GH 35542 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 7e18ebe40cfa8..bd20660bdbba6 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -67,7 +67,6 @@ def _get_dependency_info() -> dict[str, JSONSerializable]: "pandas", # required "numpy", - "pytz", "dateutil", # install / build, "pip", diff --git a/pyproject.toml b/pyproject.toml index cc5cc1cf84d0c..645ded35f3d18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,6 @@ dependencies = [ "numpy>=1.23.5; python_version<'3.12'", "numpy>=1.26.0; python_version>='3.12'", "python-dateutil>=2.8.2", - "pytz>=2020.1", "tzdata>=2022.7" ] classifiers = [ @@ -81,6 +80,7 @@ plot = ['matplotlib>=3.6.3'] output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.9.0'] clipboard = ['PyQt5>=5.15.9', 'qtpy>=2.3.0'] compression = ['zstandard>=0.19.0'] +timezone = ['pytz>=2023.4'] all = ['adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0', 'beautifulsoup4>=4.11.2', @@ -107,6 +107,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0', 'python-calamine>=0.1.7', + 'pytz>=2023.4', 'pyxlsb>=1.0.10', 'qtpy>=2.3.0', 'scipy>=1.10.0', diff --git a/requirements-dev.txt b/requirements-dev.txt index dbfd7c6bf7bf5..52d2553fc4001 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,7 +15,6 @@ PyQt5>=5.15.9 coverage python-dateutil numpy<2 -pytz beautifulsoup4>=4.11.2 blosc bottleneck>=1.3.6 @@ -39,6 +38,7 @@ pymysql>=1.0.2 pyreadstat>=1.2.0 tables>=3.8.0 python-calamine>=0.1.7 +pytz>=2023.4 pyxlsb>=1.0.10 s3fs>=2022.11.0 scipy>=1.10.0 diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index aa4bfc92ce8a8..4c66f28818abd 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -73,8 +73,8 @@
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 49ece5564c300..c14996211bb8b 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -360,6 +360,13 @@ Deltalake python package lets you access tables stored in JVM. It provides the ``delta_table.to_pyarrow_table().to_pandas()`` method to convert any Delta table into Pandas dataframe. +### [pandas-gbq](https://github.com/googleapis/python-bigquery-pandas) + +pandas-gbq provides high-performance reads and writes to and from +[Google BigQuery](https://cloud.google.com/bigquery/). Previously (before version 2.2.0), +these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`. +Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq` instead. + ## Out-of-core ### [Bodo](https://bodo.ai/) @@ -513,6 +520,13 @@ Arrays](https://awkward-array.org/) inside pandas' Series and DataFrame. It also provides an accessor for using awkward functions on Series that are of awkward type. +### [db-dtypes](https://github.com/googleapis/python-db-dtypes-pandas) + +db-dtypes provides extension types for working with types like +DATE, TIME, and JSON from database systems. This package is used +by pandas-gbq to provide natural dtypes for BigQuery data types without +a natural numpy type. + ### [Pandas-Genomics](https://pandas-genomics.readthedocs.io/en/latest/) Pandas-Genomics provides an extension type and extension array for working