diff --git a/.circleci/config.yml b/.circleci/config.yml
index b6a5a00429d9a..1c4f33925c999 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -56,7 +56,7 @@ jobs:
/opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
- python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+ python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil "pytest>=7.3.2" "pytest-xdist>=3.4.0" "hypothesis>=6.84.0"
python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
export PANDAS_CI=1
diff --git a/.devcontainer.json b/.devcontainer.json
index 7c5d009260c64..54ddfa1a130f8 100644
--- a/.devcontainer.json
+++ b/.devcontainer.json
@@ -8,7 +8,6 @@
// Use 'settings' to set *default* container specific settings.json values on container create.
// You can edit these settings after create using File > Preferences > Settings > Remote.
"settings": {
- "terminal.integrated.shell.linux": "/bin/bash",
"python.pythonPath": "/usr/local/bin/python",
"python.formatting.provider": "black",
"python.linting.enabled": true,
diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml
index ceeebfcd1c90c..3eb68bdd2a15c 100644
--- a/.github/actions/setup-conda/action.yml
+++ b/.github/actions/setup-conda/action.yml
@@ -14,3 +14,9 @@ runs:
condarc-file: ci/.condarc
cache-environment: true
cache-downloads: true
+
+ - name: Uninstall pyarrow
+ if: ${{ env.REMOVE_PYARROW == '1' }}
+ run: |
+ micromamba remove -y pyarrow
+ shell: bash -el {0}
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index a085d0265a1a5..68b7573f01501 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -29,6 +29,7 @@ jobs:
env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
# Prevent the include jobs from overriding other jobs
pattern: [""]
+ pandas_future_infer_string: ["0"]
include:
- name: "Downstream Compat"
env_file: actions-311-downstream_compat.yaml
@@ -58,6 +59,9 @@ jobs:
# It will be temporarily activated during tests with locale.setlocale
extra_loc: "zh_CN"
- name: "Future infer strings"
+ env_file: actions-312.yaml
+ pandas_future_infer_string: "1"
+ - name: "Future infer strings (without pyarrow)"
env_file: actions-311.yaml
pandas_future_infer_string: "1"
- name: "Pypy"
@@ -85,9 +89,10 @@ jobs:
NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }}
# Clipboard tests
QT_QPA_PLATFORM: offscreen
+ REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }}
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}}
+ group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}
cancel-in-progress: true
services:
@@ -231,7 +236,7 @@ jobs:
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
- python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+ python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil "pytest>=7.3.2" "pytest-xdist>=3.4.0" "hypothesis>=6.84.0"
python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
export PANDAS_CI=1
@@ -269,7 +274,7 @@ jobs:
/opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
- python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+ python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil "pytest>=7.3.2" "pytest-xdist>=3.4.0" "hypothesis>=6.84.0"
python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
@@ -290,7 +295,7 @@ jobs:
# In general, this will remain frozen(present, but not running) until:
# - The next unreleased Python version has released beta 1
# - This version should be available on GitHub Actions.
- # - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil)
+ # - Our required build/runtime dependencies(numpy, Cython, python-dateutil)
# support that unreleased Python version.
# To unfreeze, comment out the ``if: false`` condition, and make sure you update
# the name of the workflow and Python version in actions/setup-python ``python-version:``
@@ -343,7 +348,7 @@ jobs:
python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1
python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
python -m pip install versioneer[toml]
- python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
+ python -m pip install python-dateutil tzdata cython "hypothesis>=6.84.0" "pytest>=7.3.2" "pytest-xdist>=3.4.0" pytest-cov
python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror"
python -m pip list
diff --git a/.gitpod.yml b/.gitpod.yml
index 9222639136a17..9ff349747a33e 100644
--- a/.gitpod.yml
+++ b/.gitpod.yml
@@ -14,7 +14,7 @@ tasks:
cp gitpod/settings.json .vscode/settings.json
git fetch --tags
python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
- pre-commit install
+ pre-commit install --install-hooks
command: |
python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
echo "✨ Pre-build complete! You can close this terminal ✨ "
diff --git a/Dockerfile b/Dockerfile
index 0fcbcee92295c..dead3a494e52d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.10.8
WORKDIR /home/pandas
RUN apt-get update && apt-get -y upgrade
-RUN apt-get install -y build-essential
+RUN apt-get install -y build-essential bash-completion
# hdf5 needed for pytables installation
# libgles2-mesa needed for pytest-qt
@@ -12,4 +12,6 @@ RUN python -m pip install --upgrade pip
COPY requirements-dev.txt /tmp
RUN python -m pip install -r /tmp/requirements-dev.txt
RUN git config --global --add safe.directory /home/pandas
+
+ENV SHELL="/bin/bash"
CMD ["/bin/bash"]
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 973e31815cf63..a9a4daa2e2059 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -70,20 +70,15 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
--format=actions \
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
- -i "pandas.MultiIndex.names SA01" \
-i "pandas.MultiIndex.reorder_levels RT03,SA01" \
- -i "pandas.MultiIndex.sortlevel PR07,SA01" \
-i "pandas.MultiIndex.to_frame RT03" \
-i "pandas.NA SA01" \
-i "pandas.NaT SA01" \
- -i "pandas.Period.asfreq SA01" \
-i "pandas.Period.freq GL08" \
-i "pandas.Period.freqstr SA01" \
- -i "pandas.Period.month SA01" \
-i "pandas.Period.ordinal GL08" \
-i "pandas.Period.strftime PR01,SA01" \
-i "pandas.Period.to_timestamp SA01" \
- -i "pandas.Period.year SA01" \
-i "pandas.PeriodDtype SA01" \
-i "pandas.PeriodDtype.freq SA01" \
-i "pandas.PeriodIndex.day SA01" \
@@ -158,28 +153,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.sparse.sp_values SA01" \
-i "pandas.Series.sparse.to_coo PR07,RT03,SA01" \
-i "pandas.Series.std PR01,RT03,SA01" \
- -i "pandas.Series.str.capitalize RT03" \
- -i "pandas.Series.str.casefold RT03" \
- -i "pandas.Series.str.center RT03,SA01" \
- -i "pandas.Series.str.decode PR07,RT03,SA01" \
- -i "pandas.Series.str.encode PR07,RT03,SA01" \
- -i "pandas.Series.str.index RT03" \
- -i "pandas.Series.str.ljust RT03,SA01" \
- -i "pandas.Series.str.lower RT03" \
- -i "pandas.Series.str.lstrip RT03" \
-i "pandas.Series.str.match RT03" \
-i "pandas.Series.str.normalize RT03,SA01" \
- -i "pandas.Series.str.partition RT03" \
-i "pandas.Series.str.repeat SA01" \
-i "pandas.Series.str.replace SA01" \
- -i "pandas.Series.str.rindex RT03" \
- -i "pandas.Series.str.rjust RT03,SA01" \
- -i "pandas.Series.str.rpartition RT03" \
- -i "pandas.Series.str.rstrip RT03" \
- -i "pandas.Series.str.strip RT03" \
- -i "pandas.Series.str.swapcase RT03" \
- -i "pandas.Series.str.title RT03" \
- -i "pandas.Series.str.upper RT03" \
-i "pandas.Series.str.wrap RT03,SA01" \
-i "pandas.Series.str.zfill RT03" \
-i "pandas.Series.struct.dtypes SA01" \
@@ -229,13 +206,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Timestamp.to_julian_date SA01" \
-i "pandas.Timestamp.today SA01" \
-i "pandas.Timestamp.toordinal SA01" \
- -i "pandas.Timestamp.tz_localize SA01" \
-i "pandas.Timestamp.tzinfo GL08" \
- -i "pandas.Timestamp.tzname SA01" \
- -i "pandas.Timestamp.unit SA01" \
- -i "pandas.Timestamp.utcfromtimestamp PR01,SA01" \
- -i "pandas.Timestamp.utcoffset SA01" \
- -i "pandas.Timestamp.utctimetuple SA01" \
-i "pandas.Timestamp.value GL08" \
-i "pandas.Timestamp.year GL08" \
-i "pandas.api.extensions.ExtensionArray._pad_or_backfill PR01,RT03,SA01" \
@@ -259,7 +230,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.api.extensions.ExtensionArray.view SA01" \
-i "pandas.api.interchange.from_dataframe RT03,SA01" \
-i "pandas.api.types.is_bool PR01,SA01" \
- -i "pandas.api.types.is_bool_dtype SA01" \
-i "pandas.api.types.is_categorical_dtype SA01" \
-i "pandas.api.types.is_complex PR01,SA01" \
-i "pandas.api.types.is_complex_dtype SA01" \
@@ -421,156 +391,103 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.set_eng_float_format RT03,SA01" \
-i "pandas.testing.assert_extension_array_equal SA01" \
-i "pandas.tseries.offsets.BDay PR02,SA01" \
- -i "pandas.tseries.offsets.BQuarterBegin PR02" \
- -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \
-i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.BQuarterBegin.n GL08" \
- -i "pandas.tseries.offsets.BQuarterBegin.nanos GL08" \
-i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \
-i "pandas.tseries.offsets.BQuarterBegin.rule_code GL08" \
-i "pandas.tseries.offsets.BQuarterBegin.startingMonth GL08" \
- -i "pandas.tseries.offsets.BQuarterEnd.freqstr SA01" \
-i "pandas.tseries.offsets.BQuarterEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.BQuarterEnd.n GL08" \
- -i "pandas.tseries.offsets.BQuarterEnd.nanos GL08" \
-i "pandas.tseries.offsets.BQuarterEnd.normalize GL08" \
-i "pandas.tseries.offsets.BQuarterEnd.rule_code GL08" \
-i "pandas.tseries.offsets.BQuarterEnd.startingMonth GL08" \
- -i "pandas.tseries.offsets.BYearBegin.freqstr SA01" \
-i "pandas.tseries.offsets.BYearBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.BYearBegin.month GL08" \
-i "pandas.tseries.offsets.BYearBegin.n GL08" \
- -i "pandas.tseries.offsets.BYearBegin.nanos GL08" \
-i "pandas.tseries.offsets.BYearBegin.normalize GL08" \
- -i "pandas.tseries.offsets.BYearBegin.rule_code GL08" \
- -i "pandas.tseries.offsets.BYearEnd PR02" \
- -i "pandas.tseries.offsets.BYearEnd.freqstr SA01" \
-i "pandas.tseries.offsets.BYearEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.BYearEnd.month GL08" \
-i "pandas.tseries.offsets.BYearEnd.n GL08" \
- -i "pandas.tseries.offsets.BYearEnd.nanos GL08" \
-i "pandas.tseries.offsets.BYearEnd.normalize GL08" \
- -i "pandas.tseries.offsets.BYearEnd.rule_code GL08" \
-i "pandas.tseries.offsets.BusinessDay PR02,SA01" \
-i "pandas.tseries.offsets.BusinessDay.calendar GL08" \
- -i "pandas.tseries.offsets.BusinessDay.freqstr SA01" \
-i "pandas.tseries.offsets.BusinessDay.holidays GL08" \
-i "pandas.tseries.offsets.BusinessDay.is_on_offset GL08" \
-i "pandas.tseries.offsets.BusinessDay.n GL08" \
- -i "pandas.tseries.offsets.BusinessDay.nanos GL08" \
-i "pandas.tseries.offsets.BusinessDay.normalize GL08" \
- -i "pandas.tseries.offsets.BusinessDay.rule_code GL08" \
-i "pandas.tseries.offsets.BusinessDay.weekmask GL08" \
-i "pandas.tseries.offsets.BusinessHour PR02,SA01" \
-i "pandas.tseries.offsets.BusinessHour.calendar GL08" \
-i "pandas.tseries.offsets.BusinessHour.end GL08" \
- -i "pandas.tseries.offsets.BusinessHour.freqstr SA01" \
-i "pandas.tseries.offsets.BusinessHour.holidays GL08" \
-i "pandas.tseries.offsets.BusinessHour.is_on_offset GL08" \
-i "pandas.tseries.offsets.BusinessHour.n GL08" \
- -i "pandas.tseries.offsets.BusinessHour.nanos GL08" \
-i "pandas.tseries.offsets.BusinessHour.normalize GL08" \
- -i "pandas.tseries.offsets.BusinessHour.rule_code GL08" \
-i "pandas.tseries.offsets.BusinessHour.start GL08" \
-i "pandas.tseries.offsets.BusinessHour.weekmask GL08" \
- -i "pandas.tseries.offsets.BusinessMonthBegin.freqstr SA01" \
-i "pandas.tseries.offsets.BusinessMonthBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.BusinessMonthBegin.n GL08" \
- -i "pandas.tseries.offsets.BusinessMonthBegin.nanos GL08" \
-i "pandas.tseries.offsets.BusinessMonthBegin.normalize GL08" \
- -i "pandas.tseries.offsets.BusinessMonthBegin.rule_code GL08" \
- -i "pandas.tseries.offsets.BusinessMonthEnd.freqstr SA01" \
-i "pandas.tseries.offsets.BusinessMonthEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.BusinessMonthEnd.n GL08" \
- -i "pandas.tseries.offsets.BusinessMonthEnd.nanos GL08" \
-i "pandas.tseries.offsets.BusinessMonthEnd.normalize GL08" \
- -i "pandas.tseries.offsets.BusinessMonthEnd.rule_code GL08" \
-i "pandas.tseries.offsets.CBMonthBegin PR02" \
-i "pandas.tseries.offsets.CBMonthEnd PR02" \
-i "pandas.tseries.offsets.CDay PR02,SA01" \
-i "pandas.tseries.offsets.CustomBusinessDay PR02,SA01" \
-i "pandas.tseries.offsets.CustomBusinessDay.calendar GL08" \
- -i "pandas.tseries.offsets.CustomBusinessDay.freqstr SA01" \
-i "pandas.tseries.offsets.CustomBusinessDay.holidays GL08" \
-i "pandas.tseries.offsets.CustomBusinessDay.is_on_offset GL08" \
-i "pandas.tseries.offsets.CustomBusinessDay.n GL08" \
- -i "pandas.tseries.offsets.CustomBusinessDay.nanos GL08" \
-i "pandas.tseries.offsets.CustomBusinessDay.normalize GL08" \
- -i "pandas.tseries.offsets.CustomBusinessDay.rule_code GL08" \
-i "pandas.tseries.offsets.CustomBusinessDay.weekmask GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour PR02,SA01" \
-i "pandas.tseries.offsets.CustomBusinessHour.calendar GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.end GL08" \
- -i "pandas.tseries.offsets.CustomBusinessHour.freqstr SA01" \
-i "pandas.tseries.offsets.CustomBusinessHour.holidays GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.is_on_offset GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.n GL08" \
- -i "pandas.tseries.offsets.CustomBusinessHour.nanos GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.normalize GL08" \
- -i "pandas.tseries.offsets.CustomBusinessHour.rule_code GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.start GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.weekmask GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthBegin.freqstr SA01" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset SA01" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthBegin.nanos GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthBegin.rule_code GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.weekmask GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthEnd.freqstr SA01" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset SA01" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthEnd.nanos GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthEnd.rule_code GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.weekmask GL08" \
- -i "pandas.tseries.offsets.DateOffset PR02" \
- -i "pandas.tseries.offsets.DateOffset.freqstr SA01" \
-i "pandas.tseries.offsets.DateOffset.is_on_offset GL08" \
-i "pandas.tseries.offsets.DateOffset.n GL08" \
- -i "pandas.tseries.offsets.DateOffset.nanos GL08" \
-i "pandas.tseries.offsets.DateOffset.normalize GL08" \
- -i "pandas.tseries.offsets.DateOffset.rule_code GL08" \
- -i "pandas.tseries.offsets.Day.freqstr SA01" \
-i "pandas.tseries.offsets.Day.is_on_offset GL08" \
-i "pandas.tseries.offsets.Day.n GL08" \
- -i "pandas.tseries.offsets.Day.nanos SA01" \
-i "pandas.tseries.offsets.Day.normalize GL08" \
- -i "pandas.tseries.offsets.Day.rule_code GL08" \
- -i "pandas.tseries.offsets.Easter PR02" \
- -i "pandas.tseries.offsets.Easter.freqstr SA01" \
-i "pandas.tseries.offsets.Easter.is_on_offset GL08" \
-i "pandas.tseries.offsets.Easter.n GL08" \
- -i "pandas.tseries.offsets.Easter.nanos GL08" \
-i "pandas.tseries.offsets.Easter.normalize GL08" \
- -i "pandas.tseries.offsets.Easter.rule_code GL08" \
- -i "pandas.tseries.offsets.FY5253 PR02" \
- -i "pandas.tseries.offsets.FY5253.freqstr SA01" \
-i "pandas.tseries.offsets.FY5253.get_rule_code_suffix GL08" \
-i "pandas.tseries.offsets.FY5253.get_year_end GL08" \
-i "pandas.tseries.offsets.FY5253.is_on_offset GL08" \
-i "pandas.tseries.offsets.FY5253.n GL08" \
- -i "pandas.tseries.offsets.FY5253.nanos GL08" \
-i "pandas.tseries.offsets.FY5253.normalize GL08" \
-i "pandas.tseries.offsets.FY5253.rule_code GL08" \
-i "pandas.tseries.offsets.FY5253.startingMonth GL08" \
-i "pandas.tseries.offsets.FY5253.variation GL08" \
-i "pandas.tseries.offsets.FY5253.weekday GL08" \
- -i "pandas.tseries.offsets.FY5253Quarter PR02" \
- -i "pandas.tseries.offsets.FY5253Quarter.freqstr SA01" \
-i "pandas.tseries.offsets.FY5253Quarter.get_rule_code_suffix GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.get_weeks GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.is_on_offset GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.n GL08" \
- -i "pandas.tseries.offsets.FY5253Quarter.nanos GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.normalize GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.qtr_with_extra_week GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.rule_code GL08" \
@@ -578,139 +495,80 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.tseries.offsets.FY5253Quarter.variation GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.weekday GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.year_has_extra_week GL08" \
- -i "pandas.tseries.offsets.Hour PR02" \
- -i "pandas.tseries.offsets.Hour.freqstr SA01" \
-i "pandas.tseries.offsets.Hour.is_on_offset GL08" \
-i "pandas.tseries.offsets.Hour.n GL08" \
- -i "pandas.tseries.offsets.Hour.nanos SA01" \
-i "pandas.tseries.offsets.Hour.normalize GL08" \
- -i "pandas.tseries.offsets.Hour.rule_code GL08" \
- -i "pandas.tseries.offsets.LastWeekOfMonth PR02,SA01" \
- -i "pandas.tseries.offsets.LastWeekOfMonth.freqstr SA01" \
+ -i "pandas.tseries.offsets.LastWeekOfMonth SA01" \
-i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \
-i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \
- -i "pandas.tseries.offsets.LastWeekOfMonth.nanos GL08" \
-i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \
- -i "pandas.tseries.offsets.LastWeekOfMonth.rule_code GL08" \
-i "pandas.tseries.offsets.LastWeekOfMonth.week GL08" \
-i "pandas.tseries.offsets.LastWeekOfMonth.weekday GL08" \
- -i "pandas.tseries.offsets.Micro PR02" \
- -i "pandas.tseries.offsets.Micro.freqstr SA01" \
-i "pandas.tseries.offsets.Micro.is_on_offset GL08" \
-i "pandas.tseries.offsets.Micro.n GL08" \
- -i "pandas.tseries.offsets.Micro.nanos SA01" \
-i "pandas.tseries.offsets.Micro.normalize GL08" \
- -i "pandas.tseries.offsets.Micro.rule_code GL08" \
- -i "pandas.tseries.offsets.Milli PR02" \
- -i "pandas.tseries.offsets.Milli.freqstr SA01" \
-i "pandas.tseries.offsets.Milli.is_on_offset GL08" \
-i "pandas.tseries.offsets.Milli.n GL08" \
- -i "pandas.tseries.offsets.Milli.nanos SA01" \
-i "pandas.tseries.offsets.Milli.normalize GL08" \
- -i "pandas.tseries.offsets.Milli.rule_code GL08" \
- -i "pandas.tseries.offsets.Minute PR02" \
- -i "pandas.tseries.offsets.Minute.freqstr SA01" \
-i "pandas.tseries.offsets.Minute.is_on_offset GL08" \
-i "pandas.tseries.offsets.Minute.n GL08" \
- -i "pandas.tseries.offsets.Minute.nanos SA01" \
-i "pandas.tseries.offsets.Minute.normalize GL08" \
- -i "pandas.tseries.offsets.Minute.rule_code GL08" \
- -i "pandas.tseries.offsets.MonthBegin PR02" \
- -i "pandas.tseries.offsets.MonthBegin.freqstr SA01" \
-i "pandas.tseries.offsets.MonthBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.MonthBegin.n GL08" \
- -i "pandas.tseries.offsets.MonthBegin.nanos GL08" \
-i "pandas.tseries.offsets.MonthBegin.normalize GL08" \
- -i "pandas.tseries.offsets.MonthBegin.rule_code GL08" \
- -i "pandas.tseries.offsets.MonthEnd.freqstr SA01" \
-i "pandas.tseries.offsets.MonthEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.MonthEnd.n GL08" \
- -i "pandas.tseries.offsets.MonthEnd.nanos GL08" \
-i "pandas.tseries.offsets.MonthEnd.normalize GL08" \
- -i "pandas.tseries.offsets.MonthEnd.rule_code GL08" \
- -i "pandas.tseries.offsets.Nano PR02" \
- -i "pandas.tseries.offsets.Nano.freqstr SA01" \
-i "pandas.tseries.offsets.Nano.is_on_offset GL08" \
-i "pandas.tseries.offsets.Nano.n GL08" \
- -i "pandas.tseries.offsets.Nano.nanos SA01" \
-i "pandas.tseries.offsets.Nano.normalize GL08" \
- -i "pandas.tseries.offsets.Nano.rule_code GL08" \
- -i "pandas.tseries.offsets.QuarterBegin PR02" \
- -i "pandas.tseries.offsets.QuarterBegin.freqstr SA01" \
-i "pandas.tseries.offsets.QuarterBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.QuarterBegin.n GL08" \
- -i "pandas.tseries.offsets.QuarterBegin.nanos GL08" \
-i "pandas.tseries.offsets.QuarterBegin.normalize GL08" \
-i "pandas.tseries.offsets.QuarterBegin.rule_code GL08" \
-i "pandas.tseries.offsets.QuarterBegin.startingMonth GL08" \
- -i "pandas.tseries.offsets.QuarterEnd.freqstr SA01" \
-i "pandas.tseries.offsets.QuarterEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.QuarterEnd.n GL08" \
- -i "pandas.tseries.offsets.QuarterEnd.nanos GL08" \
-i "pandas.tseries.offsets.QuarterEnd.normalize GL08" \
-i "pandas.tseries.offsets.QuarterEnd.rule_code GL08" \
-i "pandas.tseries.offsets.QuarterEnd.startingMonth GL08" \
- -i "pandas.tseries.offsets.Second PR02" \
- -i "pandas.tseries.offsets.Second.freqstr SA01" \
-i "pandas.tseries.offsets.Second.is_on_offset GL08" \
-i "pandas.tseries.offsets.Second.n GL08" \
- -i "pandas.tseries.offsets.Second.nanos SA01" \
-i "pandas.tseries.offsets.Second.normalize GL08" \
- -i "pandas.tseries.offsets.Second.rule_code GL08" \
- -i "pandas.tseries.offsets.SemiMonthBegin PR02,SA01" \
+ -i "pandas.tseries.offsets.SemiMonthBegin SA01" \
-i "pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \
- -i "pandas.tseries.offsets.SemiMonthBegin.freqstr SA01" \
-i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \
- -i "pandas.tseries.offsets.SemiMonthBegin.nanos GL08" \
-i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \
-i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \
-i "pandas.tseries.offsets.SemiMonthEnd SA01" \
-i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \
- -i "pandas.tseries.offsets.SemiMonthEnd.freqstr SA01" \
-i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \
- -i "pandas.tseries.offsets.SemiMonthEnd.nanos GL08" \
-i "pandas.tseries.offsets.SemiMonthEnd.normalize GL08" \
-i "pandas.tseries.offsets.SemiMonthEnd.rule_code GL08" \
-i "pandas.tseries.offsets.Tick GL08" \
- -i "pandas.tseries.offsets.Tick.freqstr SA01" \
-i "pandas.tseries.offsets.Tick.is_on_offset GL08" \
-i "pandas.tseries.offsets.Tick.n GL08" \
- -i "pandas.tseries.offsets.Tick.nanos SA01" \
-i "pandas.tseries.offsets.Tick.normalize GL08" \
- -i "pandas.tseries.offsets.Tick.rule_code GL08" \
- -i "pandas.tseries.offsets.Week PR02" \
- -i "pandas.tseries.offsets.Week.freqstr SA01" \
-i "pandas.tseries.offsets.Week.is_on_offset GL08" \
-i "pandas.tseries.offsets.Week.n GL08" \
- -i "pandas.tseries.offsets.Week.nanos GL08" \
-i "pandas.tseries.offsets.Week.normalize GL08" \
- -i "pandas.tseries.offsets.Week.rule_code GL08" \
-i "pandas.tseries.offsets.Week.weekday GL08" \
- -i "pandas.tseries.offsets.WeekOfMonth PR02,SA01" \
- -i "pandas.tseries.offsets.WeekOfMonth.freqstr SA01" \
+ -i "pandas.tseries.offsets.WeekOfMonth SA01" \
-i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \
-i "pandas.tseries.offsets.WeekOfMonth.n GL08" \
- -i "pandas.tseries.offsets.WeekOfMonth.nanos GL08" \
-i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \
- -i "pandas.tseries.offsets.WeekOfMonth.rule_code GL08" \
-i "pandas.tseries.offsets.WeekOfMonth.week GL08" \
-i "pandas.tseries.offsets.WeekOfMonth.weekday GL08" \
- -i "pandas.tseries.offsets.YearBegin.freqstr SA01" \
-i "pandas.tseries.offsets.YearBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.YearBegin.month GL08" \
-i "pandas.tseries.offsets.YearBegin.n GL08" \
- -i "pandas.tseries.offsets.YearBegin.nanos GL08" \
-i "pandas.tseries.offsets.YearBegin.normalize GL08" \
- -i "pandas.tseries.offsets.YearBegin.rule_code GL08" \
- -i "pandas.tseries.offsets.YearEnd.freqstr SA01" \
-i "pandas.tseries.offsets.YearEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.YearEnd.month GL08" \
-i "pandas.tseries.offsets.YearEnd.n GL08" \
- -i "pandas.tseries.offsets.YearEnd.nanos GL08" \
-i "pandas.tseries.offsets.YearEnd.normalize GL08" \
- -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \
-i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function
RET=$(($RET + $?)) ; echo $MSG "DONE"
diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
index 0c46f476893dd..e670356c95637 100644
--- a/ci/deps/actions-310-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -23,7 +23,6 @@ dependencies:
# required dependencies
- python-dateutil=2.8.2
- numpy=1.23.5
- - pytz=2020.1
# optional dependencies
- beautifulsoup4=4.11.2
@@ -49,6 +48,7 @@ dependencies:
- pyreadstat=1.2.0
- pytables=3.8.0
- python-calamine=0.1.7
+ - pytz=2023.4
- pyxlsb=1.0.10
- s3fs=2022.11.0
- scipy=1.10.0
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 0af46752f5b3d..c33c0344e742f 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -21,7 +21,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -47,6 +46,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 1a842c7212c1f..8692b6e35ab2d 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -22,7 +22,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -48,6 +47,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml
index 748cfa861ec32..996ce5cd9ab94 100644
--- a/ci/deps/actions-311-numpydev.yaml
+++ b/ci/deps/actions-311-numpydev.yaml
@@ -18,7 +18,6 @@ dependencies:
# pandas dependencies
- python-dateutil
- - pytz
- pip
- pip:
diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
index 469fb1bfb9138..434f1d4f7fed2 100644
--- a/ci/deps/actions-311-pyarrownightly.yaml
+++ b/ci/deps/actions-311-pyarrownightly.yaml
@@ -19,7 +19,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy<2
- - pytz
- pip
- pip:
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index 75394e2c8e109..8e7d9aba7878d 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -21,7 +21,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -47,6 +46,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
index d4b43ddef3601..6c97960a62d40 100644
--- a/ci/deps/actions-312.yaml
+++ b/ci/deps/actions-312.yaml
@@ -21,7 +21,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -47,6 +46,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml
index b0ae9f1e48473..c157d2e65c001 100644
--- a/ci/deps/actions-pypy-39.yaml
+++ b/ci/deps/actions-pypy-39.yaml
@@ -22,6 +22,5 @@ dependencies:
# required
- numpy
- python-dateutil
- - pytz
- pip:
- tzdata>=2022.7
diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml
index 18535d81e6985..c86534871b3d2 100644
--- a/ci/deps/circle-311-arm64.yaml
+++ b/ci/deps/circle-311-arm64.yaml
@@ -21,7 +21,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -47,6 +46,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/meta.yaml b/ci/meta.yaml
index b76bef2f630b7..9d434991b12c1 100644
--- a/ci/meta.yaml
+++ b/ci/meta.yaml
@@ -37,7 +37,6 @@ requirements:
- numpy >=1.21.6 # [py<311]
- numpy >=1.23.2 # [py>=311]
- python-dateutil >=2.8.2
- - pytz >=2020.1
- python-tzdata >=2022.7
test:
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index 28129440b86d7..277f407ae4418 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -762,8 +762,7 @@ install pandas) by typing::
your installation is probably fine and you can start contributing!
Often it is worth running only a subset of tests first around your changes before running the
-entire suite (tip: you can use the `pandas-coverage app `_)
-to find out which tests hit the lines of code you've modified, and then run only those).
+entire suite.
The easiest way to do this is with::
diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst
index 0b8c1e16dce0e..e174eea00ca60 100644
--- a/doc/source/development/contributing_docstring.rst
+++ b/doc/source/development/contributing_docstring.rst
@@ -142,7 +142,7 @@ backticks. The following are considered inline code:
With several mistakes in the docstring.
- It has a blank like after the signature ``def func():``.
+ It has a blank line after the signature ``def func():``.
The text 'Some function' should go in the line after the
opening quotes of the docstring, not in the same line.
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 86ce05fde547b..8e6cb9e9a132d 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -205,7 +205,6 @@ Package Minimum support
================================================================ ==========================
`NumPy `__ 1.23.5
`python-dateutil `__ 2.8.2
-`pytz `__ 2020.1
`tzdata `__ 2022.7
================================================================ ==========================
@@ -419,3 +418,14 @@ Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
Zstandard 0.19.0 compression Zstandard compression
========================= ================== =============== =============================================================
+
+Timezone
+^^^^^^^^
+
+Installable with ``pip install "pandas[timezone]"``
+
+========================= ================== =================== =============================================================
+Dependency Minimum Version pip extra Notes
+========================= ================== =================== =============================================================
+pytz 2023.4 timezone Alternative timezone library to ``zoneinfo``.
+========================= ================== =================== =============================================================
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index 0845417e4910d..4299dca4774b9 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -2569,7 +2569,7 @@ Ambiguous times when localizing
because daylight savings time (DST) in a local time zone causes some times to occur
twice within one day ("clocks fall back"). The following options are available:
-* ``'raise'``: Raises a ``pytz.AmbiguousTimeError`` (the default behavior)
+* ``'raise'``: Raises a ``ValueError`` (the default behavior)
* ``'infer'``: Attempt to determine the correct offset base on the monotonicity of the timestamps
* ``'NaT'``: Replaces ambiguous times with ``NaT``
* ``bool``: ``True`` represents a DST time, ``False`` represents non-DST time. An array-like of ``bool`` values is supported for a sequence of times.
@@ -2604,7 +2604,7 @@ A DST transition may also shift the local time ahead by 1 hour creating nonexist
local times ("clocks spring forward"). The behavior of localizing a timeseries with nonexistent times
can be controlled by the ``nonexistent`` argument. The following options are available:
-* ``'raise'``: Raises a ``pytz.NonExistentTimeError`` (the default behavior)
+* ``'raise'``: Raises a ``ValueError`` (the default behavior)
* ``'NaT'``: Replaces nonexistent times with ``NaT``
* ``'shift_forward'``: Shifts nonexistent times forward to the closest real time
* ``'shift_backward'``: Shifts nonexistent times backward to the closest real time
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 3de65fe6f682c..f25edd39cf7da 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -50,8 +50,10 @@ Other enhancements
- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
+- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
+- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
@@ -220,6 +222,8 @@ Optional libraries below the lowest tested version may still work, but are not c
+------------------------+---------------------+
| Package | New Minimum Version |
+========================+=====================+
+| pytz | 2023.4 |
++------------------------+---------------------+
| fastparquet | 2023.10.0 |
+------------------------+---------------------+
| adbc-driver-postgresql | 0.10.0 |
@@ -229,6 +233,37 @@ Optional libraries below the lowest tested version may still work, but are not c
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
+.. _whatsnew_300.api_breaking.pytz:
+
+``pytz`` now an optional dependency
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+pandas now uses :py:mod:`zoneinfo` from the standard library as the default timezone implementation when passing a timezone
+string to various methods. (:issue:`34916`)
+
+*Old behavior:*
+
+.. code-block:: ipython
+
+ In [1]: ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific")
+ In [2]: ts.tz
+
+
+*New behavior:*
+
+.. ipython:: python
+
+ ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific")
+ ts.tz
+
+``pytz`` timezone objects are still supported when passed directly, but they will no longer be returned by default
+from string inputs. Moreover, ``pytz`` is no longer a required dependency of pandas, but can be installed
+with the pip extra ``pip install pandas[timezone]``.
+
+
+Additionally, pandas no longer throws ``pytz`` exceptions for timezone operations leading to ambiguous or nonexistent
+times. These cases will now raise a ``ValueError``.
+
.. _whatsnew_300.api_breaking.other:
Other API changes
@@ -618,7 +653,9 @@ Groupby/resample/rolling
Reshaping
^^^^^^^^^
+- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
+- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)
@@ -631,6 +668,7 @@ ExtensionArray
^^^^^^^^^^^^^^
- Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
+- Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`)
Styler
diff --git a/environment.yml b/environment.yml
index e5646af07c45c..34bc0591ca8df 100644
--- a/environment.yml
+++ b/environment.yml
@@ -24,7 +24,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy<2
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -50,6 +49,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 3ee6f6abf97bf..05547e50bbb37 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -3,7 +3,7 @@
__docformat__ = "restructuredtext"
# Let users know if they're missing any of our hard dependencies
-_hard_dependencies = ("numpy", "pytz", "dateutil")
+_hard_dependencies = ("numpy", "dateutil")
_missing_dependencies = []
for _dependency in _hard_dependencies:
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index 51794ec04b29e..4ed2d4c3be692 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -426,6 +426,11 @@ def option_context(*args) -> Generator[None, None, None]:
None
No return value.
+ Yields
+ ------
+ None
+ No yield value.
+
See Also
--------
get_option : Retrieve the value of the specified option.
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index 0fadbbbed2c72..a635dd33f8420 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -69,6 +69,7 @@ from pandas._libs.tslibs.timestamps cimport _Timestamp
from pandas._libs.tslibs.timezones cimport (
get_utcoffset,
is_utc,
+ treat_tz_as_pytz,
)
from pandas._libs.tslibs.tzconversion cimport (
Localizer,
@@ -747,11 +748,17 @@ cdef datetime _localize_pydatetime(datetime dt, tzinfo tz):
identically, i.e. discards nanos from Timestamps.
It also assumes that the `tz` input is not None.
"""
- try:
+ if treat_tz_as_pytz(tz):
+ import pytz
+
# datetime.replace with pytz may be incorrect result
# TODO: try to respect `fold` attribute
- return tz.localize(dt, is_dst=None)
- except AttributeError:
+ try:
+ return tz.localize(dt, is_dst=None)
+ except (pytz.AmbiguousTimeError, pytz.NonExistentTimeError) as err:
+ # As of pandas 3.0, we raise ValueErrors instead of pytz exceptions
+ raise ValueError(str(err)) from err
+ else:
return dt.replace(tzinfo=tz)
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index 6ae5a96c428c2..3cb4dda1cd273 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -602,7 +602,24 @@ class NaTType(_NaT):
utctimetuple = _make_error_func(
"utctimetuple",
"""
- Return UTC time tuple, compatible with time.localtime().
+ Return UTC time tuple, compatible with `time.localtime()`.
+
+ This method converts the Timestamp to UTC and returns a time tuple
+ containing 9 components: year, month, day, hour, minute, second,
+ weekday, day of year, and DST flag. This is particularly useful for
+ converting a Timestamp to a format compatible with time module functions.
+
+ Returns
+ -------
+ time.struct_time
+ A time.struct_time object representing the UTC time.
+
+ See Also
+ --------
+ datetime.datetime.utctimetuple :
+ Return UTC time tuple, compatible with time.localtime().
+ Timestamp.timetuple : Return time tuple of local time.
+ time.struct_time : Time tuple structure used by time functions.
Examples
--------
@@ -619,6 +636,22 @@ class NaTType(_NaT):
"""
Return utc offset.
+ This method returns the difference between UTC and the local time
+ as a `timedelta` object. It is useful for understanding the time
+ difference between the current timezone and UTC.
+
+ Returns
+ -------
+ timedelta
+ The difference between UTC and the local time as a `timedelta` object.
+
+ See Also
+ --------
+ datetime.datetime.utcoffset :
+ Standard library method to get the UTC offset of a datetime object.
+ Timestamp.tzname : Return the name of the timezone.
+ Timestamp.dst : Return the daylight saving time (DST) adjustment.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -633,6 +666,13 @@ class NaTType(_NaT):
"""
Return time zone name.
+ This method returns the name of the Timestamp's time zone as a string.
+
+ See Also
+ --------
+ Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+ Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -772,6 +812,21 @@ class NaTType(_NaT):
Construct a timezone-aware UTC datetime from a POSIX timestamp.
+ This method creates a datetime object from a POSIX timestamp, keeping the
+ Timestamp object's timezone.
+
+ Parameters
+ ----------
+ ts : float
+ POSIX timestamp.
+
+ See Also
+ --------
+ Timestamp.tzname : Return time zone name.
+ Timestamp.utcnow : Return a new Timestamp representing UTC day and time.
+ Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local
+ time from POSIX timestamp.
+
Notes
-----
Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp
@@ -1052,9 +1107,9 @@ class NaTType(_NaT):
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -1065,7 +1120,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -1153,9 +1208,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -1166,7 +1221,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Raises
@@ -1248,9 +1303,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -1261,7 +1316,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Raises
@@ -1412,9 +1467,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \
+ nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \
default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -1427,7 +1482,7 @@ default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -1439,6 +1494,13 @@ default 'raise'
TypeError
If the Timestamp is tz-aware and tz is not None.
+ See Also
+ --------
+ Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+ Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+ DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone.
+ datetime.datetime.astimezone : Convert a datetime object to another time zone.
+
Examples
--------
Create a naive timestamp object:
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index 554c4f109f1c5..c48acc07b34db 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -595,6 +595,24 @@ cdef class BaseOffset:
@property
def rule_code(self) -> str:
+ """
+ Return a string representing the base frequency.
+
+ See Also
+ --------
+ tseries.offsets.Hour.rule_code :
+ Returns a string representing the base frequency of 'h'.
+ tseries.offsets.Day.rule_code :
+ Returns a string representing the base frequency of 'D'.
+
+ Examples
+ --------
+ >>> pd.offsets.Hour().rule_code
+ 'h'
+
+ >>> pd.offsets.Week(5).rule_code
+ 'W'
+ """
return self._prefix
@cache_readonly
@@ -602,6 +620,17 @@ cdef class BaseOffset:
"""
Return a string representing the frequency.
+ See Also
+ --------
+ tseries.offsets.BusinessDay.freqstr :
+ Return a string representing an offset frequency in Business Days.
+ tseries.offsets.BusinessHour.freqstr :
+ Return a string representing an offset frequency in Business Hours.
+ tseries.offsets.Week.freqstr :
+ Return a string representing an offset frequency in Weeks.
+ tseries.offsets.Hour.freqstr :
+ Return a string representing an offset frequency in Hours.
+
Examples
--------
>>> pd.DateOffset(5).freqstr
@@ -779,6 +808,26 @@ cdef class BaseOffset:
@property
def nanos(self):
+ """
+ Returns an integer of the total number of nanoseconds for fixed frequencies.
+
+ Raises
+ ------
+ ValueError
+ If the frequency is non-fixed.
+
+ See Also
+ --------
+ tseries.offsets.Hour.nanos :
+ Returns an integer of the total number of nanoseconds.
+ tseries.offsets.Day.nanos :
+ Returns an integer of the total number of nanoseconds.
+
+ Examples
+ --------
+ >>> pd.offsets.Week(n=1).nanos
+ ValueError: Week: weekday=None is a non-fixed frequency
+ """
raise ValueError(f"{self} is a non-fixed frequency")
# ------------------------------------------------------------------
@@ -986,12 +1035,14 @@ cdef class Tick(SingleConstructorOffset):
@property
def nanos(self) -> int64_t:
"""
- Return an integer of the total number of nanoseconds.
+ Returns an integer of the total number of nanoseconds.
- Raises
- ------
- ValueError
- If the frequency is non-fixed.
+ See Also
+ --------
+ tseries.offsets.Hour.nanos :
+ Returns an integer of the total number of nanoseconds.
+ tseries.offsets.Day.nanos :
+ Returns an integer of the total number of nanoseconds.
Examples
--------
@@ -1147,7 +1198,7 @@ cdef class Hour(Tick):
"""
Offset ``n`` hours.
- Parameters
+ Attributes
----------
n : int, default 1
The number of hours represented.
@@ -1183,7 +1234,7 @@ cdef class Minute(Tick):
"""
Offset ``n`` minutes.
- Parameters
+ Attributes
----------
n : int, default 1
The number of minutes represented.
@@ -1219,7 +1270,7 @@ cdef class Second(Tick):
"""
Offset ``n`` seconds.
- Parameters
+ Attributes
----------
n : int, default 1
The number of seconds represented.
@@ -1255,7 +1306,7 @@ cdef class Milli(Tick):
"""
Offset ``n`` milliseconds.
- Parameters
+ Attributes
----------
n : int, default 1
The number of milliseconds represented.
@@ -1292,7 +1343,7 @@ cdef class Micro(Tick):
"""
Offset ``n`` microseconds.
- Parameters
+ Attributes
----------
n : int, default 1
The number of microseconds represented.
@@ -1329,7 +1380,7 @@ cdef class Nano(Tick):
"""
Offset ``n`` nanoseconds.
- Parameters
+ Attributes
----------
n : int, default 1
The number of nanoseconds represented.
@@ -1616,7 +1667,7 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta):
Besides, adding a DateOffsets specified by the singular form of the date
component can be used to replace certain component of the timestamp.
- Parameters
+ Attributes
----------
n : int, default 1
The number of time periods the offset represents.
@@ -2426,6 +2477,24 @@ cdef class WeekOfMonthMixin(SingleConstructorOffset):
@property
def rule_code(self) -> str:
+ """
+ Return a string representing the base frequency.
+
+ See Also
+ --------
+ tseries.offsets.Hour.rule_code :
+ Returns a string representing the base frequency of 'h'.
+ tseries.offsets.Day.rule_code :
+ Returns a string representing the base frequency of 'D'.
+
+ Examples
+ --------
+ >>> pd.offsets.Week(5).rule_code
+ 'W'
+
+ >>> pd.offsets.WeekOfMonth(n=1, week=0, weekday=0).rule_code
+ 'WOM-1MON'
+ """
weekday = int_to_weekday.get(self.weekday, "")
if self.week == -1:
# LastWeekOfMonth
@@ -2472,6 +2541,24 @@ cdef class YearOffset(SingleConstructorOffset):
@property
def rule_code(self) -> str:
+ """
+ Return a string representing the base frequency.
+
+ See Also
+ --------
+ tseries.offsets.Hour.rule_code :
+ Returns a string representing the base frequency of 'h'.
+ tseries.offsets.Day.rule_code :
+ Returns a string representing the base frequency of 'D'.
+
+ Examples
+ --------
+ >>> pd.tseries.offsets.YearBegin(n=1, month=2).rule_code
+ 'YS-FEB'
+
+ >>> pd.tseries.offsets.YearEnd(n=1, month=6).rule_code
+ 'YE-JUN'
+ """
month = MONTH_ALIASES[self.month]
return f"{self._prefix}-{month}"
@@ -2506,7 +2593,7 @@ cdef class BYearEnd(YearOffset):
"""
DateOffset increments between the last business day of the year.
- Parameters
+ Attributes
----------
n : int, default 1
The number of years represented.
@@ -2804,7 +2891,7 @@ cdef class BQuarterBegin(QuarterOffset):
startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ...
startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ...
- Parameters
+ Attributes
----------
n : int, default 1
The number of quarters represented.
@@ -2886,7 +2973,7 @@ cdef class QuarterBegin(QuarterOffset):
startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ...
startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ...
- Parameters
+ Attributes
----------
n : int, default 1
The number of quarters represented.
@@ -2984,7 +3071,7 @@ cdef class MonthBegin(MonthOffset):
MonthBegin goes to the next date which is a start of the month.
- Parameters
+ Attributes
----------
n : int, default 1
The number of months represented.
@@ -3272,7 +3359,7 @@ cdef class SemiMonthBegin(SemiMonthOffset):
"""
Two DateOffset's per month repeating on the first day of the month & day_of_month.
- Parameters
+ Attributes
----------
n : int, default 1
The number of months represented.
@@ -3304,7 +3391,7 @@ cdef class Week(SingleConstructorOffset):
"""
Weekly offset.
- Parameters
+ Attributes
----------
n : int, default 1
The number of weeks represented.
@@ -3458,6 +3545,24 @@ cdef class Week(SingleConstructorOffset):
@property
def rule_code(self) -> str:
+ """
+ Return a string representing the base frequency.
+
+ See Also
+ --------
+ tseries.offsets.Hour.rule_code :
+ Returns a string representing the base frequency of 'h'.
+ tseries.offsets.Day.rule_code :
+ Returns a string representing the base frequency of 'D'.
+
+ Examples
+ --------
+ >>> pd.offsets.Hour().rule_code
+ 'h'
+
+ >>> pd.offsets.Week(5).rule_code
+ 'W'
+ """
suffix = ""
if self.weekday is not None:
weekday = int_to_weekday[self.weekday]
@@ -3477,7 +3582,7 @@ cdef class WeekOfMonth(WeekOfMonthMixin):
"""
Describes monthly dates like "the Tuesday of the 2nd week of each month".
- Parameters
+ Attributes
----------
n : int, default 1
The number of months represented.
@@ -3554,7 +3659,7 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin):
For example "the last Tuesday of each month".
- Parameters
+ Attributes
----------
n : int, default 1
The number of months represented.
@@ -3694,7 +3799,7 @@ cdef class FY5253(FY5253Mixin):
X is a specific day of the week.
Y is a certain month of the year
- Parameters
+ Attributes
----------
n : int
The number of fiscal years represented.
@@ -3897,7 +4002,7 @@ cdef class FY5253Quarter(FY5253Mixin):
startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ...
- Parameters
+ Attributes
----------
n : int
The number of business quarters represented.
@@ -4132,7 +4237,7 @@ cdef class Easter(SingleConstructorOffset):
Right now uses the revised method which is valid in years 1583-4099.
- Parameters
+ Attributes
----------
n : int, default 1
The number of years represented.
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index c6ba97fe9f1a2..4f5dfc75a20bf 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1913,20 +1913,58 @@ cdef class _Period(PeriodMixin):
Parameters
----------
freq : str, BaseOffset
- The desired frequency. If passing a `str`, it needs to be a
- valid :ref:`period alias `.
+ The target frequency to convert the Period object to.
+ If a string is provided,
+ it must be a valid :ref:`period alias `.
+
how : {'E', 'S', 'end', 'start'}, default 'end'
- Start or end of the timespan.
+ Specifies whether to align the period to the start or end of the interval:
+ - 'E' or 'end': Align to the end of the interval.
+ - 'S' or 'start': Align to the start of the interval.
Returns
-------
- resampled : Period
+ Period : Period object with the specified frequency, aligned to the parameter.
+
+ See Also
+ --------
+ Period.end_time : Return the end Timestamp.
+ Period.start_time : Return the start Timestamp.
+ Period.dayofyear : Return the day of the year.
+ Period.dayofweek : Return the day of the week.
Examples
--------
- >>> period = pd.Period('2023-1-1', freq='D')
+ Convert a daily period to an hourly period, aligning to the end of the day:
+
+ >>> period = pd.Period('2023-01-01', freq='D')
>>> period.asfreq('h')
Period('2023-01-01 23:00', 'h')
+
+ Convert a monthly period to a daily period, aligning to the start of the month:
+
+ >>> period = pd.Period('2023-01', freq='M')
+ >>> period.asfreq('D', how='start')
+ Period('2023-01-01', 'D')
+
+ Convert a yearly period to a monthly period, aligning to the last month:
+
+ >>> period = pd.Period('2023', freq='Y')
+ >>> period.asfreq('M', how='end')
+ Period('2023-12', 'M')
+
+ Convert a monthly period to an hourly period,
+ aligning to the first day of the month:
+
+ >>> period = pd.Period('2023-01', freq='M')
+ >>> period.asfreq('h', how='start')
+ Period('2023-01-01 00:00', 'h')
+
+ Convert a weekly period to a daily period, aligning to the last day of the week:
+
+ >>> period = pd.Period('2023-08-01', freq='W')
+ >>> period.asfreq('D', how='end')
+ Period('2023-08-06', 'D')
"""
freq = self._maybe_convert_freq(freq)
how = validate_end_alias(how)
@@ -2000,11 +2038,44 @@ cdef class _Period(PeriodMixin):
"""
Return the year this Period falls on.
+ Returns
+ -------
+ int
+
+ See Also
+ --------
+ period.month : Get the month of the year for the given Period.
+ period.day : Return the day of the month the Period falls on.
+
+ Notes
+ -----
+ The year is based on the `ordinal` and `base` attributes of the Period.
+
Examples
--------
- >>> period = pd.Period('2022-01', 'M')
+ Create a Period object for January 2023 and get the year:
+
+ >>> period = pd.Period('2023-01', 'M')
>>> period.year
- 2022
+ 2023
+
+ Create a Period object for 01 January 2023 and get the year:
+
+ >>> period = pd.Period('2023', 'D')
+ >>> period.year
+ 2023
+
+ Get the year for a period representing a quarter:
+
+ >>> period = pd.Period('2023Q2', 'Q')
+ >>> period.year
+ 2023
+
+ Handle a case where the Period object is empty, which results in `NaN`:
+
+ >>> period = pd.Period('nan', 'M')
+ >>> period.year
+ nan
"""
base = self._dtype._dtype_code
return pyear(self.ordinal, base)
@@ -2014,11 +2085,45 @@ cdef class _Period(PeriodMixin):
"""
Return the month this Period falls on.
+ Returns
+ -------
+ int
+
+ See Also
+ --------
+ period.week : Get the week of the year on the given Period.
+ Period.year : Return the year this Period falls on.
+ Period.day : Return the day of the month this Period falls on.
+
+ Notes
+ -----
+ The month is based on the `ordinal` and `base` attributes of the Period.
+
Examples
--------
+ Create a Period object for January 2022 and get the month:
+
>>> period = pd.Period('2022-01', 'M')
>>> period.month
1
+
+ For a yearly period, ``month`` returns the month of the period's end:
+
+ >>> period = pd.Period('2022', 'Y')
+ >>> period.month
+ 12
+
+ Create a Period object with a specified frequency but an incomplete date string:
+
+ >>> period = pd.Period('2022', 'M')
+ >>> period.month
+ 1
+
+ Handle a case where the Period object is empty, which results in `NaN`:
+
+ >>> period = pd.Period('nan', 'M')
+ >>> period.month
+ nan
"""
base = self._dtype._dtype_code
return pmonth(self.ordinal, base)
diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
index 43279051e2a30..ccb1a1d6870f7 100644
--- a/pandas/_libs/tslibs/strptime.pyx
+++ b/pandas/_libs/tslibs/strptime.pyx
@@ -16,6 +16,7 @@ FUNCTIONS:
strptime -- Calculates the time struct represented by the passed-in string
"""
from datetime import timezone
+import zoneinfo
from cpython.datetime cimport (
PyDate_Check,
@@ -38,7 +39,6 @@ from _thread import allocate_lock as _thread_allocate_lock
import re
import numpy as np
-import pytz
cimport numpy as cnp
from numpy cimport (
@@ -747,7 +747,7 @@ cdef tzinfo _parse_with_format(
week_of_year_start = 0
elif parse_code == 17:
# e.g. val='2011-12-30T00:00:00.000000UTC'; fmt='%Y-%m-%dT%H:%M:%S.%f%Z'
- tz = pytz.timezone(found_dict["Z"])
+ tz = zoneinfo.ZoneInfo(found_dict["Z"])
elif parse_code == 19:
# e.g. val='March 1, 2018 12:00:00+0400'; fmt='%B %d, %Y %H:%M:%S%z'
tz = parse_timezone_directive(found_dict["z"])
@@ -837,7 +837,7 @@ class TimeRE(_TimeRE):
if key == "Z":
# lazy computation
if self._Z is None:
- self._Z = self.__seqToRE(pytz.all_timezones, "Z")
+ self._Z = self.__seqToRE(zoneinfo.available_timezones(), "Z")
# Note: handling Z is the key difference vs using the stdlib
# _strptime.TimeRE. test_to_datetime_parse_tzname_or_tzoffset with
# fmt='%Y-%m-%d %H:%M:%S %Z' fails with the stdlib version.
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 7cb9c852ea1e3..1cbb24084a62b 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -254,6 +254,28 @@ cdef class _Timestamp(ABCTimestamp):
"""
The abbreviation associated with self._creso.
+ This property returns a string representing the time unit of the Timestamp's
+ resolution. It corresponds to the smallest time unit that can be represented
+ by this Timestamp object. The possible values are:
+ - 's' (second)
+ - 'ms' (millisecond)
+ - 'us' (microsecond)
+ - 'ns' (nanosecond)
+
+ Returns
+ -------
+ str
+ A string abbreviation of the Timestamp's resolution unit:
+ - 's' for second
+ - 'ms' for millisecond
+ - 'us' for microsecond
+ - 'ns' for nanosecond
+
+ See Also
+ --------
+ Timestamp.resolution : Return resolution of the Timestamp.
+ Timedelta : A duration expressing the difference between two dates or times.
+
Examples
--------
>>> pd.Timestamp("2020-01-01 12:34:56").unit
@@ -1590,6 +1612,21 @@ class Timestamp(_Timestamp):
Construct a timezone-aware UTC datetime from a POSIX timestamp.
+ This method creates a datetime object from a POSIX timestamp, keeping the
+ Timestamp object's timezone.
+
+ Parameters
+ ----------
+ ts : float
+ POSIX timestamp.
+
+ See Also
+ --------
+ Timestamp.tzname : Return time zone name.
+ Timestamp.utcnow : Return a new Timestamp representing UTC day and time.
+ Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local
+ time from POSIX timestamp.
+
Notes
-----
Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp
@@ -1765,6 +1802,13 @@ class Timestamp(_Timestamp):
"""
Return time zone name.
+ This method returns the name of the Timestamp's time zone as a string.
+
+ See Also
+ --------
+ Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+ Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -1779,6 +1823,22 @@ class Timestamp(_Timestamp):
"""
Return utc offset.
+ This method returns the difference between UTC and the local time
+ as a `timedelta` object. It is useful for understanding the time
+ difference between the current timezone and UTC.
+
+ Returns
+ -------
+ timedelta
+ The difference between UTC and the local time as a `timedelta` object.
+
+ See Also
+ --------
+ datetime.datetime.utcoffset :
+ Standard library method to get the UTC offset of a datetime object.
+ Timestamp.tzname : Return the name of the timezone.
+ Timestamp.dst : Return the daylight saving time (DST) adjustment.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -1791,7 +1851,24 @@ class Timestamp(_Timestamp):
def utctimetuple(self):
"""
- Return UTC time tuple, compatible with time.localtime().
+ Return UTC time tuple, compatible with `time.localtime()`.
+
+ This method converts the Timestamp to UTC and returns a time tuple
+ containing 9 components: year, month, day, hour, minute, second,
+ weekday, day of year, and DST flag. This is particularly useful for
+ converting a Timestamp to a format compatible with time module functions.
+
+ Returns
+ -------
+ time.struct_time
+ A time.struct_time object representing the UTC time.
+
+ See Also
+ --------
+ datetime.datetime.utctimetuple :
+ Return UTC time tuple, compatible with time.localtime().
+ Timestamp.timetuple : Return time tuple of local time.
+ time.struct_time : Time tuple structure used by time functions.
Examples
--------
@@ -2134,9 +2211,9 @@ class Timestamp(_Timestamp):
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -2147,7 +2224,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -2237,9 +2314,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -2250,7 +2327,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Raises
@@ -2332,9 +2409,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -2345,7 +2422,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Raises
@@ -2471,9 +2548,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \
+ nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \
default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -2486,7 +2563,7 @@ default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -2498,6 +2575,13 @@ default 'raise'
TypeError
If the Timestamp is tz-aware and tz is not None.
+ See Also
+ --------
+ Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+ Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+ DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone.
+ datetime.datetime.astimezone : Convert a datetime object to another time zone.
+
Examples
--------
Create a naive timestamp object:
diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx
index 6292b6ce0fd1d..36b644ffc826d 100644
--- a/pandas/_libs/tslibs/timezones.pyx
+++ b/pandas/_libs/tslibs/timezones.pyx
@@ -2,17 +2,10 @@ from datetime import (
timedelta,
timezone,
)
+import zoneinfo
from pandas.compat._optional import import_optional_dependency
-try:
- # py39+
- import zoneinfo
- from zoneinfo import ZoneInfo
-except ImportError:
- zoneinfo = None
- ZoneInfo = None
-
from cpython.datetime cimport (
datetime,
timedelta,
@@ -28,8 +21,8 @@ from dateutil.tz import (
tzutc as _dateutil_tzutc,
)
import numpy as np
-import pytz
-from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo
+
+pytz = import_optional_dependency("pytz", errors="ignore")
cimport numpy as cnp
from numpy cimport int64_t
@@ -45,10 +38,11 @@ from pandas._libs.tslibs.util cimport (
cdef int64_t NPY_NAT = get_nat()
cdef tzinfo utc_stdlib = timezone.utc
-cdef tzinfo utc_pytz = pytz.utc
+cdef tzinfo utc_pytz = pytz.UTC if pytz else None
cdef tzinfo utc_dateutil_str = dateutil_gettz("UTC") # NB: *not* the same as tzutc()
cdef tzinfo utc_zoneinfo = None
+cdef type ZoneInfo = zoneinfo.ZoneInfo
# ----------------------------------------------------------------------
@@ -56,13 +50,13 @@ cdef tzinfo utc_zoneinfo = None
cdef bint is_utc_zoneinfo(tzinfo tz):
# Workaround for cases with missing tzdata
# https://github.com/pandas-dev/pandas/pull/46425#discussion_r830633025
- if tz is None or zoneinfo is None:
+ if tz is None:
return False
global utc_zoneinfo
if utc_zoneinfo is None:
try:
- utc_zoneinfo = ZoneInfo("UTC")
+ utc_zoneinfo = zoneinfo.ZoneInfo("UTC")
except zoneinfo.ZoneInfoNotFoundError:
return False
# Warn if tzdata is too old, even if there is a system tzdata to alert
@@ -74,17 +68,15 @@ cdef bint is_utc_zoneinfo(tzinfo tz):
cpdef inline bint is_utc(tzinfo tz):
return (
- tz is utc_pytz
- or tz is utc_stdlib
+ tz is utc_stdlib
or isinstance(tz, _dateutil_tzutc)
or tz is utc_dateutil_str
or is_utc_zoneinfo(tz)
+ or (utc_pytz is not None and tz is utc_pytz)
)
cdef bint is_zoneinfo(tzinfo tz):
- if ZoneInfo is None:
- return False
return isinstance(tz, ZoneInfo)
@@ -166,7 +158,7 @@ cpdef inline tzinfo maybe_get_tz(object tz):
elif tz == "UTC" or tz == "utc":
tz = utc_stdlib
else:
- tz = pytz.timezone(tz)
+ tz = zoneinfo.ZoneInfo(tz)
elif is_integer_object(tz):
tz = timezone(timedelta(seconds=tz))
elif isinstance(tz, tzinfo):
@@ -205,7 +197,7 @@ cdef object tz_cache_key(tzinfo tz):
the same tz file). Also, pytz objects are not always hashable so we use
str(tz) instead.
"""
- if isinstance(tz, _pytz_BaseTzInfo):
+ if pytz is not None and isinstance(tz, pytz.tzinfo.BaseTzInfo):
return tz.zone
elif isinstance(tz, _dateutil_tzfile):
if ".tar.gz" in tz._filename:
@@ -239,7 +231,7 @@ cpdef inline bint is_fixed_offset(tzinfo tz):
return 1
else:
return 0
- elif treat_tz_as_pytz(tz):
+ elif treat_tz_as_pytz(tz) and pytz is not None:
if (len(tz._transition_info) == 0
and len(tz._utc_transition_times) == 0):
return 1
diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx
index e3facd3d9599b..c100f315e9a19 100644
--- a/pandas/_libs/tslibs/tzconversion.pyx
+++ b/pandas/_libs/tslibs/tzconversion.pyx
@@ -15,7 +15,6 @@ from cython cimport Py_ssize_t
import_datetime()
import numpy as np
-import pytz
cimport numpy as cnp
from numpy cimport (
@@ -196,8 +195,8 @@ def tz_localize_to_utc(
NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns,
):
"""
- Localize tzinfo-naive i8 to given time zone (using pytz). If
- there are ambiguities in the values, raise AmbiguousTimeError.
+ Localize tzinfo-naive i8 to given time zone. If
+ there are ambiguities in the values, raise ValueError.
Parameters
----------
@@ -368,7 +367,7 @@ timedelta-like}
result[i] = NPY_NAT
else:
stamp = _render_tstamp(val, creso=creso)
- raise pytz.AmbiguousTimeError(
+ raise ValueError(
f"Cannot infer dst time from {stamp}, try using the "
"'ambiguous' argument"
)
@@ -428,7 +427,10 @@ timedelta-like}
result[i] = NPY_NAT
else:
stamp = _render_tstamp(val, creso=creso)
- raise pytz.NonExistentTimeError(stamp)
+ raise ValueError(
+ f"{stamp} is a nonexistent time due to daylight savings time. "
+ "Try using the 'nonexistent' argument."
+ )
return result.base # .base to get underlying ndarray
@@ -631,7 +633,7 @@ cdef ndarray[int64_t] _get_dst_hours(
if trans_idx.size == 1:
# see test_tz_localize_to_utc_ambiguous_infer
stamp = _render_tstamp(vals[trans_idx[0]], creso=creso)
- raise pytz.AmbiguousTimeError(
+ raise ValueError(
f"Cannot infer dst time from {stamp} as there "
"are no repeated times"
)
@@ -653,14 +655,16 @@ cdef ndarray[int64_t] _get_dst_hours(
if grp.size == 1 or np.all(delta > 0):
# see test_tz_localize_to_utc_ambiguous_infer
stamp = _render_tstamp(vals[grp[0]], creso=creso)
- raise pytz.AmbiguousTimeError(stamp)
+ raise ValueError(
+ f"{stamp} is an ambiguous time and cannot be inferred."
+ )
# Find the index for the switch and pull from a for dst and b
# for standard
switch_idxs = (delta <= 0).nonzero()[0]
if switch_idxs.size > 1:
# see test_tz_localize_to_utc_ambiguous_infer
- raise pytz.AmbiguousTimeError(
+ raise ValueError(
f"There are {switch_idxs.size} dst switches when "
"there should only be 1."
)
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 288559d386a71..756c209661fbb 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -33,6 +33,7 @@
pa_version_under14p1,
pa_version_under16p0,
pa_version_under17p0,
+ pa_version_under18p0,
)
if TYPE_CHECKING:
@@ -157,6 +158,7 @@ def is_ci_environment() -> bool:
"pa_version_under14p1",
"pa_version_under16p0",
"pa_version_under17p0",
+ "pa_version_under18p0",
"HAS_PYARROW",
"IS64",
"ISMUSL",
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 06082e71af32a..6b90389a62056 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -43,6 +43,7 @@
"pyreadstat": "1.2.0",
"pytest": "7.3.2",
"python-calamine": "0.1.7",
+ "pytz": "2023.4",
"pyxlsb": "1.0.10",
"s3fs": "2022.11.0",
"scipy": "1.10.0",
diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py
index ebfc0d69d9655..bd009b544f31e 100644
--- a/pandas/compat/pyarrow.py
+++ b/pandas/compat/pyarrow.py
@@ -17,6 +17,7 @@
pa_version_under15p0 = _palv < Version("15.0.0")
pa_version_under16p0 = _palv < Version("16.0.0")
pa_version_under17p0 = _palv < Version("17.0.0")
+ pa_version_under18p0 = _palv < Version("18.0.0")
HAS_PYARROW = True
except ImportError:
pa_version_under10p1 = True
@@ -28,4 +29,5 @@
pa_version_under15p0 = True
pa_version_under16p0 = True
pa_version_under17p0 = True
+ pa_version_under18p0 = True
HAS_PYARROW = False
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 7c485515f0784..d11213f1164bc 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -32,7 +32,10 @@
import gc
import operator
import os
-from typing import TYPE_CHECKING
+from typing import (
+ TYPE_CHECKING,
+ Any,
+)
import uuid
from dateutil.tz import (
@@ -43,11 +46,8 @@
from hypothesis import strategies as st
import numpy as np
import pytest
-from pytz import (
- FixedOffset,
- utc,
-)
+from pandas.compat._optional import import_optional_dependency
import pandas.util._test_decorators as td
from pandas.core.dtypes.dtypes import (
@@ -92,12 +92,7 @@
del pa
has_pyarrow = True
-import zoneinfo
-
-try:
- zoneinfo.ZoneInfo("UTC")
-except zoneinfo.ZoneInfoNotFoundError:
- zoneinfo = None # type: ignore[assignment]
+pytz = import_optional_dependency("pytz", errors="ignore")
# ----------------------------------------------------------------
@@ -1199,19 +1194,19 @@ def deco(*args):
"UTC-02:15",
tzutc(),
tzlocal(),
- FixedOffset(300),
- FixedOffset(0),
- FixedOffset(-300),
timezone.utc,
timezone(timedelta(hours=1)),
timezone(timedelta(hours=-1), name="foo"),
]
-if zoneinfo is not None:
+if pytz is not None:
TIMEZONES.extend(
- [
- zoneinfo.ZoneInfo("US/Pacific"), # type: ignore[list-item]
- zoneinfo.ZoneInfo("UTC"), # type: ignore[list-item]
- ]
+ (
+ pytz.FixedOffset(300),
+ pytz.FixedOffset(0),
+ pytz.FixedOffset(-300),
+ pytz.timezone("US/Pacific"),
+ pytz.timezone("UTC"),
+ )
)
TIMEZONE_IDS = [repr(i) for i in TIMEZONES]
@@ -1234,9 +1229,10 @@ def tz_aware_fixture(request):
return request.param
-_UTCS = ["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]
-if zoneinfo is not None:
- _UTCS.append(zoneinfo.ZoneInfo("UTC"))
+_UTCS = ["utc", "dateutil/UTC", tzutc(), timezone.utc]
+
+if pytz is not None:
+ _UTCS.append(pytz.utc)
@pytest.fixture(params=_UTCS)
@@ -2046,12 +2042,12 @@ def using_infer_string() -> bool:
return pd.options.future.infer_string is True
-warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"]
-if zoneinfo is not None:
- warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type]
+_warsaws: list[Any] = ["Europe/Warsaw", "dateutil/Europe/Warsaw"]
+if pytz is not None:
+ _warsaws.append(pytz.timezone("Europe/Warsaw"))
-@pytest.fixture(params=warsaws)
+@pytest.fixture(params=_warsaws)
def warsaw(request) -> str:
"""
tzinfo for Europe/Warsaw using pytz, dateutil, or zoneinfo.
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 948836bf6a51d..56f8adda93251 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1529,9 +1529,7 @@ def safe_sort(
order2 = sorter.argsort()
if verify:
mask = (codes < -len(values)) | (codes >= len(values))
- codes[mask] = 0
- else:
- mask = None
+ codes[mask] = -1
new_codes = take_nd(order2, codes, fill_value=-1)
else:
reverse_indexer = np.empty(len(sorter), dtype=int)
@@ -1540,14 +1538,6 @@ def safe_sort(
# may deal with them here without performance loss using `mode='wrap'`
new_codes = reverse_indexer.take(codes, mode="wrap")
- if use_na_sentinel:
- mask = codes == -1
- if verify:
- mask = mask | (codes < -len(values)) | (codes >= len(values))
-
- if use_na_sentinel and mask is not None:
- np.putmask(new_codes, mask, -1)
-
return ordered, ensure_platform_int(new_codes)
diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
index 5c933294fb944..b2f78182b9bf0 100644
--- a/pandas/core/array_algos/quantile.py
+++ b/pandas/core/array_algos/quantile.py
@@ -94,9 +94,9 @@ def quantile_with_mask(
flat = np.array([fill_value] * len(qs))
result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
else:
- result = _nanpercentile(
+ result = _nanquantile(
values,
- qs * 100.0,
+ qs,
na_value=fill_value,
mask=mask,
interpolation=interpolation,
@@ -108,7 +108,7 @@ def quantile_with_mask(
return result
-def _nanpercentile_1d(
+def _nanquantile_1d(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
qs: npt.NDArray[np.float64],
@@ -116,7 +116,7 @@ def _nanpercentile_1d(
interpolation: str,
) -> Scalar | np.ndarray:
"""
- Wrapper for np.percentile that skips missing values, specialized to
+ Wrapper for np.quantile that skips missing values, specialized to
1-dimensional case.
Parameters
@@ -142,7 +142,7 @@ def _nanpercentile_1d(
# equiv: 'np.array([na_value] * len(qs))' but much faster
return np.full(len(qs), na_value)
- return np.percentile(
+ return np.quantile(
values,
qs,
# error: No overload variant of "percentile" matches argument
@@ -152,7 +152,7 @@ def _nanpercentile_1d(
)
-def _nanpercentile(
+def _nanquantile(
values: np.ndarray,
qs: npt.NDArray[np.float64],
*,
@@ -161,7 +161,7 @@ def _nanpercentile(
interpolation: str,
):
"""
- Wrapper for np.percentile that skips missing values.
+ Wrapper for np.quantile that skips missing values.
Parameters
----------
@@ -180,7 +180,7 @@ def _nanpercentile(
if values.dtype.kind in "mM":
# need to cast to integer to avoid rounding errors in numpy
- result = _nanpercentile(
+ result = _nanquantile(
values.view("i8"),
qs=qs,
na_value=na_value.view("i8"),
@@ -196,7 +196,7 @@ def _nanpercentile(
# Caller is responsible for ensuring mask shape match
assert mask.shape == values.shape
result = [
- _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
+ _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation)
for (val, m) in zip(list(values), list(mask))
]
if values.dtype.kind == "f":
@@ -215,7 +215,7 @@ def _nanpercentile(
result = result.astype(values.dtype, copy=False)
return result
else:
- return np.percentile(
+ return np.quantile(
values,
qs,
axis=1,
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index d07bfeda50e1d..e95fa441e18fb 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -709,7 +709,13 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray:
if isinstance(
other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
- result = pc_func(self._pa_array, self._box_pa(other))
+ try:
+ result = pc_func(self._pa_array, self._box_pa(other))
+ except pa.ArrowNotImplementedError:
+ # TODO: could this be wrong if other is object dtype?
+ # in which case we need to operate pointwise?
+ result = ops.invalid_comparison(self, other, op)
+ result = pa.array(result, type=pa.bool_())
elif is_scalar(other):
try:
result = pc_func(self._pa_array, self._box_pa(other))
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index ad0bde3abbdd4..fbe1677b95b33 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -19,6 +19,7 @@
import numpy as np
+from pandas._config import using_string_dtype
from pandas._config.config import get_option
from pandas._libs import (
@@ -1759,6 +1760,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
dtype='object')
"""
result = self._format_native_types(date_format=date_format, na_rep=np.nan)
+ if using_string_dtype():
+ from pandas import StringDtype
+
+ return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value]
return result.astype(object, copy=False)
@@ -1781,7 +1786,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
a non-DST time (note that this flag is only applicable for
ambiguous times)
- 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+ - 'raise' will raise a ValueError if there are ambiguous
times.
nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise'
@@ -1794,7 +1799,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
closest existing time
- 'NaT' will return NaT where there are nonexistent times
- timedelta objects will shift nonexistent times by the timedelta
- - 'raise' will raise an NonExistentTimeError if there are
+ - 'raise' will raise a ValueError if there are
nonexistent times.
Returns
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index dddfc440109d3..201c449185057 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -15,6 +15,7 @@
import numpy as np
+from pandas._config import using_string_dtype
from pandas._config.config import get_option
from pandas._libs import (
@@ -158,15 +159,8 @@ def f(self):
# these return a boolean by-definition
return result
- if field in self._object_ops:
- result = fields.get_date_name_field(values, field, reso=self._creso)
- result = self._maybe_mask_results(result, fill_value=None)
-
- else:
- result = fields.get_date_field(values, field, reso=self._creso)
- result = self._maybe_mask_results(
- result, fill_value=None, convert="float64"
- )
+ result = fields.get_date_field(values, field, reso=self._creso)
+ result = self._maybe_mask_results(result, fill_value=None, convert="float64")
return result
@@ -243,7 +237,6 @@ def _scalar_type(self) -> type[Timestamp]:
"is_year_end",
"is_leap_year",
]
- _object_ops: list[str] = ["freq", "tz"]
_field_ops: list[str] = [
"year",
"month",
@@ -264,7 +257,7 @@ def _scalar_type(self) -> type[Timestamp]:
]
_other_ops: list[str] = ["date", "time", "timetz"]
_datetimelike_ops: list[str] = (
- _field_ops + _object_ops + _bool_ops + _other_ops + ["unit"]
+ _field_ops + _bool_ops + _other_ops + ["unit", "freq", "tz"]
)
_datetimelike_methods: list[str] = [
"to_period",
@@ -972,7 +965,7 @@ def tz_localize(
non-DST time (note that this flag is only applicable for
ambiguous times)
- 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+ - 'raise' will raise a ValueError if there are ambiguous
times.
nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \
@@ -986,7 +979,7 @@ def tz_localize(
closest existing time
- 'NaT' will return NaT where there are nonexistent times
- timedelta objects will shift nonexistent times by the timedelta
- - 'raise' will raise an NonExistentTimeError if there are
+ - 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -1340,6 +1333,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]:
values, "month_name", locale=locale, reso=self._creso
)
result = self._maybe_mask_results(result, fill_value=None)
+ if using_string_dtype():
+ from pandas import (
+ StringDtype,
+ array as pd_array,
+ )
+
+ return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value]
return result
def day_name(self, locale=None) -> npt.NDArray[np.object_]:
@@ -1401,6 +1401,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]:
values, "day_name", locale=locale, reso=self._creso
)
result = self._maybe_mask_results(result, fill_value=None)
+ if using_string_dtype():
+ # TODO: no tests that check for dtype of result as of 2024-08-15
+ from pandas import (
+ StringDtype,
+ array as pd_array,
+ )
+
+ return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value]
return result
@property
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 07eb91e0cb13b..03712f75db0c7 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -557,7 +557,3 @@ def _wrap_ndarray_result(self, result: np.ndarray):
return TimedeltaArray._simple_new(result, dtype=result.dtype)
return type(self)(result)
-
- # ------------------------------------------------------------------------
- # String methods interface
- _str_na_value = np.nan
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 8a4fd9fc1b34d..823084c3e9982 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -140,12 +140,16 @@ def __init__(
# infer defaults
if storage is None:
if na_value is not libmissing.NA:
- if HAS_PYARROW:
- storage = "pyarrow"
- else:
- storage = "python"
+ storage = get_option("mode.string_storage")
+ if storage == "auto":
+ if HAS_PYARROW:
+ storage = "pyarrow"
+ else:
+ storage = "python"
else:
storage = get_option("mode.string_storage")
+ if storage == "auto":
+ storage = "python"
if storage == "pyarrow_numpy":
# TODO raise a deprecation warning
@@ -346,6 +350,55 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
raise ValueError
return cls._from_sequence(scalars, dtype=dtype)
+ def _str_map(
+ self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
+ ):
+ if self.dtype.na_value is np.nan:
+ return self._str_map_nan_semantics(f, na_value=na_value, dtype=dtype)
+
+ from pandas.arrays import BooleanArray
+
+ if dtype is None:
+ dtype = self.dtype
+ if na_value is None:
+ na_value = self.dtype.na_value
+
+ mask = isna(self)
+ arr = np.asarray(self)
+
+ if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+ constructor: type[IntegerArray | BooleanArray]
+ if is_integer_dtype(dtype):
+ constructor = IntegerArray
+ else:
+ constructor = BooleanArray
+
+ na_value_is_na = isna(na_value)
+ if na_value_is_na:
+ na_value = 1
+ elif dtype == np.dtype("bool"):
+ # GH#55736
+ na_value = bool(na_value)
+ result = lib.map_infer_mask(
+ arr,
+ f,
+ mask.view("uint8"),
+ convert=False,
+ na_value=na_value,
+ # error: Argument 1 to "dtype" has incompatible type
+ # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
+ # "Type[object]"
+ dtype=np.dtype(cast(type, dtype)),
+ )
+
+ if not na_value_is_na:
+ mask[:] = False
+
+ return constructor(result, mask)
+
+ else:
+ return self._str_map_str_or_object(dtype, na_value, arr, f, mask)
+
def _str_map_str_or_object(
self,
dtype,
@@ -353,7 +406,6 @@ def _str_map_str_or_object(
arr: np.ndarray,
f,
mask: npt.NDArray[np.bool_],
- convert: bool,
):
# _str_map helper for case where dtype is either string dtype or object
if is_string_dtype(dtype) and not is_object_dtype(dtype):
@@ -377,6 +429,45 @@ def _str_map_str_or_object(
# -> We don't know the result type. E.g. `.get` can return anything.
return lib.map_infer_mask(arr, f, mask.view("uint8"))
+ def _str_map_nan_semantics(self, f, na_value=None, dtype: Dtype | None = None):
+ if dtype is None:
+ dtype = self.dtype
+ if na_value is None:
+ na_value = self.dtype.na_value
+
+ mask = isna(self)
+ arr = np.asarray(self)
+
+ if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+ na_value_is_na = isna(na_value)
+ if na_value_is_na:
+ if is_integer_dtype(dtype):
+ na_value = 0
+ else:
+ na_value = True
+
+ result = lib.map_infer_mask(
+ arr,
+ f,
+ mask.view("uint8"),
+ convert=False,
+ na_value=na_value,
+ dtype=np.dtype(cast(type, dtype)),
+ )
+ if na_value_is_na and mask.any():
+ # TODO: we could alternatively do this check before map_infer_mask
+ # and adjust the dtype/na_value we pass there. Which is more
+ # performant?
+ if is_integer_dtype(dtype):
+ result = result.astype("float64")
+ else:
+ result = result.astype("object")
+ result[mask] = np.nan
+ return result
+
+ else:
+ return self._str_map_str_or_object(dtype, na_value, arr, f, mask)
+
# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
@@ -655,6 +746,12 @@ def _reduce(
axis: AxisInt | None = 0,
**kwargs,
):
+ if self.dtype.na_value is np.nan and name in ["any", "all"]:
+ if name == "any":
+ return nanops.nanany(self._ndarray, skipna=skipna)
+ else:
+ return nanops.nanall(self._ndarray, skipna=skipna)
+
if name in ["min", "max"]:
result = getattr(self, name)(skipna=skipna, axis=axis)
if keepdims:
@@ -663,6 +760,12 @@ def _reduce(
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
+ def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
+ if self.dtype.na_value is np.nan and result is libmissing.NA:
+ # the masked_reductions use pd.NA -> convert to np.nan
+ return np.nan
+ return super()._wrap_reduction_result(axis, result)
+
def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
nv.validate_min((), kwargs)
result = masked_reductions.min(
@@ -680,8 +783,11 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
def value_counts(self, dropna: bool = True) -> Series:
from pandas.core.algorithms import value_counts_internal as value_counts
- result = value_counts(self._ndarray, sort=False, dropna=dropna).astype("Int64")
+ result = value_counts(self._ndarray, sort=False, dropna=dropna)
result.index = result.index.astype(self.dtype)
+
+ if self.dtype.na_value is libmissing.NA:
+ result = result.astype("Int64")
return result
def memory_usage(self, deep: bool = False) -> int:
@@ -732,104 +838,15 @@ def _cmp_method(self, other, op):
# logical
result = np.zeros(len(self._ndarray), dtype="bool")
result[valid] = op(self._ndarray[valid], other)
- return BooleanArray(result, mask)
-
- _arith_method = _cmp_method
-
- # ------------------------------------------------------------------------
- # String methods interface
- # error: Incompatible types in assignment (expression has type "NAType",
- # base class "NumpyExtensionArray" defined the type as "float")
- _str_na_value = libmissing.NA # type: ignore[assignment]
-
- def _str_map_nan_semantics(
- self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
- ):
- if dtype is None:
- dtype = self.dtype
- if na_value is None:
- na_value = self.dtype.na_value
-
- mask = isna(self)
- arr = np.asarray(self)
- convert = convert and not np.all(mask)
-
- if is_integer_dtype(dtype) or is_bool_dtype(dtype):
- na_value_is_na = isna(na_value)
- if na_value_is_na:
- if is_integer_dtype(dtype):
- na_value = 0
+ res_arr = BooleanArray(result, mask)
+ if self.dtype.na_value is np.nan:
+ if op == operator.ne:
+ return res_arr.to_numpy(np.bool_, na_value=True)
else:
- na_value = True
+ return res_arr.to_numpy(np.bool_, na_value=False)
+ return res_arr
- result = lib.map_infer_mask(
- arr,
- f,
- mask.view("uint8"),
- convert=False,
- na_value=na_value,
- dtype=np.dtype(cast(type, dtype)),
- )
- if na_value_is_na and mask.any():
- if is_integer_dtype(dtype):
- result = result.astype("float64")
- else:
- result = result.astype("object")
- result[mask] = np.nan
- return result
-
- else:
- return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
-
- def _str_map(
- self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
- ):
- if self.dtype.na_value is np.nan:
- return self._str_map_nan_semantics(
- f, na_value=na_value, dtype=dtype, convert=convert
- )
-
- from pandas.arrays import BooleanArray
-
- if dtype is None:
- dtype = StringDtype(storage="python")
- if na_value is None:
- na_value = self.dtype.na_value
-
- mask = isna(self)
- arr = np.asarray(self)
-
- if is_integer_dtype(dtype) or is_bool_dtype(dtype):
- constructor: type[IntegerArray | BooleanArray]
- if is_integer_dtype(dtype):
- constructor = IntegerArray
- else:
- constructor = BooleanArray
-
- na_value_is_na = isna(na_value)
- if na_value_is_na:
- na_value = 1
- elif dtype == np.dtype("bool"):
- na_value = bool(na_value)
- result = lib.map_infer_mask(
- arr,
- f,
- mask.view("uint8"),
- convert=False,
- na_value=na_value,
- # error: Argument 1 to "dtype" has incompatible type
- # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
- # "Type[object]"
- dtype=np.dtype(cast(type, dtype)),
- )
-
- if not na_value_is_na:
- mask[:] = False
-
- return constructor(result, mask)
-
- else:
- return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
+ _arith_method = _cmp_method
class StringArrayNumpySemantics(StringArray):
@@ -861,38 +878,3 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics:
# need to override NumpyExtensionArray._from_backing_data to ensure
# we always preserve the dtype
return NDArrayBacked._from_backing_data(self, arr)
-
- def _reduce(
- self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
- ):
- if name in ["any", "all"]:
- if name == "any":
- return nanops.nanany(self._ndarray, skipna=skipna)
- else:
- return nanops.nanall(self._ndarray, skipna=skipna)
- else:
- return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
-
- def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
- # the masked_reductions use pd.NA
- if result is libmissing.NA:
- return np.nan
- return super()._wrap_reduction_result(axis, result)
-
- def _cmp_method(self, other, op):
- result = super()._cmp_method(other, op)
- if op == operator.ne:
- return result.to_numpy(np.bool_, na_value=True)
- else:
- return result.to_numpy(np.bool_, na_value=False)
-
- def value_counts(self, dropna: bool = True) -> Series:
- from pandas.core.algorithms import value_counts_internal as value_counts
-
- result = value_counts(self._ndarray, sort=False, dropna=dropna)
- result.index = result.index.astype(self.dtype)
- return result
-
- # ------------------------------------------------------------------------
- # String methods interface
- _str_na_value = np.nan
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 4893883d3ad12..67114815341b6 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,12 +1,10 @@
from __future__ import annotations
-from functools import partial
import operator
import re
from typing import (
TYPE_CHECKING,
Union,
- cast,
)
import numpy as np
@@ -23,8 +21,6 @@
)
from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_integer_dtype,
is_scalar,
pandas_dtype,
)
@@ -39,7 +35,6 @@
BaseStringArray,
StringDtype,
)
-from pandas.core.ops import invalid_comparison
from pandas.core.strings.object_array import ObjectStringArrayMixin
if not pa_version_under10p1:
@@ -133,18 +128,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
def __init__(self, values) -> None:
_chk_pyarrow_available()
- if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
- values.type
+ if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
+ pa.types.is_string(values.type)
+ or (
+ pa.types.is_dictionary(values.type)
+ and (
+ pa.types.is_string(values.type.value_type)
+ or pa.types.is_large_string(values.type.value_type)
+ )
+ )
):
values = pc.cast(values, pa.large_string())
super().__init__(values)
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
- if not pa.types.is_large_string(self._pa_array.type) and not (
- pa.types.is_dictionary(self._pa_array.type)
- and pa.types.is_large_string(self._pa_array.type.value_type)
- ):
+ if not pa.types.is_large_string(self._pa_array.type):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of "
"large_string type"
@@ -216,12 +215,17 @@ def dtype(self) -> StringDtype: # type: ignore[override]
return self._dtype
def insert(self, loc: int, item) -> ArrowStringArray:
+ if self.dtype.na_value is np.nan and item is np.nan:
+ item = libmissing.NA
if not isinstance(item, str) and item is not libmissing.NA:
raise TypeError("Scalar must be NA or str")
return super().insert(loc, item)
- @classmethod
- def _result_converter(cls, values, na=None):
+ def _result_converter(self, values, na=None):
+ if self.dtype.na_value is np.nan:
+ if not isna(na):
+ values = values.fill_null(bool(na))
+ return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
return BooleanDtype().__from_arrow__(values)
def _maybe_convert_setitem_value(self, value):
@@ -275,102 +279,7 @@ def astype(self, dtype, copy: bool = True):
# ------------------------------------------------------------------------
# String methods interface
- # error: Incompatible types in assignment (expression has type "NAType",
- # base class "ObjectStringArrayMixin" defined the type as "float")
- _str_na_value = libmissing.NA # type: ignore[assignment]
-
- def _str_map_nan_semantics(
- self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
- ):
- if dtype is None:
- dtype = self.dtype
- if na_value is None:
- na_value = self.dtype.na_value
-
- mask = isna(self)
- arr = np.asarray(self)
-
- if is_integer_dtype(dtype) or is_bool_dtype(dtype):
- if is_integer_dtype(dtype):
- na_value = np.nan
- else:
- na_value = False
-
- dtype = np.dtype(cast(type, dtype))
- if mask.any():
- # numpy int/bool dtypes cannot hold NaNs so we must convert to
- # float64 for int (to match maybe_convert_objects) or
- # object for bool (again to match maybe_convert_objects)
- if is_integer_dtype(dtype):
- dtype = np.dtype("float64")
- else:
- dtype = np.dtype(object)
- result = lib.map_infer_mask(
- arr,
- f,
- mask.view("uint8"),
- convert=False,
- na_value=na_value,
- dtype=dtype,
- )
- return result
-
- else:
- return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
-
- def _str_map(
- self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
- ):
- if self.dtype.na_value is np.nan:
- return self._str_map_nan_semantics(
- f, na_value=na_value, dtype=dtype, convert=convert
- )
-
- # TODO: de-duplicate with StringArray method. This method is moreless copy and
- # paste.
-
- from pandas.arrays import (
- BooleanArray,
- IntegerArray,
- )
-
- if dtype is None:
- dtype = self.dtype
- if na_value is None:
- na_value = self.dtype.na_value
-
- mask = isna(self)
- arr = np.asarray(self)
-
- if is_integer_dtype(dtype) or is_bool_dtype(dtype):
- constructor: type[IntegerArray | BooleanArray]
- if is_integer_dtype(dtype):
- constructor = IntegerArray
- else:
- constructor = BooleanArray
-
- na_value_is_na = isna(na_value)
- if na_value_is_na:
- na_value = 1
- result = lib.map_infer_mask(
- arr,
- f,
- mask.view("uint8"),
- convert=False,
- na_value=na_value,
- # error: Argument 1 to "dtype" has incompatible type
- # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
- # "Type[object]"
- dtype=np.dtype(cast(type, dtype)),
- )
-
- if not na_value_is_na:
- mask[:] = False
-
- return constructor(result, mask)
-
- else:
- return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
+ _str_map = BaseStringArray._str_map
def _str_contains(
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
@@ -587,11 +496,30 @@ def _str_get_dummies(self, sep: str = "|"):
return dummies.astype(np.int64, copy=False), labels
def _convert_int_dtype(self, result):
+ if self.dtype.na_value is np.nan:
+ if isinstance(result, pa.Array):
+ result = result.to_numpy(zero_copy_only=False)
+ else:
+ result = result.to_numpy()
+ if result.dtype == np.int32:
+ result = result.astype(np.int64)
+ return result
+
return Int64Dtype().__from_arrow__(result)
def _reduce(
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
+ if self.dtype.na_value is np.nan and name in ["any", "all"]:
+ if not skipna:
+ nas = pc.is_null(self._pa_array)
+ arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
+ else:
+ arr = pc.not_equal(self._pa_array, "")
+ return ArrowExtensionArray(arr)._reduce(
+ name, skipna=skipna, keepdims=keepdims, **kwargs
+ )
+
result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
if name in ("argmin", "argmax") and isinstance(result, pa.Array):
return self._convert_int_dtype(result)
@@ -622,70 +550,31 @@ def _rank(
)
)
-
-class ArrowStringArrayNumpySemantics(ArrowStringArray):
- _storage = "pyarrow"
- _na_value = np.nan
-
- @classmethod
- def _result_converter(cls, values, na=None):
- if not isna(na):
- values = values.fill_null(bool(na))
- return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
-
- def __getattribute__(self, item):
- # ArrowStringArray and we both inherit from ArrowExtensionArray, which
- # creates inheritance problems (Diamond inheritance)
- if item in ArrowStringArrayMixin.__dict__ and item not in (
- "_pa_array",
- "__dict__",
- ):
- return partial(getattr(ArrowStringArrayMixin, item), self)
- return super().__getattribute__(item)
-
- def _convert_int_dtype(self, result):
- if isinstance(result, pa.Array):
- result = result.to_numpy(zero_copy_only=False)
- else:
- result = result.to_numpy()
- if result.dtype == np.int32:
- result = result.astype(np.int64)
+ def value_counts(self, dropna: bool = True) -> Series:
+ result = super().value_counts(dropna=dropna)
+ if self.dtype.na_value is np.nan:
+ res_values = result._values.to_numpy()
+ return result._constructor(
+ res_values, index=result.index, name=result.name, copy=False
+ )
return result
def _cmp_method(self, other, op):
- try:
- result = super()._cmp_method(other, op)
- except pa.ArrowNotImplementedError:
- return invalid_comparison(self, other, op)
- if op == operator.ne:
- return result.to_numpy(np.bool_, na_value=True)
- else:
- return result.to_numpy(np.bool_, na_value=False)
-
- def value_counts(self, dropna: bool = True) -> Series:
- from pandas import Series
-
- result = super().value_counts(dropna)
- return Series(
- result._values.to_numpy(), index=result.index, name=result.name, copy=False
- )
-
- def _reduce(
- self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
- ):
- if name in ["any", "all"]:
- if not skipna:
- nas = pc.is_null(self._pa_array)
- arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
+ result = super()._cmp_method(other, op)
+ if self.dtype.na_value is np.nan:
+ if op == operator.ne:
+ return result.to_numpy(np.bool_, na_value=True)
else:
- arr = pc.not_equal(self._pa_array, "")
- return ArrowExtensionArray(arr)._reduce(
- name, skipna=skipna, keepdims=keepdims, **kwargs
- )
- else:
- return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+ return result.to_numpy(np.bool_, na_value=False)
+ return result
- def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics:
- if item is np.nan:
- item = libmissing.NA
- return super().insert(loc, item) # type: ignore[return-value]
+
+class ArrowStringArrayNumpySemantics(ArrowStringArray):
+ _na_value = np.nan
+ _str_get = ArrowStringArrayMixin._str_get
+ _str_removesuffix = ArrowStringArrayMixin._str_removesuffix
+ _str_capitalize = ArrowStringArrayMixin._str_capitalize
+ _str_pad = ArrowStringArrayMixin._str_pad
+ _str_title = ArrowStringArrayMixin._str_title
+ _str_swapcase = ArrowStringArrayMixin._str_swapcase
+ _str_slice_replace = ArrowStringArrayMixin._str_slice_replace
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 83cc2871f5459..b2cfbe7338c0d 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -152,9 +152,8 @@ def _scalar_type(self) -> type[Timedelta]:
# define my properties & methods for delegation
_other_ops: list[str] = []
_bool_ops: list[str] = []
- _object_ops: list[str] = ["freq"]
_field_ops: list[str] = ["days", "seconds", "microseconds", "nanoseconds"]
- _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + ["unit"]
+ _datetimelike_ops: list[str] = _field_ops + _bool_ops + ["unit", "freq"]
_datetimelike_methods: list[str] = [
"to_pytimedelta",
"total_seconds",
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index e62cda0dfe8d0..e4eefb570fd95 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -452,13 +452,12 @@ def is_terminal() -> bool:
string_storage_doc = """
: string
- The default storage for StringDtype. This option is ignored if
- ``future.infer_string`` is set to True.
+ The default storage for StringDtype.
"""
def is_valid_string_storage(value: Any) -> None:
- legal_values = ["python", "pyarrow"]
+ legal_values = ["auto", "python", "pyarrow"]
if value not in legal_values:
msg = "Value must be one of python|pyarrow"
if value == "pyarrow_numpy":
@@ -473,7 +472,7 @@ def is_valid_string_storage(value: Any) -> None:
with cf.config_prefix("mode"):
cf.register_option(
"string_storage",
- "python",
+ "auto",
string_storage_doc,
# validator=is_one_of_factory(["python", "pyarrow"]),
validator=is_valid_string_storage,
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 162f6a4d30f3f..3394bf091e228 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1014,10 +1014,8 @@ def convert_dtypes(
Back-end data type applied to the resultant :class:`DataFrame`
(still experimental). Behaviour is as follows:
- * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
+ * ``"numpy_nullable"``: returns nullable-dtype
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
.. versionadded:: 2.0
@@ -1025,6 +1023,8 @@ def convert_dtypes(
-------
np.dtype, or ExtensionDtype
"""
+ from pandas.core.arrays.string_ import StringDtype
+
inferred_dtype: str | DtypeObj
if (
@@ -1103,6 +1103,13 @@ def convert_dtypes(
# If we couldn't do anything else, then we retain the dtype
inferred_dtype = input_array.dtype
+ elif (
+ convert_string
+ and isinstance(input_array.dtype, StringDtype)
+ and input_array.dtype.na_value is np.nan
+ ):
+ inferred_dtype = pandas_dtype_func("string")
+
else:
inferred_dtype = input_array.dtype
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 64b5278424192..bcf1ade9b0320 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1274,6 +1274,10 @@ def is_bool_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is of a boolean dtype.
+ This function verifies whether a given object is a boolean data type. The input
+ can be an array or a dtype object. Accepted array types include instances
+ of ``np.array``, ``pd.Series``, ``pd.Index``, and similar array-like structures.
+
Parameters
----------
arr_or_dtype : array-like or dtype
@@ -1284,6 +1288,10 @@ def is_bool_dtype(arr_or_dtype) -> bool:
boolean
Whether or not the array or dtype is of a boolean dtype.
+ See Also
+ --------
+ api.types.is_bool : Check if an object is a boolean.
+
Notes
-----
An ExtensionArray is considered boolean when the ``_is_boolean``
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 3aeab96e03163..c0587d36bcb5a 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -18,9 +18,9 @@
cast,
)
import warnings
+import zoneinfo
import numpy as np
-import pytz
from pandas._config.config import get_option
@@ -789,7 +789,7 @@ def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None:
tz = timezones.maybe_get_tz(tz)
tz = timezones.tz_standardize(tz)
elif tz is not None:
- raise pytz.UnknownTimeZoneError(tz)
+ raise zoneinfo.ZoneInfoNotFoundError(tz)
if tz is None:
raise TypeError("A 'tz' is required.")
@@ -882,7 +882,7 @@ def construct_from_string(cls, string: str_type) -> DatetimeTZDtype:
return cls(unit=d["unit"], tz=d["tz"])
except (KeyError, TypeError, ValueError) as err:
# KeyError if maybe_get_tz tries and fails to get a
- # pytz timezone (actually pytz.UnknownTimeZoneError).
+ # zoneinfo timezone (actually zoneinfo.ZoneInfoNotFoundError).
# TypeError if we pass a nonsense tz;
# ValueError if we pass a unit other than "ns"
raise TypeError(msg) from err
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b8039746d9952..1e6608b0d87f3 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6406,7 +6406,7 @@ def dropna(
thresh : int, optional
Require that many non-NA values. Cannot be combined with how.
- subset : column label or sequence of labels, optional
+ subset : column label or iterable of labels, optional
Labels along other axis to consider, e.g. if you are dropping rows
these would be a list of columns to include.
inplace : bool, default False
@@ -6536,7 +6536,7 @@ def dropna(
@overload
def drop_duplicates(
self,
- subset: Hashable | Sequence[Hashable] | None = ...,
+ subset: Hashable | Iterable[Hashable] | None = ...,
*,
keep: DropKeep = ...,
inplace: Literal[True],
@@ -6546,7 +6546,7 @@ def drop_duplicates(
@overload
def drop_duplicates(
self,
- subset: Hashable | Sequence[Hashable] | None = ...,
+ subset: Hashable | Iterable[Hashable] | None = ...,
*,
keep: DropKeep = ...,
inplace: Literal[False] = ...,
@@ -6556,7 +6556,7 @@ def drop_duplicates(
@overload
def drop_duplicates(
self,
- subset: Hashable | Sequence[Hashable] | None = ...,
+ subset: Hashable | Iterable[Hashable] | None = ...,
*,
keep: DropKeep = ...,
inplace: bool = ...,
@@ -6565,7 +6565,7 @@ def drop_duplicates(
def drop_duplicates(
self,
- subset: Hashable | Sequence[Hashable] | None = None,
+ subset: Hashable | Iterable[Hashable] | None = None,
*,
keep: DropKeep = "first",
inplace: bool = False,
@@ -6579,7 +6579,7 @@ def drop_duplicates(
Parameters
----------
- subset : column label or sequence of labels, optional
+ subset : column label or iterable of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns.
keep : {'first', 'last', ``False``}, default 'first'
@@ -6669,7 +6669,7 @@ def drop_duplicates(
def duplicated(
self,
- subset: Hashable | Sequence[Hashable] | None = None,
+ subset: Hashable | Iterable[Hashable] | None = None,
keep: DropKeep = "first",
) -> Series:
"""
@@ -6679,7 +6679,7 @@ def duplicated(
Parameters
----------
- subset : column label or sequence of labels, optional
+ subset : column label or iterable of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns.
keep : {'first', 'last', False}, default 'first'
@@ -6771,10 +6771,7 @@ def f(vals) -> tuple[np.ndarray, int]:
return labels.astype("i8"), len(shape)
if subset is None:
- # https://github.com/pandas-dev/pandas/issues/28770
- # Incompatible types in assignment (expression has type "Index", variable
- # has type "Sequence[Any]")
- subset = self.columns # type: ignore[assignment]
+ subset = self.columns
elif (
not np.iterable(subset)
or isinstance(subset, str)
@@ -6795,7 +6792,7 @@ def f(vals) -> tuple[np.ndarray, int]:
if len(subset) == 1 and self.columns.is_unique:
# GH#45236 This is faster than get_group_index below
- result = self[subset[0]].duplicated(keep)
+ result = self[next(iter(subset))].duplicated(keep)
result.name = None
else:
vals = (col.values for name, col in self.items() if name in subset)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 8a6fc69d47cc3..0f0078fc3398b 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6670,10 +6670,10 @@ def convert_dtypes(
Back-end data type applied to the resultant :class:`DataFrame` or
:class:`Series` (still experimental). Behaviour is as follows:
- * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- or :class:`Series` (default).
+ * ``"numpy_nullable"``: returns nullable-dtype-backed
+ :class:`DataFrame` or :class:`Series`.
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame or Series.
+ :class:`DataFrame` or :class:`Series`.
.. versionadded:: 2.0
@@ -10570,7 +10570,7 @@ def tz_localize(
a non-DST time (note that this flag is only applicable for
ambiguous times)
- 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+ - 'raise' will raise a ValueError if there are ambiguous
times.
nonexistent : str, default 'raise'
A nonexistent time does not exist in a particular timezone
@@ -10582,7 +10582,7 @@ def tz_localize(
closest existing time
- 'NaT' will return NaT where there are nonexistent times
- timedelta objects will shift nonexistent times by the timedelta
- - 'raise' will raise an NonExistentTimeError if there are
+ - 'raise' will raise a ValueError if there are
nonexistent times.
Returns
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 00a929724ed4c..3b3cda8f7cd33 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -6,7 +6,6 @@
import warnings
import numpy as np
-import pytz
from pandas._libs import (
NaT,
@@ -162,7 +161,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin):
non-DST time (note that this flag is only applicable for ambiguous
times)
- 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous times.
+ - 'raise' will raise a ValueError if there are ambiguous times.
dayfirst : bool, default False
If True, parse dates in `data` with the day first order.
yearfirst : bool, default False
@@ -264,7 +263,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]:
@doc(DatetimeArray.strftime)
def strftime(self, date_format) -> Index:
arr = self._data.strftime(date_format)
- return Index(arr, name=self.name, dtype=object)
+ return Index(arr, name=self.name, dtype=arr.dtype)
@doc(DatetimeArray.tz_convert)
def tz_convert(self, tz) -> Self:
@@ -591,7 +590,7 @@ def get_loc(self, key):
elif isinstance(key, str):
try:
parsed, reso = self._parse_with_reso(key)
- except (ValueError, pytz.NonExistentTimeError) as err:
+ except ValueError as err:
raise KeyError(key) from err
self._disallow_mismatched_indexing(parsed)
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
index 48d5e59250f35..2eeacfb769be4 100644
--- a/pandas/core/indexes/extension.py
+++ b/pandas/core/indexes/extension.py
@@ -74,7 +74,7 @@ def fget(self):
return type(self)._simple_new(result, name=self.name)
elif isinstance(result, ABCDataFrame):
return result.set_index(self)
- return Index(result, name=self.name)
+ return Index(result, name=self.name, dtype=result.dtype)
return result
def fset(self, value) -> None:
@@ -101,7 +101,7 @@ def method(self, *args, **kwargs): # type: ignore[misc]
return type(self)._simple_new(result, name=self.name)
elif isinstance(result, ABCDataFrame):
return result.set_index(self)
- return Index(result, name=self.name)
+ return Index(result, name=self.name, dtype=result.dtype)
return result
# error: "property" has no attribute "__name__"
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 0900121ab717f..c3d4ad721c830 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1636,6 +1636,17 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
doc="""
Names of levels in MultiIndex.
+ This attribute provides access to the names of the levels in a `MultiIndex`.
+ The names are stored as a `FrozenList`, which is an immutable list-like
+ container. Each name corresponds to a level in the `MultiIndex`, and can be
+ used to identify or manipulate the levels individually.
+
+ See Also
+ --------
+ MultiIndex.set_names : Set Index or MultiIndex name.
+ MultiIndex.rename : Rename specific levels in a MultiIndex.
+ Index.names : Get names on index.
+
Examples
--------
>>> mi = pd.MultiIndex.from_arrays(
@@ -2681,8 +2692,15 @@ def sortlevel(
"""
Sort MultiIndex at the requested level.
- The result will respect the original ordering of the associated
- factor at that level.
+ This method is useful when dealing with MultiIndex objects, allowing for
+ sorting at a specific level of the index. The function preserves the
+ relative ordering of data within the same level while sorting
+ the overall MultiIndex. The method provides flexibility with the `ascending`
+ parameter to define the sort order and with the `sort_remaining` parameter to
+ control whether the remaining levels should also be sorted. Sorting a
+ MultiIndex can be crucial when performing operations that require ordered
+ indices, such as grouping or merging datasets. The `na_position` argument is
+ important in handling missing values consistently across different levels.
Parameters
----------
@@ -2692,7 +2710,9 @@ def sortlevel(
ascending : bool, default True
False to sort in descending order.
Can also be a list to specify a directed ordering.
- sort_remaining : sort by the remaining levels after level
+ sort_remaining : bool, default True
+ If True, sorts by the remaining levels after sorting by the specified
+ `level`.
na_position : {'first' or 'last'}, default 'first'
Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
the end.
@@ -2706,6 +2726,13 @@ def sortlevel(
indexer : np.ndarray[np.intp]
Indices of output values in original index.
+ See Also
+ --------
+ MultiIndex : A multi-level, or hierarchical, index object for pandas objects.
+ Index.sort_values : Sort Index values.
+ DataFrame.sort_index : Sort DataFrame by the index.
+ Series.sort_index : Sort Series by the index.
+
Examples
--------
>>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]])
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 149bef6258bfa..dfb96162f0ac1 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -512,7 +512,11 @@ def convert(self) -> list[Block]:
convert_non_numeric=True,
)
refs = None
- if res_values is values:
+ if (
+ res_values is values
+ or isinstance(res_values, NumpyExtensionArray)
+ and res_values._ndarray is values
+ ):
refs = self.refs
res_values = ensure_block_shape(res_values, self.ndim)
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 6836ba3f65691..c005a1ce26e4b 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -379,6 +379,11 @@ def concat(
0 1 2
1 3 4
"""
+ if ignore_index and keys is not None:
+ raise ValueError(
+ f"Cannot set {ignore_index=} and specify keys. Either should be used."
+ )
+
if copy is not lib.no_default:
warnings.warn(
"The copy keyword is deprecated and will be removed in a future "
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index 18517199f073c..b3f946f289891 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -358,7 +358,16 @@ def qcut(
x_idx = _preprocess_for_cut(x)
x_idx, _ = _coerce_to_type(x_idx)
- quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
+ if is_integer(q):
+ quantiles = np.linspace(0, 1, q + 1)
+ # Round up rather than to nearest if not representable in base 2
+ np.putmask(
+ quantiles,
+ q * quantiles != np.arange(q + 1),
+ np.nextafter(quantiles, 1),
+ )
+ else:
+ quantiles = q
bins = x_idx.to_series().dropna().quantile(quantiles)
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 8e6183c43480f..1014c9559afaf 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -969,6 +969,8 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False):
Returns
-------
DataFrame/MultiIndex or Series/Index of objects
+ Returns appropriate type based on `expand` parameter with strings
+ split based on the `sep` parameter.
See Also
--------
@@ -1749,6 +1751,18 @@ def pad(
Returns
-------
Series/Index of objects.
+ A Series or Index where the strings are modified by :meth:`str.%(method)s`.
+
+ See Also
+ --------
+ Series.str.rjust : Fills the left side of strings with an arbitrary
+ character.
+ Series.str.ljust : Fills the right side of strings with an arbitrary
+ character.
+ Series.str.center : Fills both sides of strings with an arbitrary
+ character.
+ Series.str.zfill : Pad strings in the Series/Index by prepending '0'
+ character.
Examples
--------
@@ -2024,11 +2038,19 @@ def decode(self, encoding, errors: str = "strict"):
Parameters
----------
encoding : str
+ Specifies the encoding to be used.
errors : str, optional
+ Specifies the error handling scheme.
+ Possible values are those supported by :meth:`bytes.decode`.
Returns
-------
Series or Index
+ A Series or Index with decoded strings.
+
+ See Also
+ --------
+ Series.str.encode : Encodes strings into bytes in a Series/Index.
Examples
--------
@@ -2063,11 +2085,19 @@ def encode(self, encoding, errors: str = "strict"):
Parameters
----------
encoding : str
+ Specifies the encoding to be used.
errors : str, optional
+ Specifies the error handling scheme.
+ Possible values are those supported by :meth:`str.encode`.
Returns
-------
Series/Index of objects
+ A Series or Index with strings encoded into bytes.
+
+ See Also
+ --------
+ Series.str.decode : Decodes bytes into strings in a Series/Index.
Examples
--------
@@ -2099,6 +2129,7 @@ def encode(self, encoding, errors: str = "strict"):
Returns
-------
Series or Index of object
+ Series or Index with the strings being stripped from the %(side)s.
See Also
--------
@@ -3092,6 +3123,8 @@ def normalize(self, form):
Returns
-------
Series or Index of object
+ Returns a Series or an Index of the %(side)s indexes
+ in each string of the input.
See Also
--------
@@ -3207,7 +3240,8 @@ def len(self):
Returns
-------
- Series or Index of object
+ Series or Index of objects
+ A Series or Index where the strings are modified by :meth:`str.%(method)s`.
See Also
--------
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 290a28ab60ae1..100afa956bd24 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -37,8 +37,6 @@ class ObjectStringArrayMixin(BaseStringArrayMethods):
String Methods operating on object-dtype ndarrays.
"""
- _str_na_value = np.nan
-
def __len__(self) -> int:
# For typing, _str_map relies on the object being sized.
raise NotImplementedError
@@ -56,7 +54,7 @@ def _str_map(
na_value : Scalar, optional
The value to set for NA values. Might also be used for the
fill value if the callable `f` raises an exception.
- This defaults to ``self._str_na_value`` which is ``np.nan``
+ This defaults to ``self.dtype.na_value`` which is ``np.nan``
for object-dtype and Categorical and ``pd.NA`` for StringArray.
dtype : Dtype, optional
The dtype of the result array.
@@ -66,7 +64,7 @@ def _str_map(
if dtype is None:
dtype = np.dtype("object")
if na_value is None:
- na_value = self._str_na_value
+ na_value = self.dtype.na_value # type: ignore[attr-defined]
if not len(self):
return np.array([], dtype=dtype)
@@ -272,7 +270,7 @@ def f(x):
return x.get(i)
elif len(x) > i >= -len(x):
return x[i]
- return self._str_na_value
+ return self.dtype.na_value # type: ignore[attr-defined]
return self._str_map(f)
@@ -466,7 +464,7 @@ def _str_removesuffix(self, suffix: str):
def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
regex = re.compile(pat, flags=flags)
- na_value = self._str_na_value
+ na_value = self.dtype.na_value # type: ignore[attr-defined]
if not expand:
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
index 26e73794af298..982851d0557c3 100644
--- a/pandas/core/tools/numeric.py
+++ b/pandas/core/tools/numeric.py
@@ -99,8 +99,8 @@ def to_numeric(
is to not use nullable data types. If specified, the behavior
is as follows:
- * ``"numpy_nullable"``: returns with nullable-dtype-backed
- * ``"pyarrow"``: returns with pyarrow-backed nullable :class:`ArrowDtype`
+ * ``"numpy_nullable"``: returns nullable-dtype-backed object
+ * ``"pyarrow"``: returns with pyarrow-backed nullable object
.. versionadded:: 2.0
diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py
index 5a0a8c321e629..2ed241f0b9bca 100644
--- a/pandas/io/clipboards.py
+++ b/pandas/io/clipboards.py
@@ -38,14 +38,15 @@ def read_clipboard(
A string or regex delimiter. The default of ``'\\s+'`` denotes
one or more whitespace characters.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index f83f9cb1c8d74..ef52107c283e9 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -267,14 +267,15 @@
Rows at the end to skip (0-indexed).
{storage_options}
-dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
@@ -1728,14 +1729,15 @@ def parse(
comment string and the end of the current line is ignored.
skipfooter : int, default 0
Rows at the end to skip (0-indexed).
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
**kwds : dict, optional
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
index 3df3e77a851a3..aaae9857b4fae 100644
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -92,14 +92,15 @@ def read_feather(
Whether to parallelize reading using multiple threads.
{storage_options}
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
- * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 4b8bc48130fab..c9897f628fdc9 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -1131,14 +1131,15 @@ def read_html(
.. versionadded:: 1.5.0
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index b29ead1d14b1d..d077b9e0c4568 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -652,14 +652,15 @@ def read_json(
{storage_options}
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py
index d966e38fa11a5..9d250ee5c08ce 100644
--- a/pandas/io/json/_table_schema.py
+++ b/pandas/io/json/_table_schema.py
@@ -144,11 +144,11 @@ def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
field["freq"] = dtype.freq.freqstr
elif isinstance(dtype, DatetimeTZDtype):
if timezones.is_utc(dtype.tz):
- # timezone.utc has no "zone" attr
field["tz"] = "UTC"
else:
- # error: "tzinfo" has no attribute "zone"
- field["tz"] = dtype.tz.zone # type: ignore[attr-defined]
+ zone = timezones.get_timezone(dtype.tz)
+ if isinstance(zone, str):
+ field["tz"] = zone
elif isinstance(dtype, ExtensionDtype):
field["extDtype"] = dtype.name
return field
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index b297164d5d108..f179dafc919e5 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -61,14 +61,15 @@ def read_orc(
Output always follows the ordering of the file and not the columns list.
This mirrors the original behaviour of
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 77a9cc3fca644..24415299e799b 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -542,14 +542,15 @@ def read_parquet(
.. versionadded:: 1.3.0
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 0cca1ebdb8c8f..6e933f94cf0ba 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -268,6 +268,18 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
Number of lines at bottom of file to skip (Unsupported with ``engine='c'``).
nrows : int, optional
Number of rows of file to read. Useful for reading pieces of large files.
+ Refers to the number of data rows in the returned DataFrame, excluding:
+
+ * The header row containing column names.
+ * Rows before the header row, if ``header=1`` or larger.
+
+ Example usage:
+
+ * To read the first 999,999 (non-header) rows:
+ ``read_csv(..., nrows=999999)``
+
+ * To read rows 1,000,000 through 1,999,999:
+ ``read_csv(..., skiprows=1000000, nrows=999999)``
na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional
Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific
per-column ``NA`` values. By default the following values are interpreted as
@@ -438,14 +450,14 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
{storage_options}
-dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
index 313ffa79cbd09..e597463aee453 100644
--- a/pandas/io/spss.py
+++ b/pandas/io/spss.py
@@ -36,14 +36,15 @@ def read_spss(
Return a subset of the columns. If None, return all columns.
convert_categoricals : bool, default is True
Convert categorical columns into pd.Categorical.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed
+ nullable :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 4fd7de7a28855..99dd06568fa01 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -306,14 +306,15 @@ def read_sql_table(
chunksize : int, default None
If specified, returns an iterator where `chunksize` is the number of
rows to include in each chunk.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
@@ -443,14 +444,15 @@ def read_sql_query(
{'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
.. versionadded:: 1.3.0
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
@@ -586,14 +588,15 @@ def read_sql(
chunksize : int, default None
If specified, return an iterator where `chunksize` is the
number of rows to include in each chunk.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
dtype : Type name or dict of columns
@@ -1683,14 +1686,15 @@ def read_table(
chunksize : int, default None
If specified, return an iterator where `chunksize` is the number
of rows to include in each chunk.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
@@ -2148,14 +2152,15 @@ def read_table(
schema of the SQL database object.
chunksize : int, default None
Raises NotImplementedError
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 8c7381a926e72..0fcf27af42fde 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -959,14 +959,15 @@ def read_xml(
{storage_options}
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index fb7d785a94bc4..9a7e563332a42 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -546,7 +546,7 @@ def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes:
new_ax.set_yscale("log")
elif self.logy == "sym" or self.loglog == "sym":
new_ax.set_yscale("symlog")
- return new_ax # type: ignore[return-value]
+ return new_ax
@final
@cache_readonly
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index b0475b64a844e..3be3562d23cd6 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -6,6 +6,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
@@ -1245,6 +1247,9 @@ def test_agg_multiple_mixed():
tm.assert_frame_equal(result, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
def test_agg_multiple_mixed_raises():
# GH 20909
mdf = DataFrame(
diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
index 3137d3ff50954..ba970e328ae40 100644
--- a/pandas/tests/apply/test_invalid_arg.py
+++ b/pandas/tests/apply/test_invalid_arg.py
@@ -12,6 +12,9 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
from pandas.errors import SpecificationError
from pandas import (
@@ -209,6 +212,10 @@ def transform(row):
data.apply(transform, axis=1)
+# we should raise a proper TypeError instead of propagating the pyarrow error
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
@pytest.mark.parametrize(
"df, func, expected",
tm.get_cython_table_params(
@@ -229,6 +236,10 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_str
df.agg(func, axis=axis)
+# we should raise a proper TypeError instead of propagating the pyarrow error
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
@pytest.mark.parametrize(
"series, func, expected",
chain(
diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
index 6ac0b49f0e4e7..6bbe5100e8826 100644
--- a/pandas/tests/apply/test_numba.py
+++ b/pandas/tests/apply/test_numba.py
@@ -104,6 +104,7 @@ def test_numba_nonunique_unsupported(apply_axis):
def test_numba_unsupported_dtypes(apply_axis):
+ pytest.importorskip("pyarrow")
f = lambda x: x
df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]})
df["c"] = df["c"].astype("double[pyarrow]")
diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
index 4b5156d0007bb..899ea1910d055 100644
--- a/pandas/tests/arithmetic/test_object.py
+++ b/pandas/tests/arithmetic/test_object.py
@@ -8,6 +8,9 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td
import pandas as pd
@@ -315,6 +318,9 @@ def test_add(self):
expected = pd.Index(["1a", "1b", "1c"])
tm.assert_index_equal("1" + index, expected)
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_sub_fail(self, using_infer_string):
index = pd.Index([str(i) for i in range(10)])
diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py
index 0c4fcf149eb20..4dbd8eb9f5ca7 100644
--- a/pandas/tests/arrays/boolean/test_arithmetic.py
+++ b/pandas/tests/arrays/boolean/test_arithmetic.py
@@ -3,6 +3,10 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
import pandas as pd
import pandas._testing as tm
@@ -90,6 +94,9 @@ def test_op_int8(left_array, right_array, opname):
# -----------------------------------------------------------------------------
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
# invalid ops
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index dca33dffa3996..52fd80cd196e0 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -6,7 +6,10 @@
from pandas._config import using_string_dtype
-from pandas.compat import PYPY
+from pandas.compat import (
+ HAS_PYARROW,
+ PYPY,
+)
from pandas import (
Categorical,
@@ -296,7 +299,9 @@ def test_nbytes(self):
exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
assert cat.nbytes == exp
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ @pytest.mark.xfail(
+ using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_memory_usage(self):
cat = Categorical([1, 2, 3])
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
index 6752a503016f8..d7eb6800e5d07 100644
--- a/pandas/tests/arrays/categorical/test_constructors.py
+++ b/pandas/tests/arrays/categorical/test_constructors.py
@@ -8,6 +8,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
@@ -442,7 +444,9 @@ def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")
- @pytest.mark.xfail(using_string_dtype(), reason="Can't be NumPy strings")
+ @pytest.mark.xfail(
+ using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings"
+ )
def test_constructor_np_strs(self):
# GH#31499 Hashtable.map_locations needs to work on np.str_ objects
cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py
index db04862e4ea07..e485c7f79b475 100644
--- a/pandas/tests/arrays/integer/test_reduction.py
+++ b/pandas/tests/arrays/integer/test_reduction.py
@@ -1,6 +1,8 @@
import numpy as np
import pytest
+from pandas.compat import HAS_PYARROW
+
import pandas as pd
from pandas import (
DataFrame,
@@ -102,9 +104,10 @@ def test_groupby_reductions(op, expected):
["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
],
)
-def test_mixed_reductions(op, expected, using_infer_string):
- if op in ["any", "all"] and using_infer_string:
- expected = expected.astype("bool")
+def test_mixed_reductions(request, op, expected, using_infer_string):
+ if op in ["any", "all"] and using_infer_string and HAS_PYARROW:
+ # TODO(infer_string) inconsistent result type
+ request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
df = DataFrame(
{
"A": ["a", "b", "b"],
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 7d4aae0f7bb4e..cfba32c62f206 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -4,7 +4,6 @@
import numpy as np
import pytest
-from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td
import pandas as pd
@@ -27,11 +26,10 @@ def test_eq_all_na():
tm.assert_extension_array_equal(result, expected)
-def test_config(string_storage, request, using_infer_string):
- if using_infer_string and string_storage == "python" and HAS_PYARROW:
- # string storage with na_value=NaN always uses pyarrow if available
- # -> does not yet honor the option
- request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
+def test_config(string_storage, using_infer_string):
+ # with the default string_storage setting
+ # always "python" at the moment
+ assert StringDtype().storage == "python"
with pd.option_context("string_storage", string_storage):
assert StringDtype().storage == string_storage
@@ -88,19 +86,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
ArrowStringArray(arr)
-@pytest.mark.xfail(
- reason="dict conversion does not seem to be implemented for large string in arrow"
-)
+@pytest.mark.parametrize("string_type", ["string", "large_string"])
@pytest.mark.parametrize("chunked", [True, False])
-def test_constructor_valid_string_type_value_dictionary(chunked):
+def test_constructor_valid_string_type_value_dictionary(string_type, chunked):
pa = pytest.importorskip("pyarrow")
- arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
+ arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode()
if chunked:
arr = pa.chunked_array(arr)
arr = ArrowStringArray(arr)
- assert pa.types.is_string(arr._pa_array.type.value_type)
+ # dictionary type get converted to dense large string array
+ assert pa.types.is_large_string(arr._pa_array.type)
def test_constructor_from_list():
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 5834b268be2be..59ff4f3122e8f 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -891,20 +891,24 @@ def test_concat_same_type_different_freq(self, unit):
tm.assert_datetime_array_equal(result, expected)
- def test_strftime(self, arr1d):
+ def test_strftime(self, arr1d, using_infer_string):
arr = arr1d
result = arr.strftime("%Y %b")
expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
+ if using_infer_string:
+ expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+ tm.assert_equal(result, expected)
- def test_strftime_nat(self):
+ def test_strftime_nat(self, using_infer_string):
# GH 29578
arr = DatetimeIndex(["2019-01-01", NaT])._data
result = arr.strftime("%Y-%m-%d")
expected = np.array(["2019-01-01", np.nan], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
+ if using_infer_string:
+ expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+ tm.assert_equal(result, expected)
class TestTimedeltaArray(SharedTests):
@@ -1161,20 +1165,24 @@ def test_array_interface(self, arr1d):
expected = np.asarray(arr).astype("S20")
tm.assert_numpy_array_equal(result, expected)
- def test_strftime(self, arr1d):
+ def test_strftime(self, arr1d, using_infer_string):
arr = arr1d
result = arr.strftime("%Y")
expected = np.array([per.strftime("%Y") for per in arr], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
+ if using_infer_string:
+ expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+ tm.assert_equal(result, expected)
- def test_strftime_nat(self):
+ def test_strftime_nat(self, using_infer_string):
# GH 29578
arr = PeriodArray(PeriodIndex(["2019-01-01", NaT], dtype="period[D]"))
result = arr.strftime("%Y-%m-%d")
expected = np.array(["2019-01-01", np.nan], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
+ if using_infer_string:
+ expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+ tm.assert_equal(result, expected)
@pytest.mark.parametrize(
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index dd6bf3c7521f8..13a3ff048c79e 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -1,6 +1,10 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
@@ -20,6 +24,7 @@
SparseArray,
TimedeltaArray,
)
+from pandas.core.arrays.string_ import StringArrayNumpySemantics
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
@@ -218,7 +223,9 @@ def test_iter_box_period(self):
)
def test_values_consistent(arr, expected_type, dtype, using_infer_string):
if using_infer_string and dtype == "object":
- expected_type = ArrowStringArrayNumpySemantics
+ expected_type = (
+ ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArrayNumpySemantics
+ )
l_values = Series(arr)._values
r_values = pd.Index(arr)._values
assert type(l_values) is expected_type
@@ -355,6 +362,9 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
tm.assert_numpy_array_equal(result, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
@pytest.mark.parametrize("as_series", [True, False])
@pytest.mark.parametrize(
"arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index 8724f62de1534..de56d5e4a07ee 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -5,6 +5,7 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under12p0
import pandas.util._test_decorators as td
@@ -197,7 +198,7 @@ def test_astype_arrow_timestamp():
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes_infer_objects():
ser = Series(["a", "b", "c"])
ser_orig = ser.copy()
@@ -213,7 +214,7 @@ def test_convert_dtypes_infer_objects():
tm.assert_series_equal(ser, ser_orig)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes():
df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
df_orig = df.copy()
diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py
index d2e2d43b0a42b..dd4dd154f74b0 100644
--- a/pandas/tests/copy_view/test_functions.py
+++ b/pandas/tests/copy_view/test_functions.py
@@ -3,6 +3,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
from pandas import (
DataFrame,
Index,
@@ -14,7 +16,7 @@
from pandas.tests.copy_view.util import get_array
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_concat_frames():
df = DataFrame({"b": ["a"] * 3})
df2 = DataFrame({"a": ["a"] * 3})
@@ -33,7 +35,7 @@ def test_concat_frames():
tm.assert_frame_equal(df, df_orig)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_concat_frames_updating_input():
df = DataFrame({"b": ["a"] * 3})
df2 = DataFrame({"a": ["a"] * 3})
@@ -153,7 +155,7 @@ def test_concat_copy_keyword():
assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize(
"func",
[
@@ -249,7 +251,7 @@ def test_merge_copy_keyword():
assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_join_on_key():
df_index = Index(["a", "b", "c"], name="key")
@@ -277,7 +279,7 @@ def test_join_on_key():
tm.assert_frame_equal(df2, df2_orig)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_join_multiple_dataframes_on_key():
df_index = Index(["a", "b", "c"], name="key")
diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py
index f80e9b7dcf838..fc57178b897b9 100644
--- a/pandas/tests/copy_view/test_interp_fillna.py
+++ b/pandas/tests/copy_view/test_interp_fillna.py
@@ -3,6 +3,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
from pandas import (
NA,
DataFrame,
@@ -121,7 +123,7 @@ def test_interpolate_cannot_with_object_dtype():
df.interpolate()
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_interpolate_object_convert_no_op():
df = DataFrame({"a": ["a", "b", "c"], "b": 1})
arr_a = get_array(df, "a")
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
index 3716df8fbf855..92e1ba750fae2 100644
--- a/pandas/tests/copy_view/test_methods.py
+++ b/pandas/tests/copy_view/test_methods.py
@@ -3,6 +3,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
import pandas as pd
from pandas import (
DataFrame,
@@ -714,7 +716,7 @@ def test_head_tail(method):
tm.assert_frame_equal(df, df_orig)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_infer_objects():
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
df_orig = df.copy()
@@ -730,6 +732,9 @@ def test_infer_objects():
tm.assert_frame_equal(df, df_orig)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
def test_infer_objects_no_reference():
df = DataFrame(
{
@@ -899,7 +904,7 @@ def test_sort_values_inplace(obj, kwargs):
tm.assert_equal(view, obj_orig)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize("decimals", [-1, 0, 1])
def test_round(decimals):
df = DataFrame({"a": [1, 2], "b": "c"})
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
index c1120ccfea635..58c979fb05089 100644
--- a/pandas/tests/copy_view/test_replace.py
+++ b/pandas/tests/copy_view/test_replace.py
@@ -3,6 +3,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
from pandas import (
Categorical,
DataFrame,
@@ -59,7 +61,7 @@ def test_replace_regex_inplace_refs():
tm.assert_frame_equal(view, df_orig)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_replace_regex_inplace():
df = DataFrame({"a": ["aaa", "bbb"]})
arr = get_array(df, "a")
@@ -257,7 +259,7 @@ def test_replace_empty_list():
assert not df2._mgr._has_no_reference(0)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize("value", ["d", None])
def test_replace_object_list_inplace(value):
df = DataFrame({"a": ["a", "b", "c"]})
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 4bf97b1fd8494..2c2dff7a957fe 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -3,6 +3,7 @@
import numpy as np
import pytest
+from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td
from pandas.core.dtypes.astype import astype_array
@@ -802,13 +803,17 @@ def test_pandas_dtype_ea_not_instance():
def test_pandas_dtype_string_dtypes(string_storage):
- # TODO(infer_string) remove skip if "python" is supported
- pytest.importorskip("pyarrow")
+ with pd.option_context("future.infer_string", True):
+ # with the default string_storage setting
+ result = pandas_dtype("str")
+ assert result == pd.StringDtype(
+ "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
+ )
+
with pd.option_context("future.infer_string", True):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype("str")
- # TODO(infer_string) hardcoded to pyarrow until python is supported
- assert result == pd.StringDtype("pyarrow", na_value=np.nan)
+ assert result == pd.StringDtype(string_storage, na_value=np.nan)
with pd.option_context("future.infer_string", False):
with pd.option_context("string_storage", string_storage):
diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py
index fad2560265d21..ff9f3cbed64a2 100644
--- a/pandas/tests/extension/base/ops.py
+++ b/pandas/tests/extension/base/ops.py
@@ -7,6 +7,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
from pandas.core.dtypes.common import is_string_dtype
import pandas as pd
@@ -140,6 +142,12 @@ class BaseArithmeticOpsTests(BaseOpsUtil):
series_array_exc: type[Exception] | None = TypeError
divmod_exc: type[Exception] | None = TypeError
+ # TODO(infer_string) need to remove import of pyarrow
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW,
+ reason="TODO(infer_string)",
+ strict=False,
+ )
def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
# series & scalar
if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype):
@@ -149,6 +157,11 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
ser = pd.Series(data)
self.check_opname(ser, op_name, ser.iloc[0])
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW,
+ reason="TODO(infer_string)",
+ strict=False,
+ )
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
# frame & scalar
if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype):
@@ -158,12 +171,22 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
df = pd.DataFrame({"A": data})
self.check_opname(df, op_name, data[0])
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW,
+ reason="TODO(infer_string)",
+ strict=False,
+ )
def test_arith_series_with_array(self, data, all_arithmetic_operators):
# ndarray & other series
op_name = all_arithmetic_operators
ser = pd.Series(data)
self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser)))
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW,
+ reason="TODO(infer_string)",
+ strict=False,
+ )
def test_divmod(self, data):
ser = pd.Series(data)
self._check_divmod_op(ser, divmod, 1)
@@ -179,6 +202,7 @@ def test_divmod_series_array(self, data, data_for_twos):
other = pd.Series(other)
self._check_divmod_op(other, ops.rdivmod, ser)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_add_series_with_extension_array(self, data):
# Check adding an ExtensionArray to a Series of the same dtype matches
# the behavior of adding the arrays directly and then wrapping in a
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index 8f8af607585df..c3d4b83f731a3 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -140,6 +140,7 @@ def test_map(self, data, na_action):
result = data.map(lambda x: x, na_action=na_action)
tm.assert_extension_array_equal(result, data)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
# frame & scalar
op_name = all_arithmetic_operators
@@ -151,6 +152,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
)
super().test_arith_frame_with_scalar(data, op_name)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
op_name = all_arithmetic_operators
if op_name == "__rmod__":
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index 79cfb736941d6..1b251a5118681 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -19,6 +19,8 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
from pandas.core.dtypes.dtypes import NumpyEADtype
import pandas as pd
@@ -255,6 +257,7 @@ def test_insert_invalid(self, data, invalid_scalar):
frame_scalar_exc = None
series_array_exc = None
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_divmod(self, data):
divmod_exc = None
if data.dtype.kind == "O":
@@ -262,6 +265,7 @@ def test_divmod(self, data):
self.divmod_exc = divmod_exc
super().test_divmod(data)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_divmod_series_array(self, data):
ser = pd.Series(data)
exc = None
@@ -270,6 +274,7 @@ def test_divmod_series_array(self, data):
self.divmod_exc = exc
self._check_divmod_op(ser, divmod, data)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
opname = all_arithmetic_operators
series_scalar_exc = None
@@ -283,6 +288,7 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request)
self.series_scalar_exc = series_scalar_exc
super().test_arith_series_with_scalar(data, all_arithmetic_operators)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_arith_series_with_array(self, data, all_arithmetic_operators):
opname = all_arithmetic_operators
series_array_exc = None
@@ -291,6 +297,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators):
self.series_array_exc = series_array_exc
super().test_arith_series_with_array(data, all_arithmetic_operators)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
opname = all_arithmetic_operators
frame_scalar_exc = None
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index 826ac2be3339b..8ce4e8725d632 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -12,6 +12,7 @@
from pandas._config import using_string_dtype
from pandas._libs import iNaT
+from pandas.compat import HAS_PYARROW
from pandas.errors import InvalidIndexError
from pandas.core.dtypes.common import is_integer
@@ -1148,7 +1149,9 @@ def test_loc_setitem_datetimelike_with_inference(self):
)
tm.assert_series_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ @pytest.mark.xfail(
+ using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_getitem_boolean_indexing_mixed(self):
df = DataFrame(
{
diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
index 1d7b3e12b2e86..32a827c25c77a 100644
--- a/pandas/tests/frame/indexing/test_where.py
+++ b/pandas/tests/frame/indexing/test_where.py
@@ -6,6 +6,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
from pandas.core.dtypes.common import is_scalar
import pandas as pd
@@ -938,6 +940,9 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype):
obj.mask(mask, null)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
@given(data=OPTIONAL_ONE_OF_ALL)
def test_where_inplace_casting(data):
# GH 22051
@@ -1018,6 +1023,9 @@ def test_where_producing_ea_cond_for_np_dtype():
tm.assert_frame_equal(result, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
@pytest.mark.parametrize(
"replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)]
)
diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
index 59779234b46d9..e7f6e5d625d3e 100644
--- a/pandas/tests/frame/methods/test_convert_dtypes.py
+++ b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -3,21 +3,15 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
import pandas._testing as tm
class TestConvertDtypes:
- # TODO convert_dtypes should not use NaN variant of string dtype, but always NA
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
)
- def test_convert_dtypes(
- self, convert_integer, expected, string_storage, using_infer_string
- ):
+ def test_convert_dtypes(self, convert_integer, expected, string_storage):
# Specific types are tested in tests/series/test_dtypes.py
# Just check that it works for DataFrame here
df = pd.DataFrame(
@@ -182,7 +176,6 @@ def test_convert_dtypes_pyarrow_timestamp(self):
result = expected.convert_dtypes(dtype_backend="pyarrow")
tm.assert_series_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_convert_dtypes_avoid_block_splitting(self):
# GH#55341
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
@@ -197,7 +190,6 @@ def test_convert_dtypes_avoid_block_splitting(self):
tm.assert_frame_equal(result, expected)
assert result._mgr.nblocks == 2
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_convert_dtypes_from_arrow(self):
# GH#56581
df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py
index 419fb75cb3669..7feb3b6fd816d 100644
--- a/pandas/tests/frame/methods/test_drop_duplicates.py
+++ b/pandas/tests/frame/methods/test_drop_duplicates.py
@@ -476,3 +476,41 @@ def test_drop_duplicates_non_boolean_ignore_index(arg):
msg = '^For argument "ignore_index" expected type bool, received type .*.$'
with pytest.raises(ValueError, match=msg):
df.drop_duplicates(ignore_index=arg)
+
+
+def test_drop_duplicates_set():
+ # GH#59237
+ df = DataFrame(
+ {
+ "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
+ "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
+ "C": [1, 1, 2, 2, 2, 2, 1, 2],
+ "D": range(8),
+ }
+ )
+ # single column
+ result = df.drop_duplicates({"AAA"})
+ expected = df[:2]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates({"AAA"}, keep="last")
+ expected = df.loc[[6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates({"AAA"}, keep=False)
+ expected = df.loc[[]]
+ tm.assert_frame_equal(result, expected)
+ assert len(result) == 0
+
+ # multi column
+ expected = df.loc[[0, 1, 2, 3]]
+ result = df.drop_duplicates({"AAA", "B"})
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates({"AAA", "B"}, keep="last")
+ expected = df.loc[[0, 5, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates({"AAA", "B"}, keep=False)
+ expected = df.loc[[0]]
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py
index a4319f8a8ae7f..aad43b7a77ac7 100644
--- a/pandas/tests/frame/methods/test_info.py
+++ b/pandas/tests/frame/methods/test_info.py
@@ -10,6 +10,7 @@
from pandas._config import using_string_dtype
from pandas.compat import (
+ HAS_PYARROW,
IS64,
PYPY,
)
@@ -520,7 +521,7 @@ def test_info_int_columns():
assert result == expected
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_memory_usage_empty_no_warning():
# GH#50066
df = DataFrame(index=["a", "b"])
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
index 79aabbcc83bbf..4e8e267523439 100644
--- a/pandas/tests/frame/methods/test_rank.py
+++ b/pandas/tests/frame/methods/test_rank.py
@@ -6,10 +6,13 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
from pandas._libs.algos import (
Infinity,
NegInfinity,
)
+from pandas.compat import HAS_PYARROW
from pandas import (
DataFrame,
@@ -464,9 +467,18 @@ def test_rank_inf_nans_na_option(
],
)
def test_rank_object_first(
- self, frame_or_series, na_option, ascending, expected, using_infer_string
+ self,
+ request,
+ frame_or_series,
+ na_option,
+ ascending,
+ expected,
+ using_infer_string,
):
obj = frame_or_series(["foo", "foo", None, "foo"])
+ if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series):
+ request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
+
result = obj.rank(method="first", na_option=na_option, ascending=ascending)
expected = frame_or_series(expected)
if using_infer_string and isinstance(obj, Series):
diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py
index 4136d641ef67f..7670b53f23173 100644
--- a/pandas/tests/frame/methods/test_value_counts.py
+++ b/pandas/tests/frame/methods/test_value_counts.py
@@ -1,6 +1,10 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
import pandas as pd
import pandas._testing as tm
@@ -132,6 +136,9 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture):
tm.assert_series_equal(result, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
def test_data_frame_value_counts_dropna_false(nulls_fixture):
# GH 41334
df = pd.DataFrame(
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index e8ef0592ac432..f8219e68a72da 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -8,6 +8,8 @@
from pandas._config import using_string_dtype
from pandas._config.config import option_context
+from pandas.compat import HAS_PYARROW
+
import pandas as pd
from pandas import (
DataFrame,
@@ -113,7 +115,9 @@ def test_not_hashable(self):
with pytest.raises(TypeError, match=msg):
hash(empty_frame)
- @pytest.mark.xfail(using_string_dtype(), reason="surrogates not allowed")
+ @pytest.mark.xfail(
+ using_string_dtype() and HAS_PYARROW, reason="surrogates not allowed"
+ )
def test_column_name_contains_unicode_surrogate(self):
# GH 25509
colname = "\ud83d"
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
index 734bfc8b30053..e41a3b27e592c 100644
--- a/pandas/tests/frame/test_arithmetic.py
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -13,6 +13,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
import pandas as pd
from pandas import (
DataFrame,
@@ -1542,7 +1544,9 @@ def test_comparisons(self, simple_frame, float_frame, func):
with pytest.raises(ValueError, match=msg):
func(simple_frame, simple_frame[:2])
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ @pytest.mark.xfail(
+ using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne):
# GH 11565
df = DataFrame(
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index a210af94561f9..0176a36fe78d7 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -24,6 +24,7 @@
from pandas._config import using_string_dtype
from pandas._libs import lib
+from pandas.compat import HAS_PYARROW
from pandas.compat.numpy import np_version_gt2
from pandas.errors import IntCastingNaNError
@@ -299,7 +300,7 @@ def test_constructor_dtype_nocast_view_2d_array(self):
df2 = DataFrame(df.values, dtype=df[0].dtype)
assert df2._mgr.blocks[0].values.flags.c_contiguous
- @pytest.mark.xfail(using_string_dtype(), reason="conversion copies")
+ @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies")
def test_1d_object_array_does_not_copy(self):
# https://github.com/pandas-dev/pandas/issues/39272
arr = np.array(["a", "b"], dtype="object")
diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py
index ad54cfaf9d927..6788721e8a72e 100644
--- a/pandas/tests/frame/test_logical_ops.py
+++ b/pandas/tests/frame/test_logical_ops.py
@@ -4,6 +4,10 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
from pandas import (
CategoricalIndex,
DataFrame,
@@ -96,6 +100,9 @@ def test_logical_ops_int_frame(self):
res_ser = df1a_int["A"] | df1a_bool["A"]
tm.assert_series_equal(res_ser, df1a_bool["A"])
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_logical_ops_invalid(self, using_infer_string):
# GH#5808
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 4c355ed92b6c3..1d667d35db253 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -226,6 +226,7 @@ def float_frame_with_na():
class TestDataFrameAnalytics:
# ---------------------------------------------------------------------
# Reductions
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
"opname",
@@ -431,6 +432,7 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis):
expected[expected.isna()] = None
tm.assert_series_equal(result, expected)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
def test_mixed_ops(self, op):
# GH#16116
@@ -532,7 +534,7 @@ def test_mean_mixed_string_decimal(self):
df = DataFrame(d)
with pytest.raises(
- TypeError, match="unsupported operand type|does not support"
+ TypeError, match="unsupported operand type|does not support|Cannot perform"
):
df.mean()
result = df[["A", "C"]].mean()
@@ -690,6 +692,7 @@ def test_mode_dropna(self, dropna, expected):
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_mode_sortwarning(self, using_infer_string):
# Check for the warning that is raised when the mode
# results cannot be sorted
@@ -979,7 +982,7 @@ def test_sum_mixed_datetime(self):
def test_mean_corner(self, float_frame, float_string_frame):
# unit test when have object data
- msg = "Could not convert|does not support"
+ msg = "Could not convert|does not support|Cannot perform"
with pytest.raises(TypeError, match=msg):
float_string_frame.mean(axis=0)
@@ -1093,6 +1096,7 @@ def test_idxmin_empty(self, index, skipna, axis):
expected = Series(dtype=index.dtype)
tm.assert_series_equal(result, expected)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmin_numeric_only(self, numeric_only):
df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
@@ -1143,6 +1147,7 @@ def test_idxmax_empty(self, index, skipna, axis):
expected = Series(dtype=index.dtype)
tm.assert_series_equal(result, expected)
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmax_numeric_only(self, numeric_only):
df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
@@ -1964,7 +1969,7 @@ def test_minmax_extensionarray(method, numeric_only):
def test_frame_mixed_numeric_object_with_timestamp(ts_value):
# GH 13912
df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]})
- with pytest.raises(TypeError, match="does not support operation"):
+ with pytest.raises(TypeError, match="does not support operation|Cannot perform"):
df.sum()
diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py
index 1887fa61ad081..5bbe047078c6e 100644
--- a/pandas/tests/frame/test_unary.py
+++ b/pandas/tests/frame/test_unary.py
@@ -5,6 +5,7 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
from pandas.compat.numpy import np_version_gte1p25
import pandas as pd
@@ -42,6 +43,11 @@ def test_neg_object(self, df, expected):
tm.assert_frame_equal(-df, expected)
tm.assert_series_equal(-df["a"], expected["a"])
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW,
+ reason="TODO(infer_string)",
+ strict=False,
+ )
@pytest.mark.parametrize(
"df_data",
[
@@ -130,7 +136,9 @@ def test_pos_object(self, df_data):
tm.assert_frame_equal(+df, df)
tm.assert_series_equal(+df["a"], df["a"])
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ @pytest.mark.xfail(
+ using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)"
+ )
@pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning")
def test_pos_object_raises(self):
# GH#21380
diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py
index 14d3dbd6fa496..18802ebd002fc 100644
--- a/pandas/tests/groupby/methods/test_value_counts.py
+++ b/pandas/tests/groupby/methods/test_value_counts.py
@@ -9,6 +9,7 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td
from pandas import (
@@ -500,6 +501,9 @@ def test_dropna_combinations(
tm.assert_series_equal(result, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
@pytest.mark.parametrize(
"dropna, expected_data, expected_index",
[
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 791f279bffc94..11b874d0b1608 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -8,6 +8,7 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
from pandas.errors import SpecificationError
import pandas.util._test_decorators as td
@@ -1407,6 +1408,10 @@ def g(group):
tm.assert_series_equal(result, expected)
+# TODO harmonize error messages
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
@pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
def test_set_group_name(df, grouper, using_infer_string):
def f(group):
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index d42aa06d6bbfe..02071acf378dd 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -3,6 +3,7 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under10p1
from pandas.core.dtypes.missing import na_value_for_dtype
@@ -12,6 +13,9 @@
from pandas.tests.groupby import get_groupby_method_args
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
@@ -55,6 +59,9 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
tm.assert_frame_equal(grouped, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
@@ -131,6 +138,9 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
tm.assert_frame_equal(grouped, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
@pytest.mark.parametrize(
"dropna, idx, expected",
[
@@ -205,6 +215,9 @@ def test_groupby_dataframe_slice_then_transform(dropna, index):
tm.assert_series_equal(result, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
@@ -286,6 +299,9 @@ def test_groupby_dropna_datetime_like_data(
tm.assert_frame_equal(grouped, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
+)
@pytest.mark.parametrize(
"dropna, data, selected_data, levels",
[
@@ -371,6 +387,9 @@ def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
tm.assert_equal(result, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
def test_groupby_nan_included():
# GH 35646
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py
index c6697fd169e8a..78a79ac7d1546 100644
--- a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py
+++ b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py
@@ -9,7 +9,6 @@
from dateutil.tz import gettz
import numpy as np
import pytest
-import pytz
from pandas import (
DatetimeIndex,
@@ -69,10 +68,10 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self):
times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"]
index = DatetimeIndex(times)
tz = "US/Eastern"
- with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)):
+ with pytest.raises(ValueError, match="|".join(times)):
index.tz_localize(tz=tz)
- with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)):
+ with pytest.raises(ValueError, match="|".join(times)):
index.tz_localize(tz=tz, nonexistent="raise")
result = index.tz_localize(tz=tz, nonexistent="NaT")
@@ -85,7 +84,7 @@ def test_dti_tz_localize_ambiguous_infer(self, tz):
# November 6, 2011, fall back, repeat 2 AM hour
# With no repeated hours, we cannot infer the transition
dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour())
- with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"):
+ with pytest.raises(ValueError, match="Cannot infer dst time"):
dr.tz_localize(tz)
def test_dti_tz_localize_ambiguous_infer2(self, tz, unit):
@@ -117,7 +116,7 @@ def test_dti_tz_localize_ambiguous_infer3(self, tz):
def test_dti_tz_localize_ambiguous_times(self, tz):
# March 13, 2011, spring forward, skip from 2 AM to 3 AM
dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=offsets.Hour())
- with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:30:00"):
+ with pytest.raises(ValueError, match="2011-03-13 02:30:00"):
dr.tz_localize(tz)
# after dst transition, it works
@@ -127,7 +126,7 @@ def test_dti_tz_localize_ambiguous_times(self, tz):
# November 6, 2011, fall back, repeat 2 AM hour
dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=offsets.Hour())
- with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"):
+ with pytest.raises(ValueError, match="Cannot infer dst time"):
dr.tz_localize(tz)
# UTC is OK
@@ -163,11 +162,11 @@ def test_dti_tz_localize(self, prefix):
tm.assert_numpy_array_equal(dti3.values, dti_utc.values)
dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="ms")
- with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"):
+ with pytest.raises(ValueError, match="Cannot infer dst time"):
dti.tz_localize(tzstr)
dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="ms")
- with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"):
+ with pytest.raises(ValueError, match="2011-03-13 02:00:00"):
dti.tz_localize(tzstr)
def test_dti_tz_localize_utc_conversion(self, tz):
@@ -184,7 +183,7 @@ def test_dti_tz_localize_utc_conversion(self, tz):
# DST ambiguity, this should fail
rng = date_range("3/11/2012", "3/12/2012", freq="30min")
# Is this really how it should fail??
- with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"):
+ with pytest.raises(ValueError, match="2012-03-11 02:00:00"):
rng.tz_localize(tz)
def test_dti_tz_localize_roundtrip(self, tz_aware_fixture):
diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py
index aba440ceeb56b..8da88b97f9ea8 100644
--- a/pandas/tests/indexes/datetimes/test_constructors.py
+++ b/pandas/tests/indexes/datetimes/test_constructors.py
@@ -14,7 +14,6 @@
from dateutil.tz import gettz
import numpy as np
import pytest
-import pytz
from pandas._libs.tslibs import (
astype_overflowsafe,
@@ -750,7 +749,7 @@ def test_disallow_setting_tz(self):
[
None,
"America/Los_Angeles",
- pytz.timezone("America/Los_Angeles"),
+ zoneinfo.ZoneInfo("America/Los_Angeles"),
Timestamp("2000", tz="America/Los_Angeles").tz,
],
)
@@ -765,8 +764,8 @@ def test_constructor_start_end_with_tz(self, tz):
freq="D",
)
tm.assert_index_equal(result, expected)
- # Especially assert that the timezone is consistent for pytz
- assert pytz.timezone("America/Los_Angeles") is result.tz
+ # Especially assert that the timezone is consistent for zoneinfo
+ assert zoneinfo.ZoneInfo("America/Los_Angeles") is result.tz
@pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"])
def test_constructor_with_non_normalized_pytz(self, tz):
@@ -984,6 +983,7 @@ def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request):
# GH#47471 check that we get the same raising behavior in the DTI
# constructor and Timestamp constructor
if isinstance(tz, str) and tz.startswith("pytz/"):
+ pytz = pytest.importorskip("pytz")
tz = pytz.timezone(tz.removeprefix("pytz/"))
dtstr = "2013-11-03 01:59:59.999999"
item = dtstr
@@ -1000,7 +1000,7 @@ def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request):
mark = pytest.mark.xfail(reason="We implicitly get fold=0.")
request.applymarker(mark)
- with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
+ with pytest.raises(ValueError, match=dtstr):
box_cls(item, tz=tz)
@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
index b37b5cf74b347..e09883e95ecec 100644
--- a/pandas/tests/indexes/datetimes/test_date_range.py
+++ b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -11,7 +11,6 @@
import numpy as np
import pytest
-import pytz
from pandas._libs.tslibs import timezones
from pandas._libs.tslibs.offsets import (
@@ -881,7 +880,7 @@ def test_date_range_ambiguous_endpoint(self, tz):
# construction with an ambiguous end-point
# GH#11626
- with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"):
+ with pytest.raises(ValueError, match="Cannot infer dst time"):
date_range(
"2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="h"
)
@@ -905,7 +904,7 @@ def test_date_range_ambiguous_endpoint(self, tz):
def test_date_range_nonexistent_endpoint(self, tz, option, expected):
# construction with an nonexistent end-point
- with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"):
+ with pytest.raises(ValueError, match="2019-03-10 02:00:00"):
date_range(
"2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="h"
)
@@ -1259,6 +1258,24 @@ def test_range_with_timezone_and_custombusinessday(self, start, period, expected
expected = DatetimeIndex(expected).as_unit("ns")
tm.assert_index_equal(result, expected)
+ def test_date_range_custombusinessday_partial_time(self, unit):
+ # GH#57456
+ offset = offsets.CustomBusinessDay(weekmask="Sun Mon Tue")
+ start = datetime(2024, 2, 6, 23)
+ # end datetime is partial and not in the offset
+ end = datetime(2024, 2, 14, 14)
+ result = date_range(start, end, freq=offset, unit=unit)
+ expected = DatetimeIndex(
+ [
+ "2024-02-06 23:00:00",
+ "2024-02-11 23:00:00",
+ "2024-02-12 23:00:00",
+ "2024-02-13 23:00:00",
+ ],
+ dtype=f"M8[{unit}]",
+ )
+ tm.assert_index_equal(result, expected)
+
class TestDateRangeNonNano:
def test_date_range_reso_validation(self):
diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py
index e4b8a909add0d..8d9340818b511 100644
--- a/pandas/tests/indexes/datetimes/test_timezones.py
+++ b/pandas/tests/indexes/datetimes/test_timezones.py
@@ -184,11 +184,8 @@ def test_dti_tz_nat(self, tzstr):
assert isna(idx[1])
assert idx[0].tzinfo is not None
- @pytest.mark.parametrize("tzstr", ["pytz/US/Eastern", "dateutil/US/Eastern"])
+ @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
def test_utc_box_timestamp_and_localize(self, tzstr):
- if tzstr.startswith("pytz/"):
- pytest.importorskip("pytz")
- tzstr = tzstr.removeprefix("pytz/")
tz = timezones.maybe_get_tz(tzstr)
rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc")
@@ -203,11 +200,10 @@ def test_utc_box_timestamp_and_localize(self, tzstr):
# right tzinfo
rng = date_range("3/13/2012", "3/14/2012", freq="h", tz="utc")
rng_eastern = rng.tz_convert(tzstr)
- # test not valid for dateutil timezones.
- # assert 'EDT' in repr(rng_eastern[0].tzinfo)
- assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr(
- rng_eastern[0].tzinfo
- )
+ if "dateutil" in tzstr:
+ assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr(
+ rng_eastern[0].tzinfo
+ )
@pytest.mark.parametrize(
"tz", [zoneinfo.ZoneInfo("US/Central"), gettz("US/Central")]
diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py
index 1eeeebd6b8ca9..e3428d1060dbe 100644
--- a/pandas/tests/indexes/object/test_indexing.py
+++ b/pandas/tests/indexes/object/test_indexing.py
@@ -3,10 +3,13 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
from pandas._libs.missing import (
NA,
is_matching_na,
)
+from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td
import pandas as pd
@@ -29,6 +32,9 @@ def test_get_indexer_strings(self, method, expected):
tm.assert_numpy_array_equal(actual, expected)
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_get_indexer_strings_raises(self, using_infer_string):
index = Index(["b", "c"])
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 0911f2aec74d6..7ec66100b7291 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -8,7 +8,12 @@
import numpy as np
import pytest
-from pandas.compat import IS64
+from pandas._config import using_string_dtype
+
+from pandas.compat import (
+ HAS_PYARROW,
+ IS64,
+)
from pandas.errors import InvalidIndexError
import pandas.util._test_decorators as td
@@ -71,6 +76,9 @@ def test_constructor_casting(self, index):
tm.assert_contains_all(arr, new_index)
tm.assert_index_equal(index, new_index)
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_constructor_copy(self, using_infer_string):
index = Index(list("abc"), name="name")
arr = np.array(index)
@@ -335,6 +343,11 @@ def test_constructor_empty_special(self, empty, klass):
def test_view_with_args(self, index):
index.view("i8")
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW,
+ reason="TODO(infer_string)",
+ strict=False,
+ )
@pytest.mark.parametrize(
"index",
[
@@ -817,6 +830,11 @@ def test_isin(self, values, index, expected):
expected = np.array(expected, dtype=bool)
tm.assert_numpy_array_equal(result, expected)
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW,
+ reason="TODO(infer_string)",
+ strict=False,
+ )
def test_isin_nan_common_object(
self, nulls_fixture, nulls_fixture2, using_infer_string
):
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py
index 6d01ba6adc87a..9993a21d93f12 100644
--- a/pandas/tests/indexes/test_old_base.py
+++ b/pandas/tests/indexes/test_old_base.py
@@ -9,6 +9,7 @@
from pandas._config import using_string_dtype
from pandas._libs.tslibs import Timestamp
+from pandas.compat import HAS_PYARROW
from pandas.core.dtypes.common import (
is_integer_dtype,
@@ -245,6 +246,11 @@ def test_repr_max_seq_item_setting(self, simple_index):
repr(idx)
assert "..." not in str(idx)
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW,
+ reason="TODO(infer_string)",
+ strict=False,
+ )
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_ensure_copied_data(self, index):
# Check the "copy" argument of each Index.__new__ is honoured
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 247501f1504e7..e007b8c4e97ac 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -16,6 +16,7 @@
from pandas._config import using_string_dtype
from pandas._libs import index as libindex
+from pandas.compat import HAS_PYARROW
from pandas.errors import IndexingError
import pandas as pd
@@ -1388,6 +1389,9 @@ def test_loc_setitem_categorical_values_partial_column_slice(self):
df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_loc_setitem_single_row_categorical(self, using_infer_string):
# GH#25495
df = DataFrame({"Alpha": ["a"], "Numeric": [0]})
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 65a52bc8e0794..b831ec3bb2c6a 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -30,10 +30,6 @@
read_csv,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
engine_params = [
@@ -692,43 +688,33 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel):
)
tm.assert_frame_equal(result, df)
- @pytest.mark.xfail(
- using_string_dtype(), reason="infer_string takes precedence", strict=False
- )
def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel):
# GH#36712
if read_ext in (".xlsb", ".xls"):
pytest.skip(f"No engine for filetype: '{read_ext}'")
- pa = pytest.importorskip("pyarrow")
+ df = DataFrame(
+ {
+ "a": np.array(["a", "b"], dtype=np.object_),
+ "b": np.array(["x", pd.NA], dtype=np.object_),
+ }
+ )
+ df.to_excel(tmp_excel, sheet_name="test", index=False)
with pd.option_context("mode.string_storage", string_storage):
- df = DataFrame(
- {
- "a": np.array(["a", "b"], dtype=np.object_),
- "b": np.array(["x", pd.NA], dtype=np.object_),
- }
- )
- df.to_excel(tmp_excel, sheet_name="test", index=False)
result = pd.read_excel(
tmp_excel, sheet_name="test", dtype_backend="numpy_nullable"
)
- if string_storage == "python":
- expected = DataFrame(
- {
- "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
- "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
- }
- )
- else:
- expected = DataFrame(
- {
- "a": ArrowStringArray(pa.array(["a", "b"])),
- "b": ArrowStringArray(pa.array(["x", None])),
- }
- )
- tm.assert_frame_equal(result, expected)
+ expected = DataFrame(
+ {
+ "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)),
+ "b": Series(["x", None], dtype=pd.StringDtype(string_storage)),
+ }
+ )
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that for checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
@pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index e1cdfb8bfa7e3..44266ae9a62a5 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -282,7 +282,6 @@ def test_excel_multindex_roundtrip(
)
tm.assert_frame_equal(df, act, check_names=check_names)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_read_excel_parse_dates(self, tmp_excel):
# see gh-11544, gh-12051
df = DataFrame(
diff --git a/pandas/tests/io/formats/style/test_bar.py b/pandas/tests/io/formats/style/test_bar.py
index b0e4712e8bb3d..d28c7c566d851 100644
--- a/pandas/tests/io/formats/style/test_bar.py
+++ b/pandas/tests/io/formats/style/test_bar.py
@@ -347,6 +347,7 @@ def test_styler_bar_with_NA_values():
def test_style_bar_with_pyarrow_NA_values():
+ pytest.importorskip("pyarrow")
data = """name,age,test1,test2,teacher
Adam,15,95.0,80,Ashby
Bob,16,81.0,82,Ashby
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 1bc227369a968..3d07c0219691e 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -28,11 +28,6 @@
read_json,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
-from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
from pandas.io.json import ujson_dumps
@@ -2143,12 +2138,10 @@ def test_json_uint64(self):
result = df.to_json(orient="split")
assert result == expected
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_read_json_dtype_backend(
self, string_storage, dtype_backend, orient, using_infer_string
):
# GH#50750
- pa = pytest.importorskip("pyarrow")
df = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2162,30 +2155,18 @@ def test_read_json_dtype_backend(
}
)
- if using_infer_string:
- string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
- elif string_storage == "python":
- string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
- string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
-
- elif dtype_backend == "pyarrow":
- pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
-
- else:
- string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
out = df.to_json(orient=orient)
with pd.option_context("mode.string_storage", string_storage):
result = read_json(
StringIO(out), dtype_backend=dtype_backend, orient=orient
)
+ if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
+ string_dtype = pd.ArrowDtype(pa.string())
+ else:
+ string_dtype = pd.StringDtype(string_storage)
+
expected = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2194,12 +2175,13 @@ def test_read_json_dtype_backend(
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
- "g": string_array,
- "h": string_array_na,
+ "g": Series(["a", "b", "c"], dtype=string_dtype),
+ "h": Series(["a", "b", None], dtype=string_dtype),
}
)
if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray
expected = DataFrame(
@@ -2212,7 +2194,9 @@ def test_read_json_dtype_backend(
if orient == "values":
expected.columns = list(range(8))
- tm.assert_frame_equal(result, expected)
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that for checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
@pytest.mark.parametrize("orient", ["split", "records", "index"])
def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index 6d5f870f07206..90f77a7024235 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -4,6 +4,7 @@
import pytest
+from pandas.compat import HAS_PYARROW
from pandas.compat._optional import VERSIONS
from pandas import (
@@ -117,7 +118,15 @@ def csv1(datapath):
_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
-_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)]
+_pyarrow_parsers_only = [
+ pytest.param(
+ _pyarrowParser,
+ marks=[
+ pytest.mark.single_cpu,
+ pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"),
+ ],
+ )
+]
_all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only]
@@ -181,7 +190,16 @@ def _get_all_parser_float_precision_combinations():
parser = parser.values[0]
for precision in parser.float_precision_choices:
# Re-wrap in pytest.param for pyarrow
- mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else ()
+ mark = (
+ [
+ pytest.mark.single_cpu,
+ pytest.mark.skipif(
+ not HAS_PYARROW, reason="pyarrow is not installed"
+ ),
+ ]
+ if parser.engine == "pyarrow"
+ else ()
+ )
param = pytest.param((parser(), precision), marks=mark)
params.append(param)
ids.append(f"{parser_id}-{precision}")
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 3f410a13c8f80..07f29518b7881 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -19,11 +19,7 @@
Timestamp,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- IntegerArray,
- StringArray,
-)
+from pandas.core.arrays import IntegerArray
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
@@ -463,11 +459,8 @@ def test_dtype_backend_and_dtype(all_parsers):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_dtype_backend_string(all_parsers, string_storage):
# GH#36712
- pa = pytest.importorskip("pyarrow")
-
with pd.option_context("mode.string_storage", string_storage):
parser = all_parsers
@@ -477,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage):
"""
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")
- if string_storage == "python":
- expected = DataFrame(
- {
- "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
- "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
- }
- )
- else:
- expected = DataFrame(
- {
- "a": ArrowStringArray(pa.array(["a", "b"])),
- "b": ArrowStringArray(pa.array(["x", None])),
- }
- )
- tm.assert_frame_equal(result, expected)
+ expected = DataFrame(
+ {
+ "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
+ "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
+ },
+ )
+ tm.assert_frame_equal(result, expected)
def test_dtype_backend_ea_dtype_specified(all_parsers):
@@ -507,7 +492,6 @@ def test_dtype_backend_ea_dtype_specified(all_parsers):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_dtype_backend_pyarrow(all_parsers, request):
# GH#36712
pa = pytest.importorskip("pyarrow")
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index b7b4a77c9e048..6243185294894 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -13,8 +13,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.errors import EmptyDataError
import pandas as pd
@@ -23,10 +21,6 @@
DatetimeIndex,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.io.common import urlopen
from pandas.io.parsers import (
@@ -941,39 +935,30 @@ def test_widths_and_usecols():
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_dtype_backend(string_storage, dtype_backend):
# GH#50289
- if string_storage == "python":
- arr = StringArray(np.array(["a", "b"], dtype=np.object_))
- arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
- elif dtype_backend == "pyarrow":
- pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- arr = ArrowExtensionArray(pa.array(["a", "b"]))
- arr_na = ArrowExtensionArray(pa.array([None, "a"]))
- else:
- pa = pytest.importorskip("pyarrow")
- arr = ArrowStringArray(pa.array(["a", "b"]))
- arr_na = ArrowStringArray(pa.array([None, "a"]))
-
data = """a b c d e f g h i
1 2.5 True a
3 4.5 False b True 6 7.5 a"""
with pd.option_context("mode.string_storage", string_storage):
result = read_fwf(StringIO(data), dtype_backend=dtype_backend)
+ if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
+ string_dtype = pd.ArrowDtype(pa.string())
+ else:
+ string_dtype = pd.StringDtype(string_storage)
+
expected = DataFrame(
{
"a": pd.Series([1, 3], dtype="Int64"),
"b": pd.Series([2.5, 4.5], dtype="Float64"),
"c": pd.Series([True, False], dtype="boolean"),
- "d": arr,
+ "d": pd.Series(["a", "b"], dtype=string_dtype),
"e": pd.Series([pd.NA, True], dtype="boolean"),
"f": pd.Series([pd.NA, 6], dtype="Int64"),
"g": pd.Series([pd.NA, 7.5], dtype="Float64"),
- "h": arr_na,
+ "h": pd.Series([None, "a"], dtype=string_dtype),
"i": pd.Series([pd.NA, pd.NA], dtype="Int64"),
}
)
@@ -989,7 +974,9 @@ def test_dtype_backend(string_storage, dtype_backend):
)
expected["i"] = ArrowExtensionArray(pa.array([None, None]))
- tm.assert_frame_equal(result, expected)
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that for checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
def test_invalid_dtype_backend():
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
index 923b880004c26..541cc39606047 100644
--- a/pandas/tests/io/test_clipboard.py
+++ b/pandas/tests/io/test_clipboard.py
@@ -19,10 +19,6 @@
read_clipboard,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.io.clipboard import (
CheckedCall,
@@ -358,23 +354,15 @@ def test_read_clipboard_dtype_backend(
self, clipboard, string_storage, dtype_backend, engine
):
# GH#50502
- if string_storage == "pyarrow" or dtype_backend == "pyarrow":
- pa = pytest.importorskip("pyarrow")
-
- if string_storage == "python":
- string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
- string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
-
- elif dtype_backend == "pyarrow" and engine != "c":
+ if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["x", "y"]))
- string_array_na = ArrowExtensionArray(pa.array(["x", None]))
-
+ if engine == "c" and string_storage == "pyarrow":
+ # TODO avoid this exception?
+ string_dtype = pd.ArrowDtype(pa.large_string())
+ else:
+ string_dtype = pd.ArrowDtype(pa.string())
else:
- string_array = ArrowStringArray(pa.array(["x", "y"]))
- string_array_na = ArrowStringArray(pa.array(["x", None]))
+ string_dtype = pd.StringDtype(string_storage)
text = """a,b,c,d,e,f,g,h,i
x,1,4.0,x,2,4.0,,True,False
@@ -386,10 +374,10 @@ def test_read_clipboard_dtype_backend(
expected = DataFrame(
{
- "a": string_array,
+ "a": Series(["x", "y"], dtype=string_dtype),
"b": Series([1, 2], dtype="Int64"),
"c": Series([4.0, 5.0], dtype="Float64"),
- "d": string_array_na,
+ "d": Series(["x", None], dtype=string_dtype),
"e": Series([2, NA], dtype="Int64"),
"f": Series([4.0, NA], dtype="Float64"),
"g": Series([NA, NA], dtype="Int64"),
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 5aa8f1c69fe44..6dd4368f09cc8 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -9,10 +9,6 @@
import pandas as pd
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.io.feather_format import read_feather, to_feather # isort:skip
@@ -184,25 +180,17 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
}
)
- if string_storage == "python":
- string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
- string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
-
- elif dtype_backend == "pyarrow":
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
-
- else:
- string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
with tm.ensure_clean() as path:
to_feather(df, path)
with pd.option_context("mode.string_storage", string_storage):
result = read_feather(path, dtype_backend=dtype_backend)
+ if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
+ string_dtype = pd.ArrowDtype(pa.string())
+ else:
+ string_dtype = pd.StringDtype(string_storage)
+
expected = pd.DataFrame(
{
"a": pd.Series([1, np.nan, 3], dtype="Int64"),
@@ -211,8 +199,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
"d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": pd.Series([True, False, pd.NA], dtype="boolean"),
"f": pd.Series([True, False, True], dtype="boolean"),
- "g": string_array,
- "h": string_array_na,
+ "g": pd.Series(["a", "b", "c"], dtype=string_dtype),
+ "h": pd.Series(["a", "b", None], dtype=string_dtype),
}
)
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 164646aedf464..73e9933e3681b 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -13,8 +13,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.compat import is_platform_windows
import pandas.util._test_decorators as td
@@ -31,17 +29,9 @@
to_datetime,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.io.common import file_path_to_url
-pytestmark = pytest.mark.xfail(
- using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
@pytest.fixture(
params=[
@@ -156,7 +146,7 @@ def test_to_html_compat(self, flavor_read_html):
df = (
DataFrame(
np.random.default_rng(2).random((4, 3)),
- columns=pd.Index(list("abc"), dtype=object),
+ columns=pd.Index(list("abc")),
)
.map("{:.3f}".format)
.astype(float)
@@ -182,24 +172,16 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
}
)
- if string_storage == "python":
- string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
- string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
- elif dtype_backend == "pyarrow":
- pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
- else:
- pa = pytest.importorskip("pyarrow")
- string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
out = df.to_html(index=False)
with pd.option_context("mode.string_storage", string_storage):
result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0]
+ if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
+ string_dtype = pd.ArrowDtype(pa.string())
+ else:
+ string_dtype = pd.StringDtype(string_storage)
+
expected = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -208,8 +190,8 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
- "g": string_array,
- "h": string_array_na,
+ "g": Series(["a", "b", "c"], dtype=string_dtype),
+ "h": Series(["a", "b", None], dtype=string_dtype),
}
)
@@ -225,7 +207,9 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
}
)
- tm.assert_frame_equal(result, expected)
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that for checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
@pytest.mark.network
@pytest.mark.single_cpu
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 561c718ea5851..ec087eab0cf14 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -17,6 +17,7 @@
pa_version_under13p0,
pa_version_under15p0,
pa_version_under17p0,
+ pa_version_under18p0,
)
import pandas as pd
@@ -955,11 +956,16 @@ def test_timestamp_nanoseconds(self, pa):
def test_timezone_aware_index(self, request, pa, timezone_aware_date_list):
pytest.importorskip("pyarrow", "11.0.0")
- if timezone_aware_date_list.tzinfo != datetime.timezone.utc:
+ if (
+ timezone_aware_date_list.tzinfo != datetime.timezone.utc
+ and pa_version_under18p0
+ ):
request.applymarker(
pytest.mark.xfail(
- reason="temporary skip this test until it is properly resolved: "
- "https://github.com/pandas-dev/pandas/issues/37286"
+ reason=(
+ "pyarrow returns pytz.FixedOffset while pandas "
+ "constructs datetime.timezone https://github.com/pandas-dev/pandas/issues/37286"
+ )
)
)
idx = 5 * [timezone_aware_date_list]
@@ -1172,6 +1178,10 @@ def test_duplicate_columns(self, fp):
msg = "Cannot create parquet dataset with duplicate column names"
self.check_error_on_write(df, fp, ValueError, msg)
+ @pytest.mark.xfail(
+ Version(np.__version__) >= Version("2.0.0"),
+ reason="fastparquet uses np.float_ in numpy2",
+ )
def test_bool_with_none(self, fp):
df = pd.DataFrame({"a": [True, None, False]})
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index a21893f66722a..980c88f070b89 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -39,10 +39,6 @@
to_timedelta,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.util.version import Version
from pandas.io import sql
@@ -3661,24 +3657,13 @@ def dtype_backend_data() -> DataFrame:
@pytest.fixture
def dtype_backend_expected():
- def func(storage, dtype_backend, conn_name) -> DataFrame:
- string_array: StringArray | ArrowStringArray
- string_array_na: StringArray | ArrowStringArray
- if storage == "python":
- string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
- string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
-
- elif dtype_backend == "pyarrow":
+ def func(string_storage, dtype_backend, conn_name) -> DataFrame:
+ string_dtype: pd.StringDtype | pd.ArrowDtype
+ if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment]
- string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment]
-
+ string_dtype = pd.ArrowDtype(pa.string())
else:
- pa = pytest.importorskip("pyarrow")
- string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
+ string_dtype = pd.StringDtype(string_storage)
df = DataFrame(
{
@@ -3688,8 +3673,8 @@ def func(storage, dtype_backend, conn_name) -> DataFrame:
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, pd.NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
- "g": string_array,
- "h": string_array_na,
+ "g": Series(["a", "b", "c"], dtype=string_dtype),
+ "h": Series(["a", "b", None], dtype=string_dtype),
}
)
if dtype_backend == "pyarrow":
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 036a5d6265dd7..5c07a56c9fb3f 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -14,8 +14,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.compat import WASM
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
@@ -31,11 +29,6 @@
Series,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
-from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
from pandas.io.common import get_handle
from pandas.io.xml import read_xml
@@ -1992,7 +1985,6 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so):
tm.assert_frame_equal(df_lxml, df_etree)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_read_xml_nullable_dtypes(
parser, string_storage, dtype_backend, using_infer_string
):
@@ -2023,36 +2015,21 @@ def test_read_xml_nullable_dtypes(
"""
- if using_infer_string:
- pa = pytest.importorskip("pyarrow")
- string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"]))
- string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None]))
-
- elif string_storage == "python":
- string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
- string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
+ with pd.option_context("mode.string_storage", string_storage):
+ result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend)
- elif dtype_backend == "pyarrow":
+ if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["x", "y"]))
- string_array_na = ArrowExtensionArray(pa.array(["x", None]))
-
+ string_dtype = pd.ArrowDtype(pa.string())
else:
- pa = pytest.importorskip("pyarrow")
- string_array = ArrowStringArray(pa.array(["x", "y"]))
- string_array_na = ArrowStringArray(pa.array(["x", None]))
-
- with pd.option_context("mode.string_storage", string_storage):
- result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend)
+ string_dtype = pd.StringDtype(string_storage)
expected = DataFrame(
{
- "a": string_array,
+ "a": Series(["x", "y"], dtype=string_dtype),
"b": Series([1, 2], dtype="Int64"),
"c": Series([4.0, 5.0], dtype="Float64"),
- "d": string_array_na,
+ "d": Series(["x", None], dtype=string_dtype),
"e": Series([2, NA], dtype="Int64"),
"f": Series([4.0, NA], dtype="Float64"),
"g": Series([NA, NA], dtype="Int64"),
@@ -2073,7 +2050,9 @@ def test_read_xml_nullable_dtypes(
)
expected["g"] = ArrowExtensionArray(pa.array([None, None]))
- tm.assert_frame_equal(result, expected)
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that for checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
def test_invalid_dtype_backend():
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
index b381c4fce8430..b39f953da1ee6 100644
--- a/pandas/tests/plotting/frame/test_frame.py
+++ b/pandas/tests/plotting/frame/test_frame.py
@@ -45,6 +45,7 @@
_check_visible,
get_y_axis,
)
+from pandas.util.version import Version
from pandas.io.formats.printing import pprint_thing
@@ -2465,8 +2466,14 @@ def test_group_subplot_invalid_column_name(self):
d = {"a": np.arange(10), "b": np.arange(10)}
df = DataFrame(d)
- with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"):
- df.plot(subplots=[("a", "bad_name")])
+ if Version(np.__version__) < Version("2.0.0"):
+ with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"):
+ df.plot(subplots=[("a", "bad_name")])
+ else:
+ with pytest.raises(
+ ValueError, match=r"Column label\(s\) \[np\.str\_\('bad_name'\)\]"
+ ):
+ df.plot(subplots=[("a", "bad_name")])
def test_group_subplot_duplicated_column(self):
d = {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 66799732be064..26fecef6ed0e6 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -9,6 +9,8 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
import pandas as pd
from pandas import (
Categorical,
@@ -1204,6 +1206,9 @@ def test_idxminmax_object_dtype(self, using_infer_string):
with pytest.raises(TypeError, match=msg):
ser3.idxmin(skipna=False)
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_idxminmax_object_frame(self):
# GH#4279
df = DataFrame([["zimm", 2.5], ["biff", 1.0], ["bid", 12.0]])
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index b2caa1fadd1a5..8af224f1ad64f 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -939,3 +939,14 @@ def test_concat_with_series_and_frame_returns_rangeindex_columns():
result = concat([ser, df])
expected = DataFrame([0, 1, 2], index=[0, 0, 1])
tm.assert_frame_equal(result, expected, check_column_type=True)
+
+
+def test_concat_with_moot_ignore_index_and_keys():
+ df1 = DataFrame([[0]])
+ df2 = DataFrame([[42]])
+
+ ignore_index = True
+ keys = ["df1", "df2"]
+ msg = f"Cannot set {ignore_index=} and specify keys. Either should be used."
+ with pytest.raises(ValueError, match=msg):
+ concat([df1, df2], keys=keys, ignore_index=ignore_index)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index ad704d87a491b..cbee85f4aede9 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -2998,3 +2998,15 @@ def test_merge_datetime_and_timedelta(how):
)
with pytest.raises(ValueError, match=re.escape(msg)):
right.merge(left, on="key", how=how)
+
+
+def test_merge_on_all_nan_column():
+ # GH#59421
+ left = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6]})
+ right = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "zz": [4, 5, 6]})
+ result = left.merge(right, on=["x", "y"], how="outer")
+    # Should not trigger array bounds error with bounds checking or asan enabled.
+ expected = DataFrame(
+ {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]}
+ )
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py
index 5f769db7f8acf..b2e9f26e1c407 100644
--- a/pandas/tests/reshape/test_qcut.py
+++ b/pandas/tests/reshape/test_qcut.py
@@ -307,3 +307,15 @@ def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
expected = qcut(arr.astype(float), q)
tm.assert_categorical_equal(result, expected)
+
+
+@pytest.mark.parametrize("scale", [1.0, 1 / 3, 17.0])
+@pytest.mark.parametrize("q", [3, 7, 9])
+@pytest.mark.parametrize("precision", [1, 3, 16])
+def test_qcut_contains(scale, q, precision):
+ # GH-59355
+ arr = (scale * np.arange(q + 1)).round(precision)
+ result = qcut(arr, q, precision=precision)
+
+ for value, bucket in zip(arr, result):
+ assert value in bucket
diff --git a/pandas/tests/scalar/timestamp/methods/test_round.py b/pandas/tests/scalar/timestamp/methods/test_round.py
index 2fb0e1a8d3397..944aa55727217 100644
--- a/pandas/tests/scalar/timestamp/methods/test_round.py
+++ b/pandas/tests/scalar/timestamp/methods/test_round.py
@@ -4,7 +4,6 @@
)
import numpy as np
import pytest
-import pytz
from pandas._libs import lib
from pandas._libs.tslibs import (
@@ -182,7 +181,7 @@ def test_round_dst_border_ambiguous(self, method, unit):
assert result is NaT
msg = "Cannot infer dst time"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
getattr(ts, method)("h", ambiguous="raise")
@pytest.mark.parametrize(
@@ -205,7 +204,7 @@ def test_round_dst_border_nonexistent(self, method, ts_str, freq, unit):
assert result is NaT
msg = "2018-03-11 02:00:00"
- with pytest.raises(pytz.NonExistentTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
getattr(ts, method)(freq, nonexistent="raise")
@pytest.mark.parametrize(
diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py
index 90dc8d77608cb..cb7ac5fa6f1da 100644
--- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py
+++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py
@@ -4,11 +4,6 @@
from dateutil.tz import gettz
import pytest
-import pytz
-from pytz.exceptions import (
- AmbiguousTimeError,
- NonExistentTimeError,
-)
from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
from pandas.errors import OutOfBoundsDatetime
@@ -54,13 +49,14 @@ def test_tz_localize_ambiguous_bool(self, unit, tz):
# make sure that we are correctly accepting bool values as ambiguous
# GH#14402
if isinstance(tz, str) and tz.startswith("pytz/"):
+ pytz = pytest.importorskip("pytz")
tz = pytz.timezone(tz.removeprefix("pytz/"))
ts = Timestamp("2015-11-01 01:00:03").as_unit(unit)
expected0 = Timestamp("2015-11-01 01:00:03-0500", tz=tz)
expected1 = Timestamp("2015-11-01 01:00:03-0600", tz=tz)
msg = "Cannot infer dst time from 2015-11-01 01:00:03"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
ts.tz_localize(tz)
result = ts.tz_localize(tz, ambiguous=True)
@@ -105,10 +101,10 @@ def test_tz_localize_ambiguous(self):
def test_tz_localize_nonexistent(self, stamp, tz):
# GH#13057
ts = Timestamp(stamp)
- with pytest.raises(NonExistentTimeError, match=stamp):
+ with pytest.raises(ValueError, match=stamp):
ts.tz_localize(tz)
# GH 22644
- with pytest.raises(NonExistentTimeError, match=stamp):
+ with pytest.raises(ValueError, match=stamp):
ts.tz_localize(tz, nonexistent="raise")
assert ts.tz_localize(tz, nonexistent="NaT") is NaT
@@ -154,7 +150,7 @@ def test_tz_localize_ambiguous_raise(self):
# GH#13057
ts = Timestamp("2015-11-1 01:00")
msg = "Cannot infer dst time from 2015-11-01 01:00:00,"
- with pytest.raises(AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
ts.tz_localize("US/Pacific", ambiguous="raise")
def test_tz_localize_nonexistent_invalid_arg(self, warsaw):
@@ -330,7 +326,7 @@ def test_timestamp_tz_localize_nonexistent_raise(self, warsaw, unit):
tz = warsaw
ts = Timestamp("2015-03-29 02:20:00").as_unit(unit)
msg = "2015-03-29 02:20:00"
- with pytest.raises(pytz.NonExistentTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
ts.tz_localize(tz, nonexistent="raise")
msg = (
"The nonexistent argument must be one of 'raise', 'NaT', "
diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py
index 39f302c3357de..2c97c4a32e0aa 100644
--- a/pandas/tests/scalar/timestamp/test_constructors.py
+++ b/pandas/tests/scalar/timestamp/test_constructors.py
@@ -15,7 +15,6 @@
)
import numpy as np
import pytest
-import pytz
from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
from pandas.errors import OutOfBoundsDatetime
@@ -747,7 +746,7 @@ def test_constructor_tz_or_tzinfo(self):
tz="UTC",
),
Timestamp(2000, 1, 2, 3, 4, 5, 6, None, nanosecond=1),
- Timestamp(2000, 1, 2, 3, 4, 5, 6, tz=pytz.UTC, nanosecond=1),
+ Timestamp(2000, 1, 2, 3, 4, 5, 6, tz=timezone.utc, nanosecond=1),
],
)
def test_constructor_nanosecond(self, result):
@@ -904,7 +903,7 @@ def test_raise_tz_and_tzinfo_in_datetime_input(self, box):
Timestamp(box(**kwargs), tz="US/Pacific")
msg = "Cannot pass a datetime or Timestamp"
with pytest.raises(ValueError, match=msg):
- Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific"))
+ Timestamp(box(**kwargs), tzinfo=zoneinfo.ZoneInfo("US/Pacific"))
def test_dont_convert_dateutil_utc_to_default_utc(self):
result = Timestamp(datetime(2018, 1, 1), tz=tzutc())
@@ -948,7 +947,7 @@ def test_timestamp_constructor_near_dst_boundary(self):
assert result == expected
msg = "Cannot infer dst time from 2015-10-25 02:00:00"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
Timestamp("2015-10-25 02:00", tz=tz)
result = Timestamp("2017-03-26 01:00", tz="Europe/Paris")
@@ -956,7 +955,7 @@ def test_timestamp_constructor_near_dst_boundary(self):
assert result == expected
msg = "2017-03-26 02:00"
- with pytest.raises(pytz.NonExistentTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
Timestamp("2017-03-26 02:00", tz="Europe/Paris")
# GH#11708
@@ -975,7 +974,7 @@ def test_timestamp_constructor_near_dst_boundary(self):
assert result == expected
msg = "2017-03-26 02:00"
- with pytest.raises(pytz.NonExistentTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
Timestamp("2017-03-26 02:00", tz="Europe/Paris")
result = Timestamp("2017-03-26 02:00:00+0100", tz="Europe/Paris")
diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py
index 3e1ece6b7f59e..9b9a8ea3600ae 100644
--- a/pandas/tests/series/accessors/test_dt_accessor.py
+++ b/pandas/tests/series/accessors/test_dt_accessor.py
@@ -9,7 +9,6 @@
import numpy as np
import pytest
-import pytz
from pandas._config import using_string_dtype
@@ -28,6 +27,7 @@
Period,
PeriodIndex,
Series,
+ StringDtype,
TimedeltaIndex,
date_range,
period_range,
@@ -352,7 +352,7 @@ def test_dt_round_tz_ambiguous(self, method):
tm.assert_series_equal(result, expected)
# raise
- with tm.external_error_raised(pytz.AmbiguousTimeError):
+ with tm.external_error_raised(ValueError):
getattr(df1.date.dt, method)("h", ambiguous="raise")
@pytest.mark.parametrize(
@@ -374,7 +374,7 @@ def test_dt_round_tz_nonexistent(self, method, ts_str, freq):
expected = Series([pd.NaT]).dt.tz_localize(result.dt.tz)
tm.assert_series_equal(result, expected)
- with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"):
+ with pytest.raises(ValueError, match="2018-03-11 02:00:00"):
getattr(ser.dt, method)(freq, nonexistent="raise")
@pytest.mark.parametrize("freq", ["ns", "us", "1000us"])
@@ -514,7 +514,6 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale):
ser = pd.concat([ser, Series([pd.NaT])])
assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1])
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_strftime(self):
# GH 10086
ser = Series(date_range("20130101", periods=5))
@@ -585,10 +584,9 @@ def test_strftime_period_days(self, using_infer_string):
dtype="=U10",
)
if using_infer_string:
- expected = expected.astype("str")
+ expected = expected.astype(StringDtype(na_value=np.nan))
tm.assert_index_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_strftime_dt64_microsecond_resolution(self):
ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)])
result = ser.dt.strftime("%Y-%m-%d %H:%M:%S")
@@ -621,7 +619,6 @@ def test_strftime_period_minutes(self):
)
tm.assert_series_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"data",
[
@@ -644,7 +641,7 @@ def test_strftime_all_nat(self, data):
ser = Series(data)
with tm.assert_produces_warning(None):
result = ser.dt.strftime("%Y-%m-%d")
- expected = Series([np.nan], dtype=object)
+ expected = Series([np.nan], dtype="str")
tm.assert_series_equal(result, expected)
def test_valid_dt_with_missing_values(self):
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
index 742091d761d62..71ba2dab671ef 100644
--- a/pandas/tests/series/indexing/test_setitem.py
+++ b/pandas/tests/series/indexing/test_setitem.py
@@ -10,6 +10,7 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import IndexingError
@@ -822,6 +823,11 @@ def test_mask_key(self, obj, key, expected, raises, val, indexer_sli):
else:
indexer_sli(obj)[mask] = val
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW,
+ reason="TODO(infer_string)",
+ strict=False,
+ )
def test_series_where(self, obj, key, expected, raises, val, is_inplace):
mask = np.zeros(obj.shape, dtype=bool)
mask[key] = True
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
index 4a8af259b4134..90c4056a39e84 100644
--- a/pandas/tests/series/methods/test_convert_dtypes.py
+++ b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas._libs import lib
import pandas as pd
@@ -12,7 +10,6 @@
class TestSeriesConvertDtypes:
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"data, maindtype, expected_default, expected_other",
[
@@ -223,9 +220,9 @@ def test_convert_dtypes(
and params[0]
and not params[1]
):
- # If we would convert with convert strings then infer_objects converts
- # with the option
- expected_dtype = "string[pyarrow_numpy]"
+ # If convert_string=False and infer_objects=True, we end up with the
+ # default string dtype instead of preserving object for string data
+ expected_dtype = pd.StringDtype(na_value=np.nan)
expected = pd.Series(data, dtype=expected_dtype)
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_tz_localize.py b/pandas/tests/series/methods/test_tz_localize.py
index 45620a721f442..53288e8a1f8e7 100644
--- a/pandas/tests/series/methods/test_tz_localize.py
+++ b/pandas/tests/series/methods/test_tz_localize.py
@@ -1,7 +1,6 @@
from datetime import timezone
import pytest
-import pytz
from pandas._libs.tslibs import timezones
@@ -28,7 +27,7 @@ def test_series_tz_localize_ambiguous_bool(self):
expected0 = Series([expected0])
expected1 = Series([expected1])
- with tm.external_error_raised(pytz.AmbiguousTimeError):
+ with tm.external_error_raised(ValueError):
ser.dt.tz_localize("US/Central")
result = ser.dt.tz_localize("US/Central", ambiguous=True)
@@ -79,11 +78,11 @@ def test_tz_localize_nonexistent(self, warsaw, method, exp, unit):
df = ser.to_frame()
if method == "raise":
- with tm.external_error_raised(pytz.NonExistentTimeError):
+ with tm.external_error_raised(ValueError):
dti.tz_localize(tz, nonexistent=method)
- with tm.external_error_raised(pytz.NonExistentTimeError):
+ with tm.external_error_raised(ValueError):
ser.tz_localize(tz, nonexistent=method)
- with tm.external_error_raised(pytz.NonExistentTimeError):
+ with tm.external_error_raised(ValueError):
df.tz_localize(tz, nonexistent=method)
elif exp == "invalid":
diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py
index a63ffbbd3a5a1..79a55eb357f87 100644
--- a/pandas/tests/series/test_api.py
+++ b/pandas/tests/series/test_api.py
@@ -4,6 +4,10 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
import pandas as pd
from pandas import (
DataFrame,
@@ -160,6 +164,9 @@ def test_attrs(self):
result = s + 1
assert result.attrs == {"version": 1}
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_inspect_getmembers(self):
# GH38782
pytest.importorskip("jinja2")
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py
index 939bf888fd61b..baed3ba936699 100644
--- a/pandas/tests/series/test_logical_ops.py
+++ b/pandas/tests/series/test_logical_ops.py
@@ -6,7 +6,10 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
+
from pandas import (
+ ArrowDtype,
DataFrame,
Index,
Series,
@@ -143,6 +146,9 @@ def test_logical_operators_int_dtype_with_bool(self):
expected = Series([False, True, True, True])
tm.assert_series_equal(result, expected)
+ @pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+ )
def test_logical_operators_int_dtype_with_object(self, using_infer_string):
# GH#9016: support bitwise op for integer types
s_0123 = Series(range(4), dtype="int64")
@@ -518,18 +524,38 @@ def test_int_dtype_different_index_not_bool(self):
result = ser1 ^ ser2
tm.assert_series_equal(result, expected)
+ # TODO: this belongs in comparison tests
def test_pyarrow_numpy_string_invalid(self):
# GH#56008
- pytest.importorskip("pyarrow")
+ pa = pytest.importorskip("pyarrow")
ser = Series([False, True])
ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]")
result = ser == ser2
- expected = Series(False, index=ser.index)
- tm.assert_series_equal(result, expected)
+ expected_eq = Series(False, index=ser.index)
+ tm.assert_series_equal(result, expected_eq)
result = ser != ser2
- expected = Series(True, index=ser.index)
- tm.assert_series_equal(result, expected)
+ expected_ne = Series(True, index=ser.index)
+ tm.assert_series_equal(result, expected_ne)
with pytest.raises(TypeError, match="Invalid comparison"):
ser > ser2
+
+ # GH#59505
+ ser3 = ser2.astype("string[pyarrow]")
+ result3_eq = ser3 == ser
+ tm.assert_series_equal(result3_eq, expected_eq.astype("bool[pyarrow]"))
+ result3_ne = ser3 != ser
+ tm.assert_series_equal(result3_ne, expected_ne.astype("bool[pyarrow]"))
+
+ with pytest.raises(TypeError, match="Invalid comparison"):
+ ser > ser3
+
+ ser4 = ser2.astype(ArrowDtype(pa.string()))
+ result4_eq = ser4 == ser
+ tm.assert_series_equal(result4_eq, expected_eq.astype("bool[pyarrow]"))
+ result4_ne = ser4 != ser
+ tm.assert_series_equal(result4_ne, expected_ne.astype("bool[pyarrow]"))
+
+ with pytest.raises(TypeError, match="Invalid comparison"):
+ ser > ser4
diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py
index 0bc3092d30b43..7bbb902e14a36 100644
--- a/pandas/tests/series/test_reductions.py
+++ b/pandas/tests/series/test_reductions.py
@@ -1,6 +1,10 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
import pandas as pd
from pandas import Series
import pandas._testing as tm
@@ -162,6 +166,9 @@ def test_validate_stat_keepdims():
np.sum(ser, keepdims=True)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
def test_mean_with_convertible_string_raises(using_infer_string):
# GH#44008
ser = Series(["1", "2"])
@@ -181,6 +188,9 @@ def test_mean_with_convertible_string_raises(using_infer_string):
df.mean()
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
def test_mean_dont_convert_j_to_complex():
# GH#36703
df = pd.DataFrame([{"db": "J", "numeric": 123}])
@@ -199,6 +209,9 @@ def test_mean_dont_convert_j_to_complex():
np.mean(df["db"].astype("string").array)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
def test_median_with_convertible_string_raises():
# GH#34671 this _could_ return a string "2", but definitely not float 2.0
msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support"
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index ee26fdae74960..18df76ddd8ed8 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -218,7 +218,7 @@ def test_missing_required_dependency():
subprocess.check_output(call, stderr=subprocess.STDOUT)
output = exc.value.stdout.decode()
- for name in ["numpy", "pytz", "dateutil"]:
+ for name in ["numpy", "dateutil"]:
assert name in output
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index 2a225bda953cf..869d41efa6c28 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -408,6 +408,13 @@ def test_codes_out_of_bound(self):
tm.assert_numpy_array_equal(result, expected)
tm.assert_numpy_array_equal(result_codes, expected_codes)
+ @pytest.mark.parametrize("codes", [[-1, -1], [2, -1], [2, 2]])
+ def test_codes_empty_array_out_of_bound(self, codes):
+ empty_values = np.array([])
+ expected_codes = -np.ones_like(codes, dtype=np.intp)
+ _, result_codes = safe_sort(empty_values, codes)
+ tm.assert_numpy_array_equal(result_codes, expected_codes)
+
def test_mixed_integer(self):
values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object)
result = safe_sort(values)
diff --git a/pandas/tests/tseries/offsets/test_dst.py b/pandas/tests/tseries/offsets/test_dst.py
index dfdc69c0fe18e..e75958843040d 100644
--- a/pandas/tests/tseries/offsets/test_dst.py
+++ b/pandas/tests/tseries/offsets/test_dst.py
@@ -108,13 +108,13 @@ def _test_offset(
"second": "2013-11-03 01:59:01.999999",
"microsecond": "2013-11-03 01:59:59.000001",
}[offset_name]
- with pytest.raises(pytz.AmbiguousTimeError, match=err_msg):
+ with pytest.raises(ValueError, match=err_msg):
tstart + offset
# While we're here, let's check that we get the same behavior in a
# vectorized path
dti = DatetimeIndex([tstart])
warn_msg = "Non-vectorized DateOffset"
- with pytest.raises(pytz.AmbiguousTimeError, match=err_msg):
+ with pytest.raises(ValueError, match=err_msg):
with tm.assert_produces_warning(performance_warning, match=warn_msg):
dti + offset
return
@@ -256,10 +256,10 @@ def test_all_offset_classes(self, tup):
],
)
def test_nontick_offset_with_ambiguous_time_error(original_dt, target_dt, offset, tz):
- # .apply for non-Tick offsets throws AmbiguousTimeError when the target dt
+ # .apply for non-Tick offsets throws ValueError when the target dt
# is dst-ambiguous
- localized_dt = original_dt.tz_localize(pytz.timezone(tz))
+ localized_dt = original_dt.tz_localize(tz)
msg = f"Cannot infer dst time from {target_dt}, try using the 'ambiguous' argument"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
localized_dt + offset
diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py
index 99a6a583dd3e9..943434e515828 100644
--- a/pandas/tests/tseries/offsets/test_offsets_properties.py
+++ b/pandas/tests/tseries/offsets/test_offsets_properties.py
@@ -13,7 +13,6 @@
given,
)
import pytest
-import pytz
import pandas as pd
from pandas._testing._hypothesis import (
@@ -34,11 +33,11 @@ def test_on_offset_implementations(dt, offset):
# (dt + offset) - offset == dt
try:
compare = (dt + offset) - offset
- except (pytz.NonExistentTimeError, pytz.AmbiguousTimeError):
+ except ValueError:
# When dt + offset does not exist or is DST-ambiguous, assume(False) to
# indicate to hypothesis that this is not a valid test case
# DST-ambiguous example (GH41906):
- # dt = datetime.datetime(1900, 1, 1, tzinfo=pytz.timezone('Africa/Kinshasa'))
+ # dt = datetime.datetime(1900, 1, 1, tzinfo=ZoneInfo('Africa/Kinshasa'))
# offset = MonthBegin(66)
assume(False)
diff --git a/pandas/tests/tslibs/test_tzconversion.py b/pandas/tests/tslibs/test_tzconversion.py
index c1a56ffb71b02..f32829b4e0b21 100644
--- a/pandas/tests/tslibs/test_tzconversion.py
+++ b/pandas/tests/tslibs/test_tzconversion.py
@@ -1,6 +1,7 @@
+import zoneinfo
+
import numpy as np
import pytest
-import pytz
from pandas._libs.tslibs.tzconversion import tz_localize_to_utc
@@ -11,13 +12,15 @@ def test_tz_localize_to_utc_ambiguous_infer(self):
val = 1_320_541_200_000_000_000
vals = np.array([val, val - 1, val], dtype=np.int64)
- with pytest.raises(pytz.AmbiguousTimeError, match="2011-11-06 01:00:00"):
- tz_localize_to_utc(vals, pytz.timezone("US/Eastern"), ambiguous="infer")
+ with pytest.raises(ValueError, match="2011-11-06 01:00:00"):
+ tz_localize_to_utc(vals, zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer")
- with pytest.raises(pytz.AmbiguousTimeError, match="are no repeated times"):
- tz_localize_to_utc(vals[:1], pytz.timezone("US/Eastern"), ambiguous="infer")
+ with pytest.raises(ValueError, match="are no repeated times"):
+ tz_localize_to_utc(
+ vals[:1], zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer"
+ )
vals[1] += 1
msg = "There are 2 dst switches when there should only be 1"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
- tz_localize_to_utc(vals, pytz.timezone("US/Eastern"), ambiguous="infer")
+ with pytest.raises(ValueError, match=msg):
+ tz_localize_to_utc(vals, zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer")
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index af3194b5085c4..17b92427f0d5d 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -6,7 +6,10 @@
import numpy as np
import pytest
+from pandas._config import using_string_dtype
+
from pandas.compat import (
+ HAS_PYARROW,
IS64,
is_platform_arm,
is_platform_power,
@@ -1326,6 +1329,9 @@ def test_rolling_corr_timedelta_index(index, window):
tm.assert_almost_equal(result, expected)
+@pytest.mark.xfail(
+ using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
def test_groupby_rolling_nan_included():
# GH 35542
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py
index 7e18ebe40cfa8..bd20660bdbba6 100644
--- a/pandas/util/_print_versions.py
+++ b/pandas/util/_print_versions.py
@@ -67,7 +67,6 @@ def _get_dependency_info() -> dict[str, JSONSerializable]:
"pandas",
# required
"numpy",
- "pytz",
"dateutil",
# install / build,
"pip",
diff --git a/pyproject.toml b/pyproject.toml
index cc5cc1cf84d0c..645ded35f3d18 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,6 @@ dependencies = [
"numpy>=1.23.5; python_version<'3.12'",
"numpy>=1.26.0; python_version>='3.12'",
"python-dateutil>=2.8.2",
- "pytz>=2020.1",
"tzdata>=2022.7"
]
classifiers = [
@@ -81,6 +80,7 @@ plot = ['matplotlib>=3.6.3']
output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.9.0']
clipboard = ['PyQt5>=5.15.9', 'qtpy>=2.3.0']
compression = ['zstandard>=0.19.0']
+timezone = ['pytz>=2023.4']
all = ['adbc-driver-postgresql>=0.10.0',
'adbc-driver-sqlite>=0.8.0',
'beautifulsoup4>=4.11.2',
@@ -107,6 +107,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
'pytest>=7.3.2',
'pytest-xdist>=3.4.0',
'python-calamine>=0.1.7',
+ 'pytz>=2023.4',
'pyxlsb>=1.0.10',
'qtpy>=2.3.0',
'scipy>=1.10.0',
diff --git a/requirements-dev.txt b/requirements-dev.txt
index dbfd7c6bf7bf5..52d2553fc4001 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -15,7 +15,6 @@ PyQt5>=5.15.9
coverage
python-dateutil
numpy<2
-pytz
beautifulsoup4>=4.11.2
blosc
bottleneck>=1.3.6
@@ -39,6 +38,7 @@ pymysql>=1.0.2
pyreadstat>=1.2.0
tables>=3.8.0
python-calamine>=0.1.7
+pytz>=2023.4
pyxlsb>=1.0.10
s3fs>=2022.11.0
scipy>=1.10.0
diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html
index aa4bfc92ce8a8..4c66f28818abd 100644
--- a/web/pandas/_templates/layout.html
+++ b/web/pandas/_templates/layout.html
@@ -73,8 +73,8 @@
-
-
+
+
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 49ece5564c300..c14996211bb8b 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -360,6 +360,13 @@ Deltalake python package lets you access tables stored in
JVM. It provides the ``delta_table.to_pyarrow_table().to_pandas()`` method to convert
any Delta table into Pandas dataframe.
+### [pandas-gbq](https://github.com/googleapis/python-bigquery-pandas)
+
+pandas-gbq provides high performance reads and writes to and from
+[Google BigQuery](https://cloud.google.com/bigquery/). Previously (before version 2.2.0),
+these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`.
+Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq` instead.
+
## Out-of-core
### [Bodo](https://bodo.ai/)
@@ -513,6 +520,13 @@ Arrays](https://awkward-array.org/) inside pandas' Series and
DataFrame. It also provides an accessor for using awkward functions
on Series that are of awkward type.
+### [db-dtypes](https://github.com/googleapis/python-db-dtypes-pandas)
+
+db-dtypes provides extension types for working with types like
+DATE, TIME, and JSON from database systems. This package is used
+by pandas-gbq to provide natural dtypes for BigQuery data types without
+a natural numpy type.
+
### [Pandas-Genomics](https://pandas-genomics.readthedocs.io/en/latest/)
Pandas-Genomics provides an extension type and extension array for working