diff --git a/.circleci/config.yml b/.circleci/config.yml index 27b6829dcda70..9c986e5b1b054 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -92,7 +92,13 @@ jobs: no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | pip3 install cibuildwheel==2.20.0 - cibuildwheel --output-dir wheelhouse + if [[ $CIBW_BUILD == cp313t* ]]; then + # TODO: temporarily run 3.13 free threaded builds without build isolation + # since we need pre-release cython + CIBW_BUILD_FRONTEND="pip; args: --no-build-isolation" cibuildwheel --output-dir wheelhouse + else + cibuildwheel --output-dir wheelhouse + fi environment: CIBW_BUILD: << parameters.cibw-build >> @@ -141,6 +147,10 @@ workflows: cibw-build: ["cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64", + "cp313-manylinux_aarch64", + "cp313t-manylinux_aarch64", "cp310-musllinux_aarch64", "cp311-musllinux_aarch64", - "cp312-musllinux_aarch64",] + "cp312-musllinux_aarch64", + "cp313-musllinux_aarch64", + "cp313t-musllinux_aarch64"] diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 3eb68bdd2a15c..4fe901998cbcc 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -9,6 +9,8 @@ runs: - name: Install ${{ inputs.environment-file }} uses: mamba-org/setup-micromamba@v1 with: + # Pinning to avoid 2.0 failures + micromamba-version: '1.5.10-0' environment-file: ${{ inputs.environment-file }} environment-name: test condarc-file: ci/.condarc diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 7e9c056e75131..e1d2d1ea846b8 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 47b97fa57852a..908baa87815ab 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,13 +4,13 @@ on: push: branches: - main - - 2.2.x + - 2.3.x tags: - '*' pull_request: branches: - main - - 2.2.x + - 2.3.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 97f90c1588962..6748832903e30 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x types: [ labeled, opened, synchronize, reopened ] permissions: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d392c84be66fe..60b234d613a38 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x paths-ignore: - "doc/**" - "web/**" @@ -380,7 +380,7 @@ jobs: fetch-depth: 0 - name: Set up Python Free-threading Version - uses: deadsnakes/action@v3.1.0 + uses: deadsnakes/action@v3.2.0 with: python-version: 3.13-dev nogil: true diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 67d8715f72614..2aaec8c9b56b0 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -102,9 +102,7 @@ jobs: python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", 
"3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] include: # TODO: Remove this plus installing build deps in cibw_before_build.sh - # and test deps in cibw_before_test.sh after pandas can be built with a released NumPy/Cython - - python: ["cp313", "3.13"] - cibw_build_frontend: 'pip; args: --no-build-isolation' + # after pandas can be built with a released NumPy/Cython - python: ["cp313t", "3.13"] cibw_build_frontend: 'pip; args: --no-build-isolation' # Build Pyodide wheels and upload them to Anaconda.org @@ -187,11 +185,9 @@ jobs: - name: Test Windows Wheels if: ${{ matrix.buildplat[1] == 'win_amd64' }} shell: pwsh - # TODO: Remove NumPy nightly install when there's a 3.13 wheel on PyPI run: | $TST_CMD = @" python -m pip install hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0; - ${{ matrix.python[1] == '3.13' && 'python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy;' }} python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ diff --git a/MANIFEST.in b/MANIFEST.in index f586d457eaaf8..a7d7d7eb4e062 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -65,4 +65,3 @@ graft pandas/_libs/include # Include cibw script in sdist since it's needed for building wheels include scripts/cibw_before_build.sh -include scripts/cibw_before_test.sh diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7ed5103b3b796..16a3a22bc4876 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,14 +70,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.NA SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.PeriodDtype.freq SA01" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.RangeIndex.start SA01" \ - -i "pandas.RangeIndex.step SA01" \ - -i "pandas.RangeIndex.stop SA01" \ -i "pandas.Series.cat.add_categories PR01,PR02" \ -i "pandas.Series.cat.as_ordered PR01" \ -i "pandas.Series.cat.as_unordered PR01" \ @@ -92,10 +87,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ -i "pandas.Series.dt.round PR01,PR02" \ - -i "pandas.Series.dt.seconds SA01" \ -i "pandas.Series.dt.strftime PR01,PR02" \ -i "pandas.Series.dt.to_period PR01,PR02" \ -i "pandas.Series.dt.total_seconds PR01" \ @@ -103,97 +96,51 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.sparse.fill_value SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ - -i "pandas.Series.sparse.npoints SA01" \ - -i "pandas.Series.sparse.sp_values SA01" \ - -i "pandas.Timedelta.components SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ - -i "pandas.Timedelta.to_timedelta64 SA01" \ - -i "pandas.Timedelta.total_seconds SA01" \ - -i "pandas.TimedeltaIndex.nanoseconds SA01" \ - -i "pandas.TimedeltaIndex.seconds SA01" \ - -i "pandas.TimedeltaIndex.to_pytimedelta 
RT03,SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.min PR02" \ -i "pandas.Timestamp.nanosecond GL08" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ - -i "pandas.api.types.is_bool PR01,SA01" \ - -i "pandas.api.types.is_categorical_dtype SA01" \ - -i "pandas.api.types.is_complex PR01,SA01" \ - -i "pandas.api.types.is_complex_dtype SA01" \ - -i "pandas.api.types.is_datetime64_dtype SA01" \ - -i "pandas.api.types.is_datetime64_ns_dtype SA01" \ - -i "pandas.api.types.is_datetime64tz_dtype SA01" \ - -i "pandas.api.types.is_dict_like PR07,SA01" \ - -i "pandas.api.types.is_extension_array_dtype SA01" \ - -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ - -i "pandas.api.types.is_float_dtype SA01" \ - -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ - -i "pandas.api.types.is_int64_dtype SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ - -i "pandas.api.types.is_integer_dtype SA01" \ - -i "pandas.api.types.is_interval_dtype SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ - -i "pandas.api.types.is_list_like SA01" \ - -i "pandas.api.types.is_named_tuple PR07,SA01" \ - -i "pandas.api.types.is_object_dtype SA01" \ - -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.BooleanArray SA01" \ -i "pandas.arrays.DatetimeArray SA01" \ - -i "pandas.arrays.FloatingArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ - -i "pandas.arrays.IntervalArray.mid SA01" \ -i "pandas.arrays.IntervalArray.right SA01" \ -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.SparseArray PR07,SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.filter SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.max SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.min SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.filter PR01,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \ -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.max 
SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.min SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \ -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ - -i "pandas.core.resample.Resampler.ffill RT03" \ -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ -i "pandas.core.resample.Resampler.groups SA01" \ -i "pandas.core.resample.Resampler.indices SA01" \ @@ -208,24 +155,19 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.sum SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.date_range RT03" \ -i "pandas.errors.AttributeConflictWarning SA01" \ -i "pandas.errors.CSSWarning SA01" \ -i "pandas.errors.CategoricalConversionWarning SA01" \ -i "pandas.errors.ChainedAssignmentError SA01" \ - -i "pandas.errors.ClosedFileError SA01" \ -i "pandas.errors.DataError SA01" \ -i "pandas.errors.DuplicateLabelError SA01" \ - -i "pandas.errors.EmptyDataError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ -i "pandas.errors.InvalidVersion SA01" \ - -i "pandas.errors.MergeError SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ -i "pandas.errors.OptionError SA01" \ - -i "pandas.errors.OutOfBoundsDatetime SA01" \ -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ @@ -237,17 +179,14 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ - -i "pandas.io.stata.StataReader.data_label SA01" \ -i "pandas.io.stata.StataReader.value_labels RT03,SA01" \ -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ - -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ @@ -399,7 +338,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.SemiMonthEnd SA01" \ -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \ @@ -413,7 +351,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.Week.n GL08" \ -i "pandas.tseries.offsets.Week.normalize GL08" \ -i "pandas.tseries.offsets.Week.weekday GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth SA01" \ -i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.n GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \ diff 
--git a/doc/source/conf.py b/doc/source/conf.py index 77dd5d03d311c..ddbda0aa3bf65 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -254,7 +254,9 @@ "json_url": "https://pandas.pydata.org/versions.json", "version_match": switcher_version, }, - "show_version_warning_banner": True, + # This shows a warning for patch releases since the + # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be) + "show_version_warning_banner": False, "icon_links": [ { "name": "Mastodon", diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index fe5271dab7132..4d99f282aa695 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -305,15 +305,15 @@ It is important to periodically update your local ``main`` branch with updates f branch and update your development environment to reflect any changes to the various packages that are used during development. -If using :ref:`mamba `, run: +If using :ref:`conda `, run: .. code-block:: shell git checkout main git fetch upstream git merge upstream/main - mamba activate pandas-dev - mamba env update -f environment.yml --prune + conda activate pandas-dev + conda env update -f environment.yml --prune If using :ref:`pip ` , do: diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 9d5a992e911b6..670ffe6996302 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -244,7 +244,7 @@ in your python environment. .. warning:: - * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. + * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. .. _contributing.ci: diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 643021db7b823..1426d3a84a748 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -43,7 +43,7 @@ and consult the ``Linux`` instructions below. **macOS** -To use the :ref:`mamba `-based compilers, you will need to install the +To use the :ref:`conda `-based compilers, you will need to install the Developer Tools using ``xcode-select --install``. 
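The ``show_version_warning_banner`` change to ``doc/source/conf.py`` earlier in this patch is easier to see with a concrete sketch. The theme decides whether to warn by comparing the version the docs were built from against the matching ``version_match`` entry from ``versions.json``; under plain string equality every patch release (2.2.1 vs. 2.2.0) trips the warning, which is why the banner is disabled. A minimal illustration, assuming a plain equality check (the variable names are illustrative, not the theme's actual internals):

.. code-block:: python

    # versions.json advertises the 2.2 docs line under a single entry
    # (e.g. "2.2.0"), while the docs for a patch release build as "2.2.1".
    built_version = "2.2.1"  # version the docs were built from (illustrative)
    switcher_version = "2.2.0"  # entry in versions.json (illustrative)

    # Strict equality flags every patch release as "not the stable docs",
    # so the warning banner would show even on the current 2.2.x docs.
    print(built_version != switcher_version)  # True -> banner would show

    def major_minor(version: str) -> str:
        """Keep only the major.minor prefix, e.g. "2.2.1" -> "2.2"."""
        return ".".join(version.split(".")[:2])

    # Comparing only the major.minor prefix would avoid the false positive.
    print(major_minor(built_version) == major_minor(switcher_version))  # True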
If you prefer to use a different compiler, general information can be found here: @@ -51,9 +51,9 @@ https://devguide.python.org/setup/#macos **Linux** -For Linux-based :ref:`mamba ` installations, you won't have to install any -additional components outside of the mamba environment. The instructions -below are only needed if your setup isn't based on mamba environments. +For Linux-based :ref:`conda ` installations, you won't have to install any +additional components outside of the conda environment. The instructions +below are only needed if your setup isn't based on conda environments. Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system:: @@ -82,19 +82,18 @@ Before we begin, please: * Make sure that you have :any:`cloned the repository ` * ``cd`` to the pandas source directory you just created with the clone command -.. _contributing.mamba: +.. _contributing.conda: -Option 1: using mamba (recommended) +Option 1: using conda (recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Install miniforge to get `mamba `_ -* Make sure your mamba is up to date (``mamba update mamba``) -* Create and activate the ``pandas-dev`` mamba environment using the following commands: +* Install miniforge to get `conda `_ +* Create and activate the ``pandas-dev`` conda environment using the following commands: -.. code-block:: none +.. code-block:: bash - mamba env create --file environment.yml - mamba activate pandas-dev + conda env create --file environment.yml + conda activate pandas-dev .. _contributing.pip: diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 50d380cab1d50..1e4a851d0e72d 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -344,7 +344,7 @@ in the next places: - Git repo with a `new tag `_ - Source distribution in a `GitHub release `_ - Pip packages in the `PyPI `_ -- Conda/Mamba packages in `conda-forge `_ +- Conda packages in `conda-forge `_ The process for releasing a new version of pandas is detailed next section. diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index 36ed553d9d88e..a17699a71fbd3 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -17,8 +17,7 @@ Installation :columns: 12 12 6 6 :padding: 3 - pandas is part of the `Anaconda `__ - distribution and can be installed with Anaconda or Miniconda: + pandas can be installed via conda from `conda-forge `__. ++++++++++++++++++++++ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 8e6cb9e9a132d..b3982c4ad091f 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -6,15 +6,16 @@ Installation ============ -The easiest way to install pandas is to install it -as part of the `Anaconda `__ distribution, a -cross platform distribution for data analysis and scientific computing. -The `Conda `__ package manager is the -recommended installation method for most users. +The pandas development team officially distributes pandas for installation +through the following methods: -Instructions for installing :ref:`from source `, -:ref:`PyPI `, or a -:ref:`development version ` are also provided. +* Available on `conda-forge `__ for installation with the conda package manager. +* Available on `PyPI `__ for installation with pip. +* Available on `Github `__ for installation from source. + +.. 
note:: + pandas may be installable from other sources besides the ones listed above, + but they are **not** managed by the pandas development team. .. _install.version: @@ -26,68 +27,54 @@ See :ref:`Python support policy `. Installing pandas ----------------- -.. _install.anaconda: +.. _install.conda: -Installing with Anaconda -~~~~~~~~~~~~~~~~~~~~~~~~ +Installing with Conda +~~~~~~~~~~~~~~~~~~~~~ -For users that are new to Python, the easiest way to install Python, pandas, and the -packages that make up the `PyData `__ stack -(`SciPy `__, `NumPy `__, -`Matplotlib `__, `and more `__) -is with `Anaconda `__, a cross-platform -(Linux, macOS, Windows) Python distribution for data analytics and -scientific computing. Installation instructions for Anaconda -`can be found here `__. +For users working with the `Conda `__ package manager, +pandas can be installed from the ``conda-forge`` channel. -.. _install.miniconda: +.. code-block:: shell -Installing with Miniconda -~~~~~~~~~~~~~~~~~~~~~~~~~ + conda install -c conda-forge pandas -For users experienced with Python, the recommended way to install pandas with -`Miniconda `__. -Miniconda allows you to create a minimal, self-contained Python installation compared to Anaconda and use the -`Conda `__ package manager to install additional packages -and create a virtual environment for your installation. Installation instructions for Miniconda -`can be found here `__. +To install the Conda package manager on your system, the +`Miniforge distribution `__ +is recommended. -The next step is to create a new conda environment. A conda environment is like a -virtualenv that allows you to specify a specific version of Python and set of libraries. -Run the following commands from a terminal window. +Additionally, it is recommended to install and run pandas from a virtual environment. .. code-block:: shell conda create -c conda-forge -n name_of_my_env python pandas - -This will create a minimal environment with only Python and pandas installed. -To put your self inside this environment run. - -.. code-block:: shell - + # On Linux or MacOS source activate name_of_my_env # On Windows activate name_of_my_env -.. _install.pypi: +.. tip:: + For users that are new to Python, the easiest way to install Python, pandas, and the + packages that make up the `PyData `__ stack such as + `SciPy `__, `NumPy `__ and + `Matplotlib `__ + is with `Anaconda `__, a cross-platform + (Linux, macOS, Windows) Python distribution for data analytics and + scientific computing. -Installing from PyPI -~~~~~~~~~~~~~~~~~~~~ + However, pandas from Anaconda is **not** officially managed by the pandas development team. -pandas can be installed via pip from -`PyPI `__. +.. _install.pip: -.. code-block:: shell - - pip install pandas +Installing with pip +~~~~~~~~~~~~~~~~~~~ -.. note:: - You must have ``pip>=19.3`` to install from PyPI. +For users working with the `pip `__ package manager, +pandas can be installed from `PyPI `__. -.. note:: +.. code-block:: shell - It is recommended to install and run pandas from a virtual environment, for example, - using the Python standard library's `venv `__ + pip install pandas pandas can also be installed with sets of optional dependencies to enable certain functionality. For example, to install pandas with the optional dependencies to read Excel files. @@ -98,25 +85,8 @@ to install pandas with the optional dependencies to read Excel files. 
The full list of extras that can be installed can be found in the :ref:`dependency section.` -Handling ImportErrors -~~~~~~~~~~~~~~~~~~~~~ - -If you encounter an ``ImportError``, it usually means that Python couldn't find pandas in the list of available -libraries. Python internally has a list of directories it searches through, to find packages. You can -obtain these directories with. - -.. code-block:: python - - import sys - sys.path - -One way you could be encountering this error is if you have multiple Python installations on your system -and you don't have pandas installed in the Python installation you're currently using. -In Linux/Mac you can run ``which python`` on your terminal and it will tell you which Python installation you're -using. If it's something like "/usr/bin/python", you're using the Python from the system, which is not recommended. - -It is highly recommended to use ``conda``, for quick installation and for package and dependency updates. -You can find simple installation instructions for pandas :ref:`in this document `. +Additionally, it is recommended to install and run pandas from a virtual environment, for example, +using the Python standard library's `venv `__ .. _install.source: @@ -144,49 +114,24 @@ index from the PyPI registry of anaconda.org. You can install it by running. pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas -Note that you might be required to uninstall an existing version of pandas to install the development version. +.. note:: + You might be required to uninstall an existing version of pandas to install the development version. -.. code-block:: shell + .. code-block:: shell - pip uninstall pandas -y + pip uninstall pandas -y Running the test suite ---------------------- -pandas is equipped with an exhaustive set of unit tests. The packages required to run the tests -can be installed with ``pip install "pandas[test]"``. To run the tests from a -Python terminal. - -.. code-block:: python - - >>> import pandas as pd - >>> pd.test() - running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.10/site-packages/pandas - - ============================= test session starts ============================== - platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 - rootdir: /home/user - plugins: dash-1.19.0, anyio-3.5.0, hypothesis-6.29.3 - collected 154975 items / 4 skipped / 154971 selected - ........................................................................ [ 0%] - ........................................................................ [ 99%] - ....................................... [100%] - - ==================================== ERRORS ==================================== - - =================================== FAILURES =================================== - - =============================== warnings summary =============================== - - =========================== short test summary info ============================ - - = 1 failed, 146194 passed, 7402 skipped, 1367 xfailed, 5 xpassed, 197 warnings, 10 errors in 1090.16s (0:18:10) = +If pandas has been installed :ref:`from source `, running ``pytest pandas`` will run all of pandas unit tests. +The unit tests can also be run from the pandas module itself with the :func:`test` function. The packages required to run the tests +can be installed with ``pip install "pandas[test]"``. .. note:: - This is just an example of what information is shown. 
Test failures are not necessarily indicative - of a broken pandas installation. + Test failures are not necessarily indicative of a broken pandas installation. .. _install.dependencies: @@ -219,7 +164,7 @@ For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while optional dependency is not installed, pandas will raise an ``ImportError`` when the method requiring that dependency is called. -If using pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml) +With pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml) as optional extras (e.g. ``pandas[performance, aws]``). All optional dependencies can be installed with ``pandas[all]``, and specific sets of dependencies are listed in the sections below. diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index 05729809491b5..024300bb8a9b0 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -271,7 +271,7 @@ Add the parameters' full description and name, provided by the parameters metada Compared to the previous example, there is no common column name. However, the ``parameter`` column in the ``air_quality`` table and the -``id`` column in the ``air_quality_parameters_name`` both provide the +``id`` column in the ``air_quality_parameters`` table both provide the measured variable in a common format. The ``left_on`` and ``right_on`` arguments are used here (instead of just ``on``) to make the link between the two tables. diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 887ffd5580a52..72bb93d21a99f 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -177,7 +177,7 @@ See the indexing documentation :ref:`Indexing and Selecting Data ` and Getitem (``[]``) ~~~~~~~~~~~~~~~~ -For a :class:`DataFrame`, passing a single label selects a columns and +For a :class:`DataFrame`, passing a single label selects a column and yields a :class:`Series` equivalent to ``df.A``: .. ipython:: python diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 9757a72f13fa8..b9c285ca30c96 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -87,8 +87,9 @@ index will be pulled out. **From scalar value** -If ``data`` is a scalar value, an index must be -provided. The value will be repeated to match the length of **index**. +If ``data`` is a scalar value, the value will be repeated to match +the length of **index**. If the **index** is not provided, it defaults +to ``RangeIndex(1)``. .. ipython:: python diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 2f7ec52d117f8..1dd6c5fabef04 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -32,6 +32,7 @@ Version 2.2 .. toctree:: :maxdepth: 2 + v2.2.3 v2.2.2 v2.2.1 v2.2.0 diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 72a2f84c4aaee..fbe5e9b4febb5 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -56,4 +56,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.2.1..v2.2.2|HEAD +.. 
contributors:: v2.2.1..v2.2.2
diff --git a/doc/source/whatsnew/v2.2.3.rst b/doc/source/whatsnew/v2.2.3.rst
new file mode 100644
index 0000000000000..1696a7b6449af
--- /dev/null
+++ b/doc/source/whatsnew/v2.2.3.rst
@@ -0,0 +1,45 @@
+.. _whatsnew_223:
+
+What's new in 2.2.3 (September 20, 2024)
+----------------------------------------
+
+These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_223.py13_compat:
+
+Pandas 2.2.3 is now compatible with Python 3.13
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pandas 2.2.3 is the first version of pandas that is generally compatible with the upcoming
+Python 3.13, and wheels for both free-threaded and normal Python 3.13 will be uploaded for
+this release.
+
+As usual, please report any bugs discovered to our `issue tracker <https://github.com/pandas-dev/pandas/issues>`_.
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_223.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Bug in :func:`eval` where division ``/`` on :class:`complex` values discarded the imaginary part. (:issue:`21374`)
+- Minor fixes for numpy 2.1 compatibility. (:issue:`59444`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_223.other:
+
+Other
+~~~~~
+- Missing licenses for 3rd party dependencies were added back into the wheels. (:issue:`58632`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_223.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v2.2.2..v2.2.3|HEAD
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 03355f655eb28..01c2ed3821d7a 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -102,9 +102,11 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results when ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
+- Bug in ``ser.str.slice`` with negative ``step`` for :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
 - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-
+-
 
 Interval
 ^^^^^^^^
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index af61551156fbc..321005272817d 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -54,7 +54,9 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :meth:`Series.map` can now accept kwargs to pass on to ``func`` (:issue:`59814`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
+- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
 - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
@@ -603,7 +605,7 @@ Bug fixes
 
 Categorical
 ^^^^^^^^^^^
--
+- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
 -
 
 Datetimelike
@@ -679,6 +681,8 @@ I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
+- Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
+- Bug in :meth:`DataFrame.from_records` where the ``columns`` parameter was not reordering and filtering out columns when passed a numpy structured array (:issue:`59717`)
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
@@ -687,8 +691,10 @@ I/O
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
+- Bug in :meth:`read_csv` where the order of ``na_values`` caused inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
+- Bug in :meth:`read_json` where extreme integer values supplied as strings were incorrectly parsed as a different integer (:issue:`20608`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
 - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
 - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`)
@@ -708,6 +714,7 @@ Plotting
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`)
+- Bug in :meth:`.DataFrameGroupBy.any` that incorrectly returned ``True`` for groups where all Timedelta values are NaT. (:issue:`59712`)
 - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`)
 - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
@@ -736,6 +743,7 @@ Sparse
 ^^^^^^
 - Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`)
 - Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard coded an invalid ``fill_value`` for certain subtypes. (:issue:`59063`)
+- Bug in :meth:`DataFrame.sparse.to_dense` which ignored subclassing and always returned an instance of :class:`DataFrame` (:issue:`59913`)
 
 ExtensionArray
 ^^^^^^^^^^^^^^
@@ -752,8 +760,9 @@ Other
 ^^^^^
 - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
 - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`)
-- Bug in :func:`eval` on :class:`complex` including division ``/`` discards imaginary part. (:issue:`21374`)
 - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
+- Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`)
+- Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`)
 - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
 - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
 - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h
index e039991847a62..043805a8b25f4 100644
--- a/pandas/_libs/include/pandas/datetime/date_conversions.h
+++ b/pandas/_libs/include/pandas/datetime/date_conversions.h
@@ -9,6 +9,7 @@ The full license is in the LICENSE file, distributed with this software.
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+
 #include <numpy/ndarraytypes.h>
 
 // Scales value inplace from nanosecond resolution to unit resolution
diff --git a/pandas/_libs/include/pandas/parser/io.h b/pandas/_libs/include/pandas/parser/io.h
index c707c23b567d2..41f1bb9312724 100644
--- a/pandas/_libs/include/pandas/parser/io.h
+++ b/pandas/_libs/include/pandas/parser/io.h
@@ -10,9 +10,10 @@ The full license is in the LICENSE file, distributed with this software.
#pragma once #define PY_SSIZE_T_CLEAN -#include "tokenizer.h" #include +#include "tokenizer.h" + #define FS(source) ((file_source *)source) typedef struct _rd_source { diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h index 58a09ae1bba39..543839b5d75bf 100644 --- a/pandas/_libs/include/pandas/parser/pd_parser.h +++ b/pandas/_libs/include/pandas/parser/pd_parser.h @@ -13,9 +13,10 @@ extern "C" { #endif #define PY_SSIZE_T_CLEAN -#include "pandas/parser/tokenizer.h" #include +#include "pandas/parser/tokenizer.h" + typedef struct { int (*to_double)(char *, double *, char, char, int *); int (*floatify)(PyObject *, double *, int *); diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 2fa61642968cf..9706a8211b61f 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -3,6 +3,7 @@ #pragma once #include + #include #include diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1a2a0142c52e..de7d9af731010 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -600,6 +600,8 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool: if not array_equivalent(x, y): return False + elif PyArray_Check(x) or PyArray_Check(y): + return False elif (x is C_NA) ^ (y is C_NA): return False elif not ( @@ -733,7 +735,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -750,7 +754,14 @@ cpdef ndarray[object] ensure_string_array( if hasattr(arr, "to_numpy"): - if hasattr(arr, "dtype") and arr.dtype.kind in "mM": + if ( + hasattr(arr, "dtype") + and arr.dtype.kind in "mM" + # TODO: we should add a custom ArrowExtensionArray.astype implementation + # that handles astype(str) specifically, avoiding ending up here and + # then we can remove the below check for `_pa_array` (for ArrowEA) + and not hasattr(arr, "_pa_array") + ): # dtype check to exclude DataFrame # GH#41409 TODO: not a great place for this out = arr.astype(str).astype(object) @@ -762,11 +773,15 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: - already_copied = False + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False elif not copy and not result.flags.writeable: # Weird edge case where result is a view already_copied = False @@ -1123,10 +1138,21 @@ def is_bool(obj: object) -> bool: """ Return True if given object is boolean. + Parameters + ---------- + obj : object + Object to check. 
+ Returns ------- bool + See Also + -------- + api.types.is_scalar : Check if the input is a scalar. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + Examples -------- >>> pd.api.types.is_bool(True) @@ -1142,10 +1168,22 @@ def is_complex(obj: object) -> bool: """ Return True if given object is complex. + Parameters + ---------- + obj : object + Object to check. + Returns ------- bool + See Also + -------- + api.types.is_complex_dtype: Check whether the provided array or + dtype is of a complex dtype. + api.types.is_number: Check if the object is a number. + api.types.is_integer: Return True if given object is integer. + Examples -------- >>> pd.api.types.is_complex(1 + 1j) @@ -1182,6 +1220,12 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: bool Whether `obj` has list-like properties. + See Also + -------- + Series : One-dimensional ndarray with axis labels (including time series). + Index : Immutable sequence used for indexing and alignment. + numpy.ndarray : Array object from NumPy, which is considered list-like. + Examples -------- >>> import datetime diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 2f44128cda822..390a527c22bbb 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -347,6 +347,14 @@ class NAType(C_NAType): The NA singleton is a missing value indicator defined by pandas. It is used in certain new extension dtypes (currently the "string" dtype). + See Also + -------- + numpy.nan : Floating point representation of Not a Number (NaN) for numerical data. + isna : Detect missing values for an array-like object. + notna : Detect non-missing values for an array-like object. + DataFrame.fillna : Fill missing values in a DataFrame. + Series.fillna : Fill missing values in a Series. + Examples -------- >>> pd.NA diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index 7cc20a52f1849..ef6f1104a1fb9 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -38,10 +38,11 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE -#include "pandas/vendored/ujson/lib/ultrajson.h" #define PY_SSIZE_T_CLEAN #include +#include "pandas/vendored/ujson/lib/ultrajson.h" + static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, JSOBJ value) { int ret = PyDict_SetItem(obj, name, value); diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c index f369d122a3dbe..2ee084b9304f4 100644 --- a/pandas/_libs/src/vendored/ujson/python/ujson.c +++ b/pandas/_libs/src/vendored/ujson/python/ujson.c @@ -40,6 +40,7 @@ Numeric decoder derived from TCL library #define PY_SSIZE_T_CLEAN #include + #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY #include "numpy/arrayobject.h" diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 60afc1acdc297..1c0a99eb1ea25 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -493,6 +493,16 @@ class NaTType(_NaT): """ Total seconds in the duration. + This method calculates the total duration in seconds by combining + the days, seconds, and microseconds of the `Timedelta` object. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + Timedelta : Represents a duration, the difference between two dates or times. 
+ Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. + Examples -------- >>> td = pd.Timedelta('1min') diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 43240046c6500..3e5654b70cd92 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -89,7 +89,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None = *, + str format = *, bint exact = * ) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 61095b3f034fd..193556b2697a9 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -176,6 +176,15 @@ class OutOfBoundsDatetime(ValueError): """ Raised when the datetime is outside the range that can be represented. + This error occurs when attempting to convert or parse a datetime value + that exceeds the bounds supported by pandas' internal datetime + representation. + + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp : Pandas replacement for python ``datetime.datetime`` object. + Examples -------- >>> pd.to_datetime("08335394550") @@ -331,7 +340,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None=None, + str format=None, bint exact=True, ) except? -1: cdef: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 043c029ec900c..4db96fbaa3aad 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3316,6 +3316,11 @@ cdef class SemiMonthEnd(SemiMonthOffset): """ Two DateOffset's per month repeating on the last day of the month & day_of_month. + This offset allows for flexibility in generating date ranges or adjusting dates + to the end of a month or a specific day in the month, such as the 15th or the last + day of the month. It is useful for financial or scheduling applications where + events occur bi-monthly. + Attributes ---------- n : int, default 1 @@ -3325,6 +3330,13 @@ cdef class SemiMonthEnd(SemiMonthOffset): day_of_month : int, {1, 3,...,27}, default 15 A specific integer for the day of the month. + See Also + -------- + tseries.offsets.SemiMonthBegin : Offset for semi-monthly frequencies, starting at + the beginning of the month. + tseries.offsets.MonthEnd : Offset to the last calendar day of the month. + tseries.offsets.MonthBegin : Offset to the first calendar day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 14) @@ -3582,6 +3594,11 @@ cdef class WeekOfMonth(WeekOfMonthMixin): """ Describes monthly dates like "the Tuesday of the 2nd week of each month". + This offset allows for generating or adjusting dates by specifying + a particular week and weekday within a month. The week is zero-indexed, + where 0 corresponds to the first week of the month, and weekday follows + a Monday=0 convention. + Attributes ---------- n : int, default 1 @@ -3602,6 +3619,12 @@ cdef class WeekOfMonth(WeekOfMonthMixin): - 5 is Saturday - 6 is Sunday. + See Also + -------- + offsets.Week : Describes weekly frequency adjustments. + offsets.MonthEnd : Describes month-end frequency adjustments. + date_range : Generates a range of dates based on a specific frequency. 
+ Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 4f90f26cf31ab..bbefea7c47fc3 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1189,6 +1189,16 @@ cdef class _Timedelta(timedelta): """ Total seconds in the duration. + This method calculates the total duration in seconds by combining + the days, seconds, and microseconds of the `Timedelta` object. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + Timedelta : Represents a duration, the difference between two dates or times. + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. + Examples -------- >>> td = pd.Timedelta('1min') @@ -1403,6 +1413,18 @@ cdef class _Timedelta(timedelta): """ Return a numpy.timedelta64 object with 'ns' precision. + Since NumPy uses ``timedelta64`` objects for its time operations, converting + a pandas ``Timedelta`` into a NumPy ``timedelta64`` provides seamless + integration between the two libraries, especially when working in environments + that heavily rely on NumPy for array-based calculations. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + numpy.timedelta64 : A NumPy object for time duration. + Timedelta : Represents a duration, the difference between two dates + or times. + Examples -------- >>> td = pd.Timedelta('3D') @@ -1473,6 +1495,7 @@ cdef class _Timedelta(timedelta): See Also -------- + Timedelta.asm8 : Return a numpy timedelta64 array scalar view. numpy.ndarray.view : Returns a view of an array with the same data. Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64. Timedelta.total_seconds : Returns the total duration of the Timedelta @@ -1493,6 +1516,17 @@ cdef class _Timedelta(timedelta): """ Return a components namedtuple-like. + Each component represents a different time unit, allowing you to access the + breakdown of the total duration in terms of days, hours, minutes, seconds, + milliseconds, microseconds, and nanoseconds. + + See Also + -------- + Timedelta.total_seconds : Returns the total duration of the Timedelta in + seconds. + to_timedelta : Convert argument to Timedelta. + Timedelta : Represents a duration, the difference between two dates or times. + Examples -------- >>> td = pd.Timedelta('2 day 4 min 3 us 42 ns') diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 5fa1a984b8aea..0be01da1816a2 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -108,7 +108,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] if using_string_dtype(): - STRING_DTYPES: list[Dtype] = [str, "U"] + STRING_DTYPES: list[Dtype] = ["U"] else: STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index bbd5e60a5a812..01c4dcd92ee40 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -701,6 +701,10 @@ def assert_extension_array_equal( """ Check that left and right ExtensionArrays are equal. + This method compares two ``ExtensionArray`` instances for equality, + including checks for missing values, the dtype of the arrays, and + the exactness of the comparison (or tolerance when comparing floats). 
+ Parameters ---------- left, right : ExtensionArray @@ -726,6 +730,12 @@ def assert_extension_array_equal( .. versionadded:: 2.0.0 + See Also + -------- + testing.assert_series_equal : Check that left and right ``Series`` are equal. + testing.assert_frame_equal : Check that left and right ``DataFrame`` are equal. + testing.assert_index_equal : Check that left and right ``Index`` are equal. + Notes ----- Missing values are checked separately from valid values. diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index 91b5d2a981bef..4ca67d6fc082d 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -73,14 +73,15 @@ def set_timezone(tz: str) -> Generator[None, None, None]: import time def setTZ(tz) -> None: - if tz is None: - try: - del os.environ["TZ"] - except KeyError: - pass - else: - os.environ["TZ"] = tz - time.tzset() + if hasattr(time, "tzset"): + if tz is None: + try: + del os.environ["TZ"] + except KeyError: + pass + else: + os.environ["TZ"] = tz + time.tzset() orig_tz = os.environ.get("TZ") setTZ(tz) diff --git a/pandas/conftest.py b/pandas/conftest.py index d11213f1164bc..e2db9260ac37d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1272,6 +1272,34 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_no_object(request): + """ + Parametrized fixture for string dtypes. + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) + """ + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) + + @pytest.fixture( params=[ "string[python]", @@ -1310,7 +1338,13 @@ def string_storage(request): pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), ("python", np.nan), - ] + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], ) def string_dtype_arguments(request): """ @@ -1341,6 +1375,7 @@ def dtype_backend(request): # Alias so we can test with cartesian product of string_storage string_storage2 = string_storage +string_dtype_arguments2 = string_dtype_arguments @pytest.fixture(params=tm.BYTES_DTYPES) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index e6f0427de2a3a..413fdafc7fd04 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -53,7 +53,8 @@ @contextmanager def set_numba_data(index: Index): numba_data = index._data - if numba_data.dtype == object: + if numba_data.dtype in (object, "string"): + numba_data = np.asarray(numba_data) if not lib.is_string_array(numba_data): raise ValueError( "The numba engine only supports using string or numeric column names" diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 5959156d11123..1f13459724d78 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -38,10 +38,7 @@ is_numeric_dtype, is_sequence, ) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, -) +from 
pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCNDFrame, @@ -1172,12 +1169,7 @@ def apply_with_numba(self) -> dict[int, Any]: from pandas.core._numba.extensions import set_numba_data index = self.obj.index - if index.dtype == "string": - index = index.astype(object) - columns = self.obj.columns - if columns.dtype == "string": - columns = columns.astype(object) # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict @@ -1470,14 +1462,7 @@ def curried(x): else: curried = func - - # row-wise access - # apply doesn't have a `na_action` keyword and for backward compat reasons - # we need to give `na_action="ignore"` for categorical data. - # TODO: remove the `na_action="ignore"` when that default has been changed in - # Categorical (GH51645). - action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None - mapped = obj._map_values(mapper=curried, na_action=action) + mapped = obj._map_values(mapper=curried) if len(mapped) and isinstance(mapped[0], ABCSeries): # GH#43986 Need to do list(mapped) in order to get treated as nested diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 950d4cd7cc92e..aa5b28c71b12a 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import partial +import re from typing import ( TYPE_CHECKING, Any, @@ -11,6 +12,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under11p0, pa_version_under13p0, pa_version_under17p0, ) @@ -22,10 +24,7 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import ( - Callable, - Sized, - ) + from collections.abc import Callable from pandas._typing import ( Scalar, @@ -34,7 +33,7 @@ class ArrowStringArrayMixin: - _pa_array: Sized + _pa_array: pa.ChunkedArray def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -50,6 +49,37 @@ def _convert_int_result(self, result): def _apply_elementwise(self, func: Callable) -> list[list[Any]]: raise NotImplementedError + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return self._convert_int_result(result) + + def _str_lower(self) -> Self: + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self) -> Self: + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + def _str_pad( self, width: int, @@ -96,13 +126,29 @@ def _str_get(self, i: int) -> Self: selected = pc.utf8_slice_codeunits( self._pa_array, start=start, stop=stop, step=step ) - null_value = pa.scalar( - None, - type=self._pa_array.type, # type: ignore[attr-defined] - ) + null_value = pa.scalar(None, type=self._pa_array.type) result = pc.if_else(not_out_of_bounds, selected, null_value) return 
type(self)(result) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ) -> Self: + if pa_version_under11p0: + # GH#59724 + result = self._apply_elementwise(lambda val: val[start:stop:step]) + return type(self)(pa.chunked_array(result, type=self._pa_array.type)) + if start is None: + if step is not None and step < 0: + # GH#59710 + start = -1 + else: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None ) -> Self: @@ -114,6 +160,33 @@ def _str_slice_replace( stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ) -> Self: + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) + + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. + pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + def _str_capitalize(self) -> Self: return type(self)(pc.utf8_capitalize(self._pa_array)) @@ -123,6 +196,16 @@ def _str_title(self) -> Self: def _str_swapcase(self) -> Self: return type(self)(pc.utf8_swapcase(self._pa_array)) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_removesuffix(self, suffix: str): ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) @@ -214,6 +297,20 @@ def _str_contains( result = result.fill_null(na) return self._convert_bool_result(result) + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + def _str_find(self, sub: str, start: int = 0, end: int | None = None): if ( pa_version_under13p0 diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index cbc9ce0252750..285c3fd465ffc 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -1,27 +1,8 @@ from __future__ import annotations -import warnings - import numpy as np import pyarrow -from pandas._config.config import get_option - -from pandas.errors import PerformanceWarning -from pandas.util._exceptions import find_stack_level - - -def 
fallback_performancewarning(version: str | None = None) -> None: - """ - Raise a PerformanceWarning for falling back to ExtensionArray's - non-pyarrow method - """ - if get_option("performance_warnings"): - msg = "Falling back on a non-pyarrow code path which may decrease performance." - if version is not None: - msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning." - warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) - def pyarrow_array_to_numpy_and_mask( arr, dtype: np.dtype diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 15f9ba611a642..00d46ab9296d0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -1998,7 +1999,7 @@ def _rank( """ See Series.rank.__doc__. """ - return type(self)( + return self._convert_rank_result( self._rank_calc( axis=axis, method=method, @@ -2299,7 +2300,13 @@ def _groupby_op( ) if isinstance(result, np.ndarray): return result - return type(self)._from_sequence(result, copy=False) + elif isinstance(result, BaseMaskedArray): + pa_result = result.__arrow_array__() + return type(self)(pa_result) + else: + # DatetimeArray, TimedeltaArray + pa_result = pa.array(result, from_pandas=True) + return type(self)(pa_result) def _apply_elementwise(self, func: Callable) -> list[list[Any]]: """Apply a callable to each element while maintaining the chunking structure.""" @@ -2317,41 +2324,14 @@ def _convert_bool_result(self, result): def _convert_int_result(self, result): return type(self)(result) + def _convert_rank_result(self, result): + return type(self)(result) + def _str_count(self, pat: str, flags: int = 0) -> Self: if flags: raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _result_converter(self, result): - return type(self)(result) - - def _str_replace( - self, - pat: str | re.Pattern, - repl: str | Callable, - n: int = -1, - case: bool = True, - flags: int = 0, - regex: bool = True, - ) -> Self: - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - raise NotImplementedError( - "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" - ) - - func = pc.replace_substring_regex if regex else pc.replace_substring - # https://github.com/apache/arrow/issues/39149 - # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
- pa_max_replacements = None if n < 0 else n - result = func( - self._pa_array, - pattern=pat, - replacement=repl, - max_replacements=pa_max_replacements, - ) - return type(self)(result) - def _str_repeat(self, repeats: int | Sequence[int]) -> Self: if not isinstance(repeats, int): raise NotImplementedError( @@ -2359,20 +2339,6 @@ def _str_repeat(self, repeats: int | Sequence[int]) -> Self: ) return type(self)(pc.binary_repeat(self._pa_array, repeats)) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ) -> Self: - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ) -> Self: - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - def _str_join(self, sep: str) -> Self: if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type @@ -2393,57 +2359,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ) -> Self: - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_len(self) -> Self: - return type(self)(pc.utf8_length(self._pa_array)) - - def _str_lower(self) -> Self: - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self) -> Self: - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_removeprefix(self, prefix: str): - if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) - predicate = lambda val: val.removeprefix(prefix) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - def _str_casefold(self) -> Self: predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) @@ -2475,7 +2390,9 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.bool_ split = pc.split_pattern(self._pa_array, sep) flattened_values = pc.list_flatten(split) uniques = flattened_values.unique() @@ -2485,7 +2402,15 @@ def _str_get_dummies(self, sep: str = "|"): n_cols = len(uniques) indices = pc.index_in(flattened_values, 
uniques_sorted).to_numpy() indices = indices + np.arange(n_rows).repeat(lengths) * n_cols - dummies = np.zeros(n_rows * n_cols, dtype=np.bool_) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) + if dtype == str: + dummies[:] = False dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 536c7303a2f92..5f2c2a7772f78 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -999,16 +999,73 @@ def interpolate( **kwargs, ) -> Self: """ - See DataFrame.interpolate.__doc__. + Fill NaN values using an interpolation method. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. One of: + * 'linear': Ignore the index and treat the values as equally spaced. + This is the only method supported on MultiIndexes. + * 'time': Works on daily and higher resolution data to interpolate + given length of interval. + * 'index', 'values': use the actual numerical values of the index. + * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', + 'polynomial': Passed to scipy.interpolate.interp1d, whereas 'spline' + is passed to scipy.interpolate.UnivariateSpline. These methods use + the numerical values of the index. + Both 'polynomial' and 'spline' require that you also specify an + order (int), e.g. arr.interpolate(method='polynomial', order=5). + * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima', + 'cubicspline': Wrappers around the SciPy interpolation methods + of similar names. See Notes. + * 'from_derivatives': Refers to scipy.interpolate.BPoly.from_derivatives. + axis : int + Axis to interpolate along. For 1-dimensional data, use 0. + index : Index + Index to use for interpolation. + limit : int or None + Maximum number of consecutive NaNs to fill. Must be greater than 0. + limit_direction : {'forward', 'backward', 'both'} + Consecutive NaNs will be filled in this direction. + limit_area : {'inside', 'outside'} or None + If limit is specified, consecutive NaNs will be filled with this + restriction. + * None: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + copy : bool + If True, a copy of the object is returned with interpolated values. + **kwargs : optional + Keyword arguments to pass on to the interpolating function. + + Returns + ------- + ExtensionArray + An ExtensionArray with interpolated values. + + See Also + -------- + Series.interpolate : Interpolate values in a Series. + DataFrame.interpolate : Interpolate values in a DataFrame. + + Notes + ----- + - All parameters must be specified as keyword arguments. + - The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima' + methods are wrappers around the respective SciPy implementations of + similar names. These use the actual numerical values of the index. Examples -------- + Interpolating values in a NumPy array: + >>> arr = pd.arrays.NumpyExtensionArray(np.array([0, 1, np.nan, 3])) >>> arr.interpolate( ... method="linear", ... limit=3, ... limit_direction="forward", - ... index=pd.Index([1, 2, 3, 4]), + ... index=pd.Index(range(len(arr))), ... fill_value=1, ... copy=False, ... 
axis=0, @@ -1017,6 +1074,22 @@ def interpolate( [0.0, 1.0, 2.0, 3.0] Length: 4, dtype: float64 + + Interpolating values in a FloatingArray: + + >>> arr = pd.array([1.0, pd.NA, 3.0, 4.0, pd.NA, 6.0], dtype="Float64") + >>> arr.interpolate( + ... method="linear", + ... axis=0, + ... index=pd.Index(range(len(arr))), + ... limit=None, + ... limit_direction="both", + ... limit_area=None, + ... copy=True, + ... ) + + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + Length: 6, dtype: Float64 """ # NB: we return type(self) even if copy=False raise NotImplementedError( diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 74c0cd7719c13..53ebc35b68d14 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -286,6 +286,13 @@ class BooleanArray(BaseMaskedArray): ------- BooleanArray + See Also + -------- + array : Create an array from data with the appropriate dtype. + BooleanDtype : Extension dtype for boolean data. + Series : One-dimensional ndarray with axis labels (including time series). + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Examples -------- Create an BooleanArray with :func:`pandas.array`: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c613a345686cc..a69e197df851d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2681,11 +2681,13 @@ def _str_map( result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) return take_nd(result, codes, fill_value=na_value) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): # sep may not be in categories. Just bail on this. from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies( + sep, dtype + ) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index fbe1677b95b33..a25a698856747 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -471,10 +471,16 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) + elif is_string_dtype(dtype): + if isinstance(dtype, ExtensionDtype): + arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type] + cls = dtype.construct_array_type() + return cls._from_sequence(arr_object, dtype=dtype, copy=False) + else: + return self._format_native_types() + elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) - elif is_string_dtype(dtype): - return self._format_native_types() elif dtype.kind in "iu": # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. 
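The ``astype`` branch added above routes datetime-like to string casts through ``_format_native_types`` so that the target dtype's ``na_value`` is honored. A rough sketch of the user-visible effect (input values are illustrative):

    import pandas as pd

    dta = pd.array(pd.to_datetime(["2024-01-01", None]))
    # casting to the string extension dtype keeps pd.NA as the missing-value
    # marker rather than the literal string "NaT"
    print(dta.astype("string"))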
@@ -1387,7 +1393,7 @@ def __add__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(result) + return TimedeltaArray._from_sequence(result, dtype=result.dtype) return result def __radd__(self, other): @@ -1447,7 +1453,7 @@ def __sub__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(result) + return TimedeltaArray._from_sequence(result, dtype=result.dtype) return result def __rsub__(self, other): @@ -1466,7 +1472,7 @@ def __rsub__(self, other): # Avoid down-casting DatetimeIndex from pandas.core.arrays import DatetimeArray - other = DatetimeArray._from_sequence(other) + other = DatetimeArray._from_sequence(other, dtype=other.dtype) return other - self elif self.dtype.kind == "M" and hasattr(other, "dtype") and not other_is_dt64: # GH#19959 datetime - datetime is well-defined as timedelta, diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 201c449185057..43f4428118aa7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -818,11 +818,7 @@ def _add_offset(self, offset: BaseOffset) -> Self: stacklevel=find_stack_level(), ) res_values = self.astype("O") + offset - # TODO(GH#55564): as_unit will be unnecessary - result = type(self)._from_sequence(res_values).as_unit(self.unit) - if not len(self): - # GH#30336 _from_sequence won't be able to infer self.tz - return result.tz_localize(self.tz) + result = type(self)._from_sequence(res_values, dtype=self.dtype) else: result = type(self)._simple_new(res_values, dtype=res_values.dtype) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index b3fbf0f92c32d..67c23f4825a7f 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -96,6 +96,14 @@ class FloatingArray(NumericArray): ------- FloatingArray + See Also + -------- + array : Create an array. + Float32Dtype : Float32 dtype for FloatingArray. + Float64Dtype : Float64 dtype for FloatingArray. + Series : One-dimensional labeled array capable of holding data. + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Examples -------- Create an FloatingArray with :func:`pandas.array`: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 52d64162358c8..2ac9c77bef322 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1291,6 +1291,16 @@ def mid(self) -> Index: """ Return the midpoint of each Interval in the IntervalArray as an Index. + The midpoint of an interval is calculated as the average of its + ``left`` and ``right`` bounds. This property returns a ``pandas.Index`` object + containing the midpoint for each interval. + + See Also + -------- + Interval.left : Return left bound for the interval. + Interval.right : Return right bound for the interval. + Interval.length : Return the length of each interval. 
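For reference, a small usage sketch of the ``mid`` property documented above (the breaks are chosen arbitrarily):

    import pandas as pd

    arr = pd.arrays.IntervalArray.from_breaks([0, 1, 3])
    # the midpoint of each interval is (left + right) / 2, returned as an Index
    print(arr.mid)  # expected: Index([0.5, 2.0], dtype='float64')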
+ Examples -------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index aa8dacbd6aad5..7d0ad74f851f0 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -812,7 +812,7 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: new_parr = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base) - dta = DatetimeArray._from_sequence(new_data) + dta = DatetimeArray._from_sequence(new_data, dtype=np.dtype("M8[ns]")) if self.freq.name == "B": # See if we can retain BDay instead of Day in cases where diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index e610e018c5a74..8083371ed171a 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -369,10 +369,10 @@ def to_dense(self) -> DataFrame: 1 1 2 0 """ - from pandas import DataFrame - data = {k: v.array.to_dense() for k, v in self._parent.items()} - return DataFrame(data, index=self._parent.index, columns=self._parent.columns) + return self._parent._constructor( + data, index=self._parent.index, columns=self._parent.columns + ) def to_coo(self) -> spmatrix: """ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index a09dc20af3b36..0c76280e7fdb4 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -603,6 +603,18 @@ def sp_values(self) -> np.ndarray: """ An ndarray containing the non- ``fill_value`` values. + This property returns the actual data values stored in the sparse + representation, excluding the values that are equal to the ``fill_value``. + The result is an ndarray of the underlying values, preserving the sparse + structure by omitting the default ``fill_value`` entries. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. + Series.sparse.fill_value : Elements in `data` that are `fill_value` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. + Examples -------- >>> from pandas.arrays import SparseArray @@ -623,6 +635,12 @@ def fill_value(self): For memory savings, this should be the most common value in the array. + See Also + -------- + SparseDtype : Dtype for data stored in :class:`SparseArray`. + Series.value_counts : Return a Series containing counts of unique values. + Series.fillna : Fill NA/NaN in a Series with a specified value. + Examples -------- >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]") @@ -690,6 +708,18 @@ def npoints(self) -> int: """ The number of non- ``fill_value`` points. + This property returns the number of elements in the sparse series that are + not equal to the ``fill_value``. Sparse data structures store only the + non-``fill_value`` elements, reducing memory usage when the majority of + values are the same. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. + Series.sparse.fill_value : Elements in ``data`` that are ``fill_value`` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. 
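A short sketch tying together the sparse attributes documented above (the data is invented for illustration):

    import pandas as pd
    from pandas.arrays import SparseArray

    arr = SparseArray([0, 0, 1, 2], fill_value=0)
    print(arr.sp_values)   # array([1, 2]); fill_value entries are not stored
    print(arr.npoints)     # 2, the number of non-fill_value points
    print(arr.fill_value)  # 0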
+ Examples -------- >>> from pandas.arrays import SparseArray diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 88fd1481031f8..b3aa782341c77 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,7 @@ nanops, ops, ) +from pandas.core.algorithms import isin from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -65,6 +66,7 @@ import pyarrow from pandas._typing import ( + ArrayLike, AxisInt, Dtype, DtypeObj, @@ -715,6 +717,10 @@ def __setitem__(self, key, value) -> None: else: if not is_array_like(value): value = np.asarray(value, dtype=object) + else: + # cast categories and friends to arrays to see if values are + # compatible, compatibility with arrow backed strings + value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): raise TypeError("Must provide strings.") @@ -731,6 +737,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, BaseStringArray) or ( + isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) + ): + values = values.astype(self.dtype, copy=False) + else: + if not lib.is_string_array(np.asarray(values), skipna=True): + values = np.array( + [val for val in values if isinstance(val, str) or isna(val)], + dtype=object, + ) + if not len(values): + return np.zeros(self.shape, dtype=bool) + + values = self._from_sequence(values, dtype=self.dtype) + + return isin(np.asarray(self), np.asarray(values)) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 97381b82ceab9..75bb1f8fb1a65 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -10,8 +10,6 @@ import numpy as np -from pandas._config.config import get_option - from pandas._libs import ( lib, missing as libmissing, @@ -31,6 +29,7 @@ from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float64Dtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import ( @@ -43,8 +42,6 @@ import pyarrow as pa import pyarrow.compute as pc - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - if TYPE_CHECKING: from collections.abc import ( @@ -54,9 +51,8 @@ from pandas._typing import ( ArrayLike, - AxisInt, Dtype, - Scalar, + NpDtype, Self, npt, ) @@ -240,7 +236,7 @@ def _maybe_convert_setitem_value(self, value): value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Scalar must be NA or str") + raise TypeError("Must provide strings") return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: @@ -293,13 +289,26 @@ def astype(self, dtype, copy: bool = True): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad + _str_match = ArrowStringArrayMixin._str_match + _str_fullmatch = ArrowStringArrayMixin._str_fullmatch + _str_lower = 
ArrowStringArrayMixin._str_lower + _str_upper = ArrowStringArrayMixin._str_upper + _str_strip = ArrowStringArrayMixin._str_strip + _str_lstrip = ArrowStringArrayMixin._str_lstrip + _str_rstrip = ArrowStringArrayMixin._str_rstrip + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_get = ArrowStringArrayMixin._str_get + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace + _str_len = ArrowStringArrayMixin._str_len + _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): if flags: - if get_option("mode.performance_warnings"): - fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) if not isna(na): @@ -325,90 +334,23 @@ def _str_replace( regex: bool = True, ): if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - if get_option("mode.performance_warnings"): - fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) - return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) + return ArrowStringArrayMixin._str_replace( + self, pat, repl, n, case, flags, regex + ) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): return super()._str_repeat(repeats) else: - return type(self)(pc.binary_repeat(self._pa_array, repeats)) - - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ) -> Self: - if stop is None: - return super()._str_slice(start, stop, step) - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return self._convert_int_result(result) - - def _str_lower(self) -> Self: - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self) -> Self: - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) + return ArrowExtensionArray._str_repeat(self, repeats=repeats) def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = 
pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) + return ArrowStringArrayMixin._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) @@ -425,12 +367,22 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return super()._str_find(sub, start, end) return ArrowStringArrayMixin._str_find(self, sub, start, end) - def _str_get_dummies(self, sep: str = "|"): - dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.int64 + dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies( + sep, dtype + ) if len(labels) == 0: - return np.empty(shape=(0, 0), dtype=np.int64), labels + return np.empty(shape=(0, 0), dtype=dtype), labels dummies = np.vstack(dummies_pa.to_numpy()) - return dummies.astype(np.int64, copy=False), labels + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + return dummies.astype(dummies_dtype, copy=False), labels def _convert_int_result(self, result): if self.dtype.na_value is np.nan: @@ -444,6 +396,16 @@ def _convert_int_result(self, result): return Int64Dtype().__from_arrow__(result) + def _convert_rank_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + return result.astype("float64", copy=False) + + return Float64Dtype().__from_arrow__(result) + def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): @@ -465,28 +427,6 @@ def _reduce( else: return result - def _rank( - self, - *, - axis: AxisInt = 0, - method: str = "average", - na_option: str = "keep", - ascending: bool = True, - pct: bool = False, - ): - """ - See Series.rank.__doc__. - """ - return self._convert_int_result( - self._rank_calc( - axis=axis, - method=method, - na_option=na_option, - ascending=ascending, - pct=pct, - ) - ) - def value_counts(self, dropna: bool = True) -> Series: result = super().value_counts(dropna=dropna) if self.dtype.na_value is np.nan: @@ -508,9 +448,3 @@ def _cmp_method(self, other, op): class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan - _str_get = ArrowStringArrayMixin._str_get - _str_removesuffix = ArrowStringArrayMixin._str_removesuffix - _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_title = ArrowStringArrayMixin._str_title - _str_swapcase = ArrowStringArrayMixin._str_swapcase - _str_slice_replace = ArrowStringArrayMixin._str_slice_replace diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index c8a86ffc187d0..a8a0037d0bbb9 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -790,6 +790,19 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: Returns ------- numpy.ndarray + An array of ``datetime.timedelta`` objects, each representing the same + duration as the corresponding element of the original array.
Note that ``datetime.timedelta`` + objects have at most microsecond resolution, so any + nanosecond component of the values is truncated in the conversion. + + See Also + -------- + to_timedelta : Convert argument to timedelta format. + Timedelta : Represents a duration between two dates or times. + DatetimeIndex : Index of datetime64 data. + Timedelta.components : Return a components namedtuple-like + of a single timedelta. Examples -------- @@ -800,6 +813,14 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: >>> tdelta_idx.to_pytimedelta() array([datetime.timedelta(days=1), datetime.timedelta(days=2), datetime.timedelta(days=3)], dtype=object) + + >>> tidx = pd.TimedeltaIndex(data=["1 days 02:30:45", "3 days 04:15:10"]) + >>> tidx + TimedeltaIndex(['1 days 02:30:45', '3 days 04:15:10'], + dtype='timedelta64[ns]', freq=None) + >>> tidx.to_pytimedelta() + array([datetime.timedelta(days=1, seconds=9045), + datetime.timedelta(days=3, seconds=15310)], dtype=object) """ return ints_to_pytimedelta(self._ndarray) @@ -842,6 +863,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: seconds_docstring = textwrap.dedent( """Number of seconds (>= 0 and less than 1 day) for each element. + See Also + -------- + Series.dt.seconds : Return number of seconds for each element. + Series.dt.nanoseconds : Return number of nanoseconds for each element. + Examples -------- For Series: @@ -917,6 +943,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: nanoseconds_docstring = textwrap.dedent( """Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. + See Also + -------- + Series.dt.seconds : Return number of seconds for each element. + Series.dt.microseconds : Return number of microseconds for each element. + Examples -------- For Series: diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 7de4d8cdf99e1..6158c4f4d0539 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -213,7 +213,7 @@ def reconstruct_object(typ, obj, axes, dtype, name): if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: ret_value = res_t.type(obj) else: - ret_value = typ(obj).astype(res_t) + ret_value = res_t.type(obj) # The condition is to distinguish 0-dim array (returned in case of # scalar) and 1 element array # e.g. np.array(0) and np.array([0]) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index aad768d31483a..4ccfbd71d9ce8 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -14,7 +14,10 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.common import ( + is_extension_array_dtype, + is_string_dtype, +) from pandas.core.computation.engines import ENGINES from pandas.core.computation.expr import ( @@ -185,15 +188,6 @@ def eval( """ Evaluate a Python expression as a string using various backends. - The following arithmetic operations are supported: ``+``, ``-``, ``*``, - ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following - boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). - Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, - :keyword:`or`, and :keyword:`not` with the same semantics as the - corresponding bitwise operators.
:class:`~pandas.Series` and - :class:`~pandas.DataFrame` objects are supported and behave as they would - with plain ol' Python evaluation. - .. warning:: ``eval`` can run arbitrary code which can make you vulnerable to code @@ -207,6 +201,34 @@ injection `__, only Python `expressions `__. + + By default, with the numexpr engine, the following operations are supported: + + - Arithmetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%`` + - Boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not) + - Comparison operators: ``<``, ``<=``, ``==``, ``!=``, ``>=``, ``>`` + + Furthermore, the following mathematical functions are supported: + + - Trigonometric: ``sin``, ``cos``, ``tan``, ``arcsin``, ``arccos``, \ ``arctan``, ``arctan2``, ``sinh``, ``cosh``, ``tanh``, ``arcsinh``, \ ``arccosh`` and ``arctanh`` + - Logarithms: ``log`` (natural), ``log10`` (base 10), ``log1p`` (log(1+x)) + - Absolute value: ``abs`` + - Square root: ``sqrt`` + - Exponential: ``exp``, and exponential minus one: ``expm1`` + + See the numexpr engine `documentation + `__ + for further function support details. + + Using the ``'python'`` engine allows the use of native Python operators + such as floor division ``//``, in addition to built-in and user-defined + Python functions. + + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. parser : {'pandas', 'python'}, default 'pandas' The parser to use to construct the syntax tree from the expression. The default of ``'pandas'`` parses code slightly different than standard @@ -345,10 +367,13 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) if engine == "numexpr" and ( - is_extension_array_dtype(parsed_expr.terms.return_type) + ( + is_extension_array_dtype(parsed_expr.terms.return_type) + and not is_string_dtype(parsed_expr.terms.return_type) + ) or getattr(parsed_expr.terms, "operand_types", None) is not None and any( - is_extension_array_dtype(elem) + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) for elem in parsed_expr.terms.operand_types ) ): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index b074e768e0842..f45bc453d2541 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -21,6 +21,8 @@ from pandas.errors import UndefinedVariableError +from pandas.core.dtypes.common import is_string_dtype + import pandas.core.common as com from pandas.core.computation.ops import ( ARITH_OPS_SYMS, @@ -524,10 +526,12 @@ def _maybe_evaluate_binop( elif self.engine != "pytables": if ( getattr(lhs, "return_type", None) == object + or is_string_dtype(getattr(lhs, "return_type", None)) or getattr(rhs, "return_type", None) == object + or is_string_dtype(getattr(rhs, "return_type", None)) ): # evaluate "==" and "!=" in python if either of our operands - # has an object return type + # has an object or string return type return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 665eb75953078..1e1292f8ef089 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -358,7 +358,8 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) elif data.dtype.kind in "iu": - return IntegerArray._from_sequence(data, copy=copy) + dtype = IntegerArray._dtype_cls._get_dtype_mapping()[data.dtype] + return IntegerArray._from_sequence(data, dtype=dtype,
copy=copy) elif data.dtype.kind == "f": # GH#44715 Exclude np.float16 bc FloatingArray does not support it; # we will fall back to NumpyExtensionArray. @@ -366,7 +367,8 @@ def array( return NumpyExtensionArray._from_sequence( data, dtype=data.dtype, copy=copy ) - return FloatingArray._from_sequence(data, copy=copy) + dtype = FloatingArray._dtype_cls._get_dtype_mapping()[data.dtype] + return FloatingArray._from_sequence(data, dtype=dtype, copy=copy) elif data.dtype.kind == "b": return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) @@ -611,7 +613,10 @@ def sanitize_array( dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) - if subarr is data and copy: + if ( + subarr is data + or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr] + ) and copy: subarr = subarr.copy() else: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index bcf1ade9b0320..98c770ec4a8b0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( Interval, Period, @@ -139,6 +141,11 @@ def is_object_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the object dtype. + This method examines the input to determine if it is of the + object data type. Object dtype is a generic data type that can + hold any Python objects, including strings, lists, and custom + objects. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -149,6 +156,15 @@ def is_object_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the object dtype. + See Also + -------- + api.types.is_numeric_dtype : Check whether the provided array or dtype is of a + numeric dtype. + api.types.is_string_dtype : Check whether the provided array or dtype is of + the string dtype. + api.types.is_bool_dtype : Check whether the provided array or dtype is of a + boolean dtype. + Examples -------- >>> from pandas.api.types import is_object_dtype @@ -279,6 +295,13 @@ def is_datetime64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the datetime64 dtype. + See Also + -------- + api.types.is_datetime64_ns_dtype: Check whether the provided array or + dtype is of the datetime64[ns] dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_dtype @@ -316,6 +339,13 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of a DatetimeTZDtype dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64tz_dtype @@ -465,6 +495,15 @@ def is_interval_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Interval dtype. + See Also + -------- + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + api.types.is_numeric_dtype : Check whether the provided array or dtype is + of a numeric dtype. + api.types.is_categorical_dtype : Check whether an array-like or dtype is of + the Categorical dtype. 
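A quick sketch of the dtype-introspection helpers cross-referenced above (inputs are illustrative):

    import numpy as np
    import pandas as pd
    from pandas.api.types import is_datetime64_dtype, is_object_dtype

    print(is_object_dtype(pd.Series(["a", 1]).dtype))       # True: mixed values fall back to object
    print(is_datetime64_dtype(np.dtype("datetime64[ns]")))  # True
    print(is_datetime64_dtype(np.dtype("int64")))           # False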
+ Examples -------- >>> from pandas.core.dtypes.common import is_interval_dtype @@ -514,6 +553,12 @@ def is_categorical_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Categorical dtype. + See Also + -------- + api.types.is_list_like: Check if the object is list-like. + api.types.is_complex_dtype: Check whether the provided array or + dtype is of a complex dtype. + Examples -------- >>> from pandas.api.types import is_categorical_dtype @@ -674,6 +719,15 @@ def is_integer_dtype(arr_or_dtype) -> bool: Whether or not the array or dtype is of an integer dtype and not an instance of timedelta64. + See Also + -------- + api.types.is_integer : Return True if given object is integer. + api.types.is_numeric_dtype : Check whether the provided array or dtype is of a + numeric dtype. + api.types.is_float_dtype : Check whether the provided array or dtype is of a + float dtype. + Int64Dtype : An ExtensionDtype for Int64Dtype integer data. + Examples -------- >>> from pandas.api.types import is_integer_dtype @@ -857,6 +911,16 @@ def is_int64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of the int64 dtype. + See Also + -------- + api.types.is_float_dtype : Check whether the provided array or dtype is of a + float dtype. + api.types.is_bool_dtype : Check whether the provided array or dtype is of a + boolean dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + numpy.int64 : Numpy's 64-bit integer type. + Notes ----- Depending on system architecture, the return value of `is_int64_dtype( @@ -977,6 +1041,13 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: bool Whether or not the array or dtype is of the datetime64[ns] dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_ns_dtype @@ -1239,6 +1310,9 @@ def is_float_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a float dtype. + The function checks for floating-point data types, which represent real numbers + that may have fractional components. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -1249,6 +1323,15 @@ def is_float_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a float dtype. + See Also + -------- + api.types.is_numeric_dtype : Check whether the provided array or dtype is of + a numeric dtype. + api.types.is_integer_dtype : Check whether the provided array or dtype is of + an integer dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + Examples -------- >>> from pandas.api.types import is_float_dtype @@ -1374,6 +1457,10 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: bool Whether the `arr_or_dtype` is an extension array type. + See Also + -------- + api.extensions.ExtensionArray : Abstract base class for pandas extension arrays. + Notes ----- This checks whether an object implements the pandas extension @@ -1408,7 +1495,15 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: elif isinstance(dtype, np.dtype): return False else: - return registry.find(dtype) is not None + try: + with warnings.catch_warnings(): + # pandas_dtype(..) 
can raise UserWarning for class input + warnings.simplefilter("ignore", UserWarning) + dtype = pandas_dtype(dtype) + except (TypeError, ValueError): + # np.dtype(..) can raise ValueError + return False + return isinstance(dtype, ExtensionDtype) def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: @@ -1436,6 +1531,14 @@ def is_complex_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a complex dtype. + See Also + -------- + api.types.is_complex: Return True if given object is complex. + api.types.is_numeric_dtype: Check whether the provided array or + dtype is of a numeric dtype. + api.types.is_integer_dtype: Check whether the provided array or + dtype is of an integer dtype. + Examples -------- >>> from pandas.api.types import is_complex_dtype @@ -1703,6 +1806,12 @@ def pandas_dtype(dtype) -> DtypeObj: elif isinstance(dtype, (np.dtype, ExtensionDtype)): return dtype + # builtin aliases + if dtype is str and using_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + # registered extension types result = registry.find(dtype) if result is not None: diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 68b4807961d19..bb6610c514375 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1065,6 +1065,20 @@ def freq(self) -> BaseOffset: """ The frequency object of this PeriodDtype. + The `freq` property returns the `BaseOffset` object that represents the + frequency of the PeriodDtype. This frequency specifies the interval (e.g., + daily, monthly, yearly) associated with the Period type. It is essential + for operations that depend on time-based calculations within a period index + or series. + + See Also + -------- + Period : Represents a period of time. + PeriodIndex : Immutable ndarray holding ordinal values indicating + regular periods. + PeriodDtype : An ExtensionDtype for Period data. + date_range : Return a fixed frequency range of dates. + Examples -------- >>> dtype = pd.PeriodDtype(freq="D") diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index f042911b53d2b..6adb34ff0f777 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -113,13 +113,24 @@ def is_file_like(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object to check for file-like properties. + This can be any Python object, and the function will + check if it has attributes typically associated with + file-like objects (e.g., `read`, `write`, `__iter__`). Returns ------- bool Whether `obj` has file-like properties. + See Also + -------- + api.types.is_dict_like : Check if the object is dict-like. + api.types.is_hashable : Return True if hash(obj) will succeed, False otherwise. + api.types.is_named_tuple : Check if the object is a named tuple. + api.types.is_iterator : Check if the object is an iterator. + Examples -------- >>> import io @@ -142,13 +153,24 @@ def is_re(obj: object) -> TypeGuard[Pattern]: Parameters ---------- - obj : The object to check + obj : object + The object to check for being a regex pattern. Typically, + this would be an object that you expect to be a compiled + pattern from the `re` module. Returns ------- bool Whether `obj` is a regex pattern. + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_integer : Return True if given object is integer. 
+ api.types.is_re_compilable : Check if the object can be compiled + into a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re @@ -275,13 +297,22 @@ def is_dict_like(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object to check. This can be any Python object, + and the function will determine whether it + behaves like a dictionary. Returns ------- bool Whether `obj` has dict-like properties. + See Also + -------- + api.types.is_list_like : Check if the object is list-like. + api.types.is_file_like : Check if the object is a file-like. + api.types.is_named_tuple : Check if the object is a named tuple. + Examples -------- >>> from pandas.api.types import is_dict_like @@ -308,13 +339,22 @@ def is_named_tuple(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object that will be checked to determine + whether it is a named tuple. Returns ------- bool Whether `obj` is a named tuple. + See Also + -------- + api.types.is_dict_like: Check if the object is dict-like. + api.types.is_hashable: Return True if hash(obj) + will succeed, False otherwise. + api.types.is_categorical_dtype : Check if the dtype is categorical. + Examples -------- >>> from collections import namedtuple @@ -340,9 +380,24 @@ def is_hashable(obj: object) -> TypeGuard[Hashable]: Distinguish between these and other types by trying the call to hash() and seeing if they raise TypeError. + Parameters + ---------- + obj : object + The object to check for hashability. Any Python object can be passed here. + Returns ------- bool + True if object can be hashed (i.e., does not raise TypeError when + passed to hash()), and False otherwise (e.g., if object is mutable + like a list or dictionary). + + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_list_like : Check if the object is list-like. + api.types.is_dict_like : Check if the object is dict-like. Examples -------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a93a212e7eabd..1b47002e72fc6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1192,6 +1192,7 @@ def _repr_html_(self) -> str | None: min_rows = get_option("display.min_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") + show_floats = get_option("display.float_format") formatter = fmt.DataFrameFormatter( self, @@ -1199,7 +1200,7 @@ def _repr_html_(self) -> str | None: col_space=None, na_rep="NaN", formatters=None, - float_format=None, + float_format=show_floats, sparsify=None, justify=None, index_names=True, @@ -4479,20 +4480,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No expr : str The query string to evaluate. - You can refer to variables - in the environment by prefixing them with an '@' character like - ``@a + b``. - - You can refer to column names that are not valid Python variable names - by surrounding them in backticks. Thus, column names containing spaces - or punctuation (besides underscores) or starting with digits must be - surrounded by backticks. (For example, a column named "Area (cm^2)" would - be referenced as ```Area (cm^2)```). Column names which are Python keywords - (like "if", "for", "import", etc) cannot be used. - - For example, if one of your columns is called ``a a`` and you want - to sum it with ``b``, your query should be ```a a` + b``. 
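A small worked example of the backtick and ``@`` rules described above (the frame and threshold are invented for illustration):

    import pandas as pd

    df = pd.DataFrame({"a a": [1, 2, 3], "b": [10, 20, 30]})
    limit = 25
    # backticks quote a column name containing a space; @ references a
    # local Python variable
    print(df.query("`a a` + b < @limit"))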
+ See the documentation for :func:`eval` for details of + supported operations and functions in the query string. + See the documentation for :meth:`DataFrame.eval` for details on + referring to column names and variables in the query string. inplace : bool Whether to modify the DataFrame rather than creating a new one. **kwargs @@ -4651,8 +4643,18 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: in the environment by prefixing them with an '@' character like ``@a + b``. - You can refer to column names that are not valid Python variable - names by surrounding them with backticks `````. + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuation (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "if", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + See the documentation for :func:`eval` for full details of + supported operations and functions in the expression string. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, @@ -4660,7 +4662,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: **kwargs See the documentation for :func:`eval` for complete details on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. + :meth:`~pandas.DataFrame.eval`. Returns ------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35c4433dae7c5..f076f8d79f104 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,8 +67,6 @@ from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, - _agg_template_frame, - _agg_template_series, _transform_template, ) from pandas.core.indexes.api import ( @@ -324,8 +322,141 @@ def apply(self, func, *args, **kwargs) -> Series: """ return super().apply(func, *args, **kwargs) - @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Aggregate using one or more operations. + + The ``aggregate`` method enables flexible and efficient aggregation of grouped + data using a variety of functions, including built-in, user-defined, and + optimized JIT-compiled functions. + + Parameters + ---------- + func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a Series or when passed to Series.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - None, in which case ``**kwargs`` are used with Named Aggregation. Here + the output has one column for each element in ``**kwargs``. The name of + the column is keyword, whereas the value determines the aggregation + used to compute the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. 
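The `DataFrame.query` docstring above now defers to `DataFrame.eval`, which gains the relocated backtick rules. A quick illustration of those rules (standard released-pandas behavior, not new to this patch); the aggregate docstring continues below:

```python
import pandas as pd

df = pd.DataFrame({"Area (cm^2)": [4.0, 9.0], "a a": [1, 2], "b": [10, 20]})

# Backticks quote column names that are not valid Python identifiers.
print(df.query("`Area (cm^2)` > 5"))
print(df.eval("`a a` + b"))
```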
+ + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + .. deprecated:: 2.1.0 + + Passing a dictionary is deprecated and will raise in a future version + of pandas. Pass a list of aggregations instead. + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + + **kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + + Returns + ------- + Series + Aggregated Series based on the grouping and the applied aggregation + functions. + + See Also + -------- + SeriesGroupBy.apply : Apply function func group-wise + and combine the results together. + SeriesGroupBy.transform : Transforms the Series on each group + based on the given function. + Series.aggregate : Aggregate using one or more operations. + + Notes + ----- + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).min() + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg("min") + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg(["min", "max"]) + min max + 1 1 2 + 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum="min", + ... maximum="max", + ... ) + minimum maximum + 1 1 2 + 2 3 4 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating + function. + + >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) + 1 1.0 + 2 3.0 + dtype: float64 + """ relabeling = func is None columns = None if relabeling: @@ -600,15 +731,24 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): ---------- func : function Criterion to apply to each group. Should return True or False. - dropna : bool + dropna : bool, optional Drop groups that do not pass the filter. True by default; if False, groups that evaluate False are filled with NaNs. 
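The inlined `SeriesGroupBy.aggregate` docstring above spells out the numba-engine contract: a single UDF taking the group's `values` and `index` as its first two positional arguments. A hedged sketch of that contract, assuming numba is installed (drop `engine="numba"` to run the same UDF through the default engine); the `filter` parameter list continues below:

```python
import pandas as pd

def group_range(values, index):
    # With engine="numba", values and index arrive as NumPy arrays
    # and the function is JIT-compiled; no fallback is attempted.
    return values.max() - values.min()

s = pd.Series([1.0, 4.0, 2.0, 8.0])
print(s.groupby([1, 1, 2, 2]).agg(group_range, engine="numba"))
```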
+        *args : tuple
+            Optional positional arguments to pass to `func`.
+        **kwargs : dict
+            Optional keyword arguments to pass to `func`.
 
         Returns
         -------
         Series
             The filtered subset of the original Series.
 
+        See Also
+        --------
+        Series.filter : Filter elements of ungrouped Series.
+        DataFrameGroupBy.filter : Filter elements from groups based on a criterion.
+
         Notes
         -----
         Functions that mutate the passed object can produce unexpected
@@ -1506,8 +1646,181 @@ class DataFrameGroupBy(GroupBy[DataFrame]):
     """
     )
 
-    @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame")
     def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
+        """
+        Aggregate using one or more operations.
+
+        The ``aggregate`` function allows the application of one or more aggregation
+        operations on groups of data within a DataFrameGroupBy object. It supports
+        various aggregation methods, including user-defined functions and predefined
+        functions such as 'sum', 'mean', etc.
+
+        Parameters
+        ----------
+        func : function, str, list, dict or None
+            Function to use for aggregating the data. If a function, must either
+            work when passed a DataFrame or when passed to DataFrame.apply.
+
+            Accepted combinations are:
+
+            - function
+            - string function name
+            - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
+            - dict of index labels -> functions, function names or list of such.
+            - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
+              output has one column for each element in ``**kwargs``. The name of the
+              column is keyword, whereas the value determines the aggregation used to
+              compute the values in the column.
+
+            Can also accept a Numba JIT function with
+            ``engine='numba'`` specified. Only passing a single function is supported
+            with this engine.
+
+            If the ``'numba'`` engine is chosen, the function must be
+            a user defined function with ``values`` and ``index`` as the
+            first and second arguments respectively in the function signature.
+            Each group's index will be passed to the user defined function
+            and optionally available for use.
+
+        *args
+            Positional arguments to pass to func.
+        engine : str, default None
+            * ``'cython'`` : Runs the function through C-extensions from cython.
+            * ``'numba'`` : Runs the function through JIT compiled code from numba.
+            * ``None`` : Defaults to ``'cython'`` or globally setting
+              ``compute.use_numba``
+
+        engine_kwargs : dict, default None
+            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
+            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
+              and ``parallel`` dictionary keys. The values must either be ``True`` or
+              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
+              ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
+              applied to the function
+
+        **kwargs
+            * If ``func`` is None, ``**kwargs`` are used to define the output names and
+              aggregations via Named Aggregation. See ``func`` entry.
+            * Otherwise, keyword arguments to be passed into func.
+
+        Returns
+        -------
+        DataFrame
+            Aggregated DataFrame based on the grouping and the applied aggregation
+            functions.
+
+        See Also
+        --------
+        DataFrame.groupby.apply : Apply function func group-wise
+            and combine the results together.
+        DataFrame.groupby.transform : Transforms the DataFrame on each group
+            based on the given function.
+        DataFrame.aggregate : Aggregate using one or more operations.
+
+        Notes
+        -----
+        When using ``engine='numba'``, there will be no "fall back" behavior internally.
+        The group data and group index will be passed as numpy arrays to the JITed
+        user defined function, and no alternative execution attempts will be tried.
+
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.
+
+        .. versionchanged:: 1.3.0
+
+            The resulting dtype will reflect the return value of the passed ``func``,
+            see the examples below.
+
+        Examples
+        --------
+        >>> data = {
+        ...     "A": [1, 1, 2, 2],
+        ...     "B": [1, 2, 3, 4],
+        ...     "C": [0.362838, 0.227877, 1.267767, -0.562860],
+        ... }
+        >>> df = pd.DataFrame(data)
+        >>> df
+           A  B         C
+        0  1  1  0.362838
+        1  1  2  0.227877
+        2  2  3  1.267767
+        3  2  4 -0.562860
+
+        The aggregation is for each column.
+
+        >>> df.groupby("A").agg("min")
+           B         C
+        A
+        1  1  0.227877
+        2  3 -0.562860
+
+        Multiple aggregations
+
+        >>> df.groupby("A").agg(["min", "max"])
+            B             C
+          min max       min       max
+        A
+        1   1   2  0.227877  0.362838
+        2   3   4 -0.562860  1.267767
+
+        Select a column for aggregation
+
+        >>> df.groupby("A").B.agg(["min", "max"])
+           min  max
+        A
+        1    1    2
+        2    3    4
+
+        User-defined function for aggregation
+
+        >>> df.groupby("A").agg(lambda x: sum(x) + 2)
+           B         C
+        A
+        1  5  2.590715
+        2  9  2.704907
+
+        Different aggregations per column
+
+        >>> df.groupby("A").agg({"B": ["min", "max"], "C": "sum"})
+            B             C
+          min max       sum
+        A
+        1   1   2  0.590715
+        2   3   4  0.704907
+
+        To control the output names with different aggregations per column,
+        pandas supports "named aggregation"
+
+        >>> df.groupby("A").agg(
+        ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
+        ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"),
+        ... )
+           b_min     c_sum
+        A
+        1      1  0.590715
+        2      3  0.704907
+
+        - The keywords are the *output* column names
+        - The values are tuples whose first element is the column to select
+          and the second element is the aggregation to apply to that column.
+          Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
+          ``['column', 'aggfunc']`` to make it clearer what the arguments are.
+          As usual, the aggregation can be a callable or a string alias.
+
+        See :ref:`groupby.aggregate.named` for more.
+
+        .. versionchanged:: 1.3.0
+
+            The resulting dtype will reflect the return value of the aggregating
+            function.
+
+        >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
+             B
+        A
+        1  1.0
+        2  3.0
+        """
         relabeling, func, columns, order = reconstruct_func(func, **kwargs)
         func = maybe_mangle_lambdas(func)
 
@@ -1943,9 +2256,9 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame:
         dropna : bool
             Drop groups that do not pass the filter. True by default; if False,
             groups that evaluate False are filled with NaNs.
-        *args
+        *args : tuple
            Additional positional arguments to pass to `func`.
-        **kwargs
+        **kwargs : dict
            Additional keyword arguments to pass to `func`.
 
         Returns
@@ -1953,6 +2266,11 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame:
         DataFrame
             The filtered subset of the original DataFrame.
 
+        See Also
+        --------
+        DataFrame.filter : Filter elements of ungrouped DataFrame.
+        SeriesGroupBy.filter : Filter elements from groups based on a criterion.
+
         Notes
         -----
         Each subframe is endowed the attribute 'name' in case you need to know
@@ -2686,7 +3004,9 @@ def hist(
 
         Returns
         -------
-        matplotlib.Axes or numpy.ndarray of them
+        matplotlib.Axes or numpy.ndarray
+            A ``matplotlib.Axes`` object or an array of ``Axes`` objects, depending on
+            the layout and grouping.
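The `filter` docstrings above now document `*args`/`**kwargs`; both are forwarded verbatim to the criterion function. A small sketch:

```python
import pandas as pd

def total_above(group, threshold):
    # Criterion returns True to keep the whole group.
    return group["B"].sum() > threshold

df = pd.DataFrame({"A": ["x", "x", "y", "y"], "B": [1, 2, 3, 4]})

# threshold is forwarded via **kwargs; only group "y" (sum 7) survives.
print(df.groupby("A").filter(total_above, threshold=4))
```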
See Also -------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9ec25858a455a..68314567d1b5e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -199,6 +199,15 @@ class providing the base-class of operations. Series or DataFrame Computed {fname} of values within each group. +See Also +-------- +SeriesGroupBy.min : Return the min of the group values. +DataFrameGroupBy.min : Return the min of the group values. +SeriesGroupBy.max : Return the max of the group values. +DataFrameGroupBy.max : Return the max of the group values. +SeriesGroupBy.sum : Return the sum of the group values. +DataFrameGroupBy.sum : Return the sum of the group values. + Examples -------- {example} @@ -355,165 +364,6 @@ class providing the base-class of operations. -------- %(example)s""" -_agg_template_series = """ -Aggregate using one or more operations. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - - .. deprecated:: 2.1.0 - - Passing a dictionary is deprecated and will raise in a future version - of pandas. Pass a list of aggregations instead. -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}GroupBy.apply : Apply function func group-wise - and combine the results together. -{klass}GroupBy.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more operations. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. 
-The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. -{examples}""" - -_agg_template_frame = """ -Aggregate using one or more operations. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of index labels -> functions, function names or list of such. - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}.groupby.apply : Apply function func group-wise - and combine the results together. -{klass}.groupby.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more operations. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. 
-{examples}""" - @final class GroupByPlot(PandasObject): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 04bc9db302af8..a82e77140d274 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -371,6 +371,10 @@ def _call_cython_op( is_datetimelike = dtype.kind in "mM" + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if is_datetimelike: values = values.view("int64") is_numeric = True @@ -380,12 +384,10 @@ def _call_cython_op( values = values.astype(np.float32) if self.how in ["any", "all"]: - if mask is None: - mask = isna(values) if dtype == object: if kwargs["skipna"]: # GH#37501: don't raise on pd.NA when skipna=True - if mask.any(): + if mask is not None and mask.any(): # mask on original values computed separately values = values.copy() values[mask] = True diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 582e1f96fa562..749a5fea4d513 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -504,7 +504,8 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - pass + if isinstance(data, (set, frozenset)): + data = list(data) elif is_ea_or_datetimelike_dtype(data_dtype): pass @@ -4152,7 +4153,8 @@ def reindex( preserve_names = not hasattr(target, "name") # GH7774: preserve dtype/tz if target is empty and not an Index. - target = ensure_has_len(target) # target may be an iterator + if is_iterator(target): + target = list(target) if not isinstance(target, Index) and len(target) == 0: if level is not None and self._is_multi: @@ -6261,7 +6263,11 @@ def _should_compare(self, other: Index) -> bool: return False dtype = _unpack_nested_dtype(other) - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + return ( + self._is_comparable_dtype(dtype) + or is_object_dtype(dtype) + or is_string_dtype(dtype) + ) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ @@ -6877,6 +6883,9 @@ def insert(self, loc: int, item) -> Index: # We cannot keep the same dtype, so cast to the (often object) # minimal shared dtype before doing the insert. dtype = self._find_common_type_compat(item) + if dtype == self.dtype: + # EA's might run into recursion errors if loc is invalid + raise return self.astype(dtype).insert(loc, item) if arr.dtype != object or not isinstance( @@ -7560,21 +7569,9 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: return Index(index_like, copy=copy) -def ensure_has_len(seq): - """ - If seq is an iterator, put its values into a list. - """ - try: - len(seq) - except TypeError: - return list(seq) - else: - return seq - - def trim_front(strings: list[str]) -> list[str]: """ - Trims zeros and decimal points. + Trims leading spaces evenly among all strings. Examples -------- @@ -7586,8 +7583,9 @@ def trim_front(strings: list[str]) -> list[str]: """ if not strings: return strings - while all(strings) and all(x[0] == " " for x in strings): - strings = [x[1:] for x in strings] + smallest_leading_space = min(len(x) - len(x.lstrip()) for x in strings) + if smallest_leading_space > 0: + strings = [x[smallest_leading_space:] for x in strings] return strings diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3b3cda8f7cd33..536f22d38468d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -875,6 +875,7 @@ def date_range( Returns ------- DatetimeIndex + A DatetimeIndex object of the generated dates. 
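On the `pandas/core/indexes/base.py` hunks above: `reindex` still accepts iterator targets, with the removed `ensure_has_len` helper replaced by an explicit `is_iterator` check. The observable behavior is unchanged:

```python
import pandas as pd

idx = pd.Index(["a", "b", "c"])

# An iterator target is materialized to a list before reindexing.
new_index, indexer = idx.reindex(iter(["b", "d"]))
print(new_index)  # Index(['b', 'd'], dtype='object')
print(indexer)    # [ 1 -1]: "d" is missing from the original index
```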
See Also -------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 359cdf880937b..8feac890883eb 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -51,6 +51,7 @@ is_number, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -712,7 +713,7 @@ def _get_indexer( # left/right get_indexer, compare elementwise, equality -> match indexer = self._get_indexer_unique_sides(target) - elif not is_object_dtype(target.dtype): + elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)): # homogeneous scalar index: use IntervalTree # we should always have self._should_partial_index(target) here target = self._maybe_convert_i8(target) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b5f05ef0ab78f..377406e24b1d3 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -563,6 +563,14 @@ def period_range( Returns ------- PeriodIndex + A PeriodIndex of fixed frequency periods. + + See Also + -------- + date_range : Returns a fixed frequency DatetimeIndex. + Period : Represents a period of time. + PeriodIndex : Immutable ndarray holding ordinal values indicating regular periods + in time. Notes ----- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b11ce6bd7b919..dc96d1c11db74 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -295,6 +295,16 @@ def start(self) -> int: """ The value of the `start` parameter (``0`` if this was not supplied). + This property returns the starting value of the `RangeIndex`. If the `start` + value is not explicitly provided during the creation of the `RangeIndex`, + it defaults to 0. + + See Also + -------- + RangeIndex : Immutable index implementing a range-based index. + RangeIndex.stop : Returns the stop value of the `RangeIndex`. + RangeIndex.step : Returns the step value of the `RangeIndex`. + Examples -------- >>> idx = pd.RangeIndex(5) @@ -313,6 +323,17 @@ def stop(self) -> int: """ The value of the `stop` parameter. + This property returns the `stop` value of the RangeIndex, which defines the + upper (or lower, in case of negative steps) bound of the index range. The + `stop` value is exclusive, meaning the RangeIndex includes values up to but + not including this value. + + See Also + -------- + RangeIndex : Immutable index representing a range of integers. + RangeIndex.start : The start value of the RangeIndex. + RangeIndex.step : The step size between elements in the RangeIndex. + Examples -------- >>> idx = pd.RangeIndex(5) @@ -330,6 +351,15 @@ def step(self) -> int: """ The value of the `step` parameter (``1`` if this was not supplied). + The ``step`` parameter determines the increment (or decrement in the case + of negative values) between consecutive elements in the ``RangeIndex``. + + See Also + -------- + RangeIndex : Immutable index implementing a range-based index. + RangeIndex.stop : Returns the stop value of the RangeIndex. + RangeIndex.start : Returns the start value of the RangeIndex. 
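The new `RangeIndex.start`/`stop`/`step` prose and the `period_range` See Also above document long-standing behavior; a compact demo (the `step` docstring's examples follow below):

```python
import pandas as pd

ridx = pd.RangeIndex(start=2, stop=10, step=2)
print(list(ridx))                        # [2, 4, 6, 8]
print(ridx.start, ridx.stop, ridx.step)  # 2 10 2 (stop is exclusive)

pidx = pd.period_range(start="2024-01", periods=3, freq="M")
print(pidx)  # PeriodIndex(['2024-01', '2024-02', '2024-03'], dtype='period[M]')
```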
+
         Examples
         --------
         >>> idx = pd.RangeIndex(5)
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index dced92ba04520..cb40e920149fa 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -915,7 +915,7 @@ def _replace_coerce(
                 nb = nb.copy()
                 putmask_inplace(nb.values, mask, value)
                 return [nb]
-            return [self]
+            return [self.copy(deep=False)]
         return self.replace(
             to_replace=to_replace,
             value=value,
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 07465e7b87fcd..959e572b2b35b 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -750,7 +750,8 @@ def to_arrays(
 
     elif isinstance(data, np.ndarray) and data.dtype.names is not None:
         # e.g. recarray
-        columns = Index(list(data.dtype.names))
+        if columns is None:
+            columns = Index(data.dtype.names)
         arrays = [data[k] for k in columns]
         return arrays, columns
 
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index b621fcf9a6415..711396096a5e3 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -529,6 +529,11 @@ def ffill(self, limit: int | None = None):
         """
         Forward fill the values.
 
+        This method fills missing values by propagating the last valid
+        observation forward, up to the next valid observation. It is commonly
+        used in time series analysis when resampling data to a higher frequency
+        (upsampling) and filling gaps in the resampled output.
+
         Parameters
         ----------
         limit : int, optional
@@ -536,7 +541,8 @@ def ffill(self, limit: int | None = None):
 
         Returns
         -------
-        An upsampled Series.
+        Series
+            The resampled data with missing values filled forward.
 
         See Also
         --------
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4f79e30f48f3c..bbcb6615aeefd 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -11,6 +11,7 @@
     Mapping,
     Sequence,
 )
+import functools
 import operator
 import sys
 from textwrap import dedent
@@ -580,8 +581,15 @@ def __arrow_c_stream__(self, requested_schema=None):
         PyCapsule
         """
         pa = import_optional_dependency("pyarrow", min_version="16.0.0")
-        ca = pa.chunked_array([pa.Array.from_pandas(self, type=requested_schema)])
-        return ca.__arrow_c_stream__(requested_schema)
+        type = (
+            pa.DataType._import_from_c_capsule(requested_schema)
+            if requested_schema is not None
+            else None
+        )
+        ca = pa.array(self, type=type)
+        if not isinstance(ca, pa.ChunkedArray):
+            ca = pa.chunked_array([ca])
+        return ca.__arrow_c_stream__()
 
     # ----------------------------------------------------------------------
 
@@ -4305,6 +4313,7 @@ def map(
         self,
         arg: Callable | Mapping | Series,
         na_action: Literal["ignore"] | None = None,
+        **kwargs,
     ) -> Series:
         """
         Map values of Series according to an input mapping or function.
@@ -4320,6 +4329,11 @@ def map(
         na_action : {None, 'ignore'}, default None
             If 'ignore', propagate NaN values, without passing them to the
             mapping correspondence.
+        **kwargs
+            Additional keyword arguments to pass as keyword arguments to
+            `arg`.
+
+            .. 
versionadded:: 3.0.0 Returns ------- @@ -4381,6 +4395,8 @@ def map( 3 I am a rabbit dtype: object """ + if callable(arg): + arg = functools.partial(arg, **kwargs) new_values = self._map_values(arg, na_action=na_action) return self._constructor(new_values, index=self.index, copy=False).__finalize__( self, method="map" diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index cb0c3d241534c..81fa508ae6d23 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -65,9 +65,9 @@ Determine which axis to align the comparison on. * 0, or 'index' : Resulting differences are stacked vertically - with rows drawn alternately from self and other. + with rows drawn alternately from self and other. * 1, or 'columns' : Resulting differences are aligned horizontally - with columns drawn alternately from self and other. + with columns drawn alternately from self and other. keep_shape : bool, default False If true, all rows and columns are kept. diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index bdb88e981bcda..10117aa6bf503 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, + is_extension_array_dtype, is_integer, is_list_like, is_object_dtype, @@ -54,6 +55,8 @@ Iterator, ) + from pandas._typing import NpDtype + from pandas import ( DataFrame, Index, @@ -2431,7 +2434,11 @@ def wrap( return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep: str = "|"): + def get_dummies( + self, + sep: str = "|", + dtype: NpDtype | None = None, + ): """ Return DataFrame of dummy/indicator variables for Series. @@ -2442,6 +2449,8 @@ def get_dummies(self, sep: str = "|"): ---------- sep : str, default "|" String to split on. + dtype : dtype, default np.int64 + Data type for new columns. Only a single dtype is allowed. Returns ------- @@ -2466,10 +2475,24 @@ def get_dummies(self, sep: str = "|"): 0 1 1 0 1 0 0 0 2 1 0 1 + + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool) + a b c + 0 True True False + 1 False False False + 2 True False True """ + from pandas.core.frame import DataFrame + # we need to cast to Series of strings as only that has all # methods available for making the dummies... - result, name = self._data.array._str_get_dummies(sep) + result, name = self._data.array._str_get_dummies(sep, dtype) + if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): + return self._wrap_result( + DataFrame(result, columns=name, dtype=dtype), + name=name, + returns_string=False, + ) return self._wrap_result( result, name=name, @@ -3420,10 +3443,10 @@ def casefold(self): Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. - + """ + _shared_docs["isalpha"] = """ See Also -------- - Series.str.isalpha : Check whether all characters are alphabetic. Series.str.isnumeric : Check whether all characters are numeric. Series.str.isalnum : Check whether all characters are alphanumeric. Series.str.isdigit : Check whether all characters are digits. @@ -3435,24 +3458,56 @@ def casefold(self): Examples -------- - **Checks for Alphabetic and Numeric Characters** >>> s1 = pd.Series(['one', 'one1', '1', '']) - >>> s1.str.isalpha() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isnumeric"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. 
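Stepping back to the `Series.map` hunk above (the `isnumeric` See Also list continues below): the new `**kwargs` are bound onto a callable `arg` with `functools.partial`, and the `versionadded` note marks this for pandas 3.0. A sketch assuming a build with this patch applied:

```python
import pandas as pd

def scale(x, factor):
    return x * factor

s = pd.Series([1, 2, 3])

# factor=10 is bound via functools.partial before mapping each value.
print(s.map(scale, factor=10))  # 10, 20, 30
```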
+        Series.str.isalnum : Check whether all characters are alphanumeric.
+        Series.str.isdigit : Check whether all characters are digits.
+        Series.str.isdecimal : Check whether all characters are decimal.
+        Series.str.isspace : Check whether all characters are whitespace.
+        Series.str.islower : Check whether all characters are lowercase.
+        Series.str.isupper : Check whether all characters are uppercase.
+        Series.str.istitle : Check whether all characters are titlecase.
 
+        Examples
+        --------
+        The ``s1.str.isnumeric`` method is the same as ``s1.str.isdigit`` but
+        also includes other characters that can represent quantities such as
+        unicode fractions.
+
+        >>> s1 = pd.Series(['one', 'one1', '1', ''])
         >>> s1.str.isnumeric()
         0    False
         1    False
         2     True
         3    False
         dtype: bool
+        """
 
+    _shared_docs["isalnum"] = """
+        See Also
+        --------
+        Series.str.isalpha : Check whether all characters are alphabetic.
+        Series.str.isnumeric : Check whether all characters are numeric.
+        Series.str.isdigit : Check whether all characters are digits.
+        Series.str.isdecimal : Check whether all characters are decimal.
+        Series.str.isspace : Check whether all characters are whitespace.
+        Series.str.islower : Check whether all characters are lowercase.
+        Series.str.isupper : Check whether all characters are uppercase.
+        Series.str.istitle : Check whether all characters are titlecase.
 
+        Examples
+        --------
+        >>> s1 = pd.Series(['one', 'one1', '1', ''])
         >>> s1.str.isalnum()
         0     True
         1     True
         2     True
         3    False
         dtype: bool
@@ -3469,47 +3524,72 @@ def casefold(self):
         1    False
         2    False
         dtype: bool
+        """
 
-        **More Detailed Checks for Numeric Characters**
-
-        There are several different but overlapping sets of numeric characters that
-        can be checked for.
+    _shared_docs["isdecimal"] = """
+        See Also
+        --------
+        Series.str.isalpha : Check whether all characters are alphabetic.
+        Series.str.isnumeric : Check whether all characters are numeric.
+        Series.str.isalnum : Check whether all characters are alphanumeric.
+        Series.str.isdigit : Check whether all characters are digits.
+        Series.str.isspace : Check whether all characters are whitespace.
+        Series.str.islower : Check whether all characters are lowercase.
+        Series.str.isupper : Check whether all characters are uppercase.
+        Series.str.istitle : Check whether all characters are titlecase.
 
+        Examples
+        --------
+        The ``s3.str.isdecimal`` method checks for characters used to form
+        numbers in base 10.
 
         >>> s3 = pd.Series(['23', '³', '⅕', ''])
-
-        The ``s3.str.isdecimal`` method checks for characters used to form numbers
-        in base 10.
-
         >>> s3.str.isdecimal()
         0     True
         1    False
         2    False
         3    False
         dtype: bool
+        """
 
+    _shared_docs["isdigit"] = """
+        See Also
+        --------
+        Series.str.isalpha : Check whether all characters are alphabetic.
+        Series.str.isnumeric : Check whether all characters are numeric.
+        Series.str.isalnum : Check whether all characters are alphanumeric.
+        Series.str.isdecimal : Check whether all characters are decimal.
+        Series.str.isspace : Check whether all characters are whitespace.
+        Series.str.islower : Check whether all characters are lowercase.
+        Series.str.isupper : Check whether all characters are uppercase.
+        Series.str.istitle : Check whether all characters are titlecase.
 
-        The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
-        includes special digits, like superscripted and subscripted digits in
-        unicode.
+        Examples
+        --------
+        Similar to ``str.isdecimal`` but also includes special digits, like
+        superscripted and subscripted digits in unicode.
+
+        >>> s3 = pd.Series(['23', '³', '⅕', ''])
         >>> s3.str.isdigit()
         0     True
         1     True
         2    False
         3    False
         dtype: bool
+        """
 
-        The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
-        includes other characters that can represent quantities such as unicode
-        fractions.
-
-        >>> s3.str.isnumeric()
-        0     True
-        1     True
-        2     True
-        3    False
-        dtype: bool
+    _shared_docs["isspace"] = """
+        See Also
+        --------
+        Series.str.isalpha : Check whether all characters are alphabetic.
+        Series.str.isnumeric : Check whether all characters are numeric.
+        Series.str.isalnum : Check whether all characters are alphanumeric.
+        Series.str.isdigit : Check whether all characters are digits.
+        Series.str.isdecimal : Check whether all characters are decimal.
+        Series.str.islower : Check whether all characters are lowercase.
+        Series.str.isupper : Check whether all characters are uppercase.
+        Series.str.istitle : Check whether all characters are titlecase.
 
-        **Checks for Whitespace**
+        Examples
+        --------
 
         >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
         >>> s4.str.isspace()
         0     True
         1     True
         2    False
         dtype: bool
+        """
 
+    _shared_docs["islower"] = """
+        See Also
+        --------
+        Series.str.isalpha : Check whether all characters are alphabetic.
+        Series.str.isnumeric : Check whether all characters are numeric.
+        Series.str.isalnum : Check whether all characters are alphanumeric.
+        Series.str.isdigit : Check whether all characters are digits.
+        Series.str.isdecimal : Check whether all characters are decimal.
+        Series.str.isspace : Check whether all characters are whitespace.
+        Series.str.isupper : Check whether all characters are uppercase.
+        Series.str.istitle : Check whether all characters are titlecase.
 
-        **Checks for Character Case**
+        Examples
+        --------
 
         >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
-
         >>> s5.str.islower()
         0     True
         1    False
         2    False
         3    False
         dtype: bool
+        """
+
+    _shared_docs["isupper"] = """
+        See Also
+        --------
+        Series.str.isalpha : Check whether all characters are alphabetic.
+        Series.str.isnumeric : Check whether all characters are numeric.
+        Series.str.isalnum : Check whether all characters are alphanumeric.
+        Series.str.isdigit : Check whether all characters are digits.
+        Series.str.isdecimal : Check whether all characters are decimal.
+        Series.str.isspace : Check whether all characters are whitespace.
+        Series.str.islower : Check whether all characters are lowercase.
+        Series.str.istitle : Check whether all characters are titlecase.
+
+        Examples
+        --------
+        >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
         >>> s5.str.isupper()
         0    False
         1    False
         2     True
         3    False
         dtype: bool
+        """
 
+    _shared_docs["istitle"] = """
+        See Also
+        --------
+        Series.str.isalpha : Check whether all characters are alphabetic.
+        Series.str.isnumeric : Check whether all characters are numeric.
+        Series.str.isalnum : Check whether all characters are alphanumeric.
+        Series.str.isdigit : Check whether all characters are digits.
+        Series.str.isdecimal : Check whether all characters are decimal.
+        Series.str.isspace : Check whether all characters are whitespace.
+        Series.str.islower : Check whether all characters are lowercase.
+        Series.str.isupper : Check whether all characters are uppercase.
 
+        Examples
+        --------
         The ``s5.str.istitle`` method checks for whether all words are in title
         case (whether only the first letter of each word is capitalized). Words are
         assumed to be as any sequence of non-numeric characters separated by
         whitespace characters.
+ >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s5.str.istitle() 0 False 1 True @@ -3560,31 +3684,49 @@ def casefold(self): # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) isalnum = _map_and_wrap( - "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + "isalnum", + docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + + _shared_docs["isalnum"], ) isalpha = _map_and_wrap( - "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + "isalpha", + docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + + _shared_docs["isalpha"], ) isdigit = _map_and_wrap( - "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + "isdigit", + docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + + _shared_docs["isdigit"], ) isspace = _map_and_wrap( - "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + "isspace", + docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + + _shared_docs["isspace"], ) islower = _map_and_wrap( - "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"] + "islower", + docstring=_shared_docs["ismethods"] % _doc_args["islower"] + + _shared_docs["islower"], ) isupper = _map_and_wrap( - "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + "isupper", + docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + + _shared_docs["isupper"], ) istitle = _map_and_wrap( - "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + "istitle", + docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + + _shared_docs["istitle"], ) isnumeric = _map_and_wrap( - "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + "isnumeric", + docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + + _shared_docs["isnumeric"], ) isdecimal = _map_and_wrap( - "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + "isdecimal", + docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + + _shared_docs["isdecimal"], ) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 1281a03e297f9..97d906e3df077 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -16,6 +16,7 @@ import re from pandas._typing import ( + NpDtype, Scalar, Self, ) @@ -163,7 +164,7 @@ def _str_wrap(self, width: int, **kwargs): pass @abc.abstractmethod - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): pass @abc.abstractmethod diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c6b18d7049c57..6211c7b528db9 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -18,6 +18,7 @@ import pandas._libs.ops as libops from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.missing import isna from pandas.core.strings.base import BaseStringArrayMethods @@ -398,9 +399,11 @@ def _str_wrap(self, width: int, **kwargs): tw = textwrap.TextWrapper(**kwargs) return self._str_map(lambda s: "\n".join(tw.wrap(s))) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): from pandas import Series + if dtype is None: + dtype = np.int64 arr = Series(self).fillna("") try: arr = sep + arr + sep @@ -412,7 +415,13 @@ def _str_get_dummies(self, sep: str = "|"): tags.update(ts) tags2 = sorted(tags - {""}) - dummies = 
np.empty((len(arr), len(tags2)), dtype=np.int64) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) def _isin(test_elements: str, element: str) -> bool: return element in test_elements diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 982851d0557c3..f159babb7e018 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -11,6 +11,10 @@ lib, missing as libmissing, ) +from pandas._libs.tslibs import ( + Timedelta, + Timestamp, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -189,6 +193,8 @@ def to_numeric( return float(arg) if is_number(arg): return arg + if isinstance(arg, (Timedelta, Timestamp)): + return arg._value is_scalars = True values = np.array([arg], dtype="O") elif getattr(arg, "ndim", 1) > 1: diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 2f625090e0492..46e090cc3a589 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -205,6 +205,17 @@ class EmptyDataError(ValueError): """ Exception raised in ``pd.read_csv`` when empty data or header is encountered. + This error is typically encountered when attempting to read an empty file or + an invalid file where no data or headers are present. + + See Also + -------- + read_csv : Read a comma-separated values (CSV) file into DataFrame. + errors.ParserError : Exception that is raised by an error encountered in parsing + file contents. + errors.DtypeWarning : Warning raised when reading different dtypes in a column + from a file. + Examples -------- >>> from io import StringIO @@ -261,6 +272,11 @@ class MergeError(ValueError): Subclass of ``ValueError``. + See Also + -------- + DataFrame.join : For joining DataFrames on their indexes. + merge : For merging two DataFrames on a common set of keys. + Examples -------- >>> left = pd.DataFrame( @@ -599,6 +615,16 @@ class ClosedFileError(Exception): """ Exception is raised when trying to perform an operation on a closed HDFStore file. + ``ClosedFileError`` is specific to operations on ``HDFStore`` objects. Once an + HDFStore is closed, its resources are no longer available, and any further attempt + to access data or perform file operations will raise this exception. + + See Also + -------- + HDFStore.close : Closes the PyTables file handle. + HDFStore.open : Opens the file in the specified mode. + HDFStore.is_open : Returns a boolean indicating whether the file is open. 
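The `to_numeric` hunk above adds a scalar fast path: `Timestamp`/`Timedelta` inputs short-circuit to their integer `._value`, the raw count in the object's own time resolution. A sketch assuming the patched build:

```python
import pandas as pd

ts = pd.Timestamp("2024-01-01")
td = pd.Timedelta(days=1)

# Each prints the underlying integer count rather than raising.
print(pd.to_numeric(ts))
print(pd.to_numeric(td))
```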
+
     Examples
     --------
     >>> store = pd.HDFStore("my-store", "a")  # doctest: +SKIP
diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py
index adaeed017d7bf..fdea1831d5596 100644
--- a/pandas/io/formats/html.py
+++ b/pandas/io/formats/html.py
@@ -195,6 +195,8 @@ def _write_cell(
             esc = {}
 
         rs = pprint_thing(s, escape_chars=esc).strip()
+        # replace spaces between strings with non-breaking spaces
+        rs = rs.replace(" ", "  ")
 
         if self.render_links and is_url(rs):
             rs_unescaped = pprint_thing(s, escape_chars={}).strip()
diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py
index 8a6383f7e8f82..08d9fd938c873 100644
--- a/pandas/io/formats/style_render.py
+++ b/pandas/io/formats/style_render.py
@@ -906,9 +906,9 @@ def concatenated_visible_rows(obj):
             row_body_headers = [
                 {
                     **col,
-                    "display_value": col["display_value"]
-                    if col["is_visible"]
-                    else "",
+                    "display_value": (
+                        col["display_value"] if col["is_visible"] else ""
+                    ),
                     "cellstyle": self.ctx_index[r, c],
                 }
                 for c, col in enumerate(row[:index_levels])
@@ -2069,18 +2069,18 @@ def maybe_convert_css_to_tuples(style: CSSProperties) -> CSSList:
         ('border','1px solid red')]
     """
     if isinstance(style, str):
-        s = style.split(";")
-        try:
-            return [
-                (x.split(":")[0].strip(), x.split(":")[1].strip())
-                for x in s
-                if x.strip() != ""
-            ]
-        except IndexError as err:
+        if style and ":" not in style:
             raise ValueError(
                 "Styles supplied as string must follow CSS rule formats, "
                 f"for example 'attr: val;'. '{style}' was given."
-            ) from err
+            )
+        s = style.split(";")
+        return [
+            (x.split(":")[0].strip(), ":".join(x.split(":")[1:]).strip())
+            for x in s
+            if x.strip() != ""
+        ]
+
     return style
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index d077b9e0c4568..e9c9f5ba225a5 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -1168,6 +1168,7 @@ def _try_convert_data(
         """
         Try to parse a Series into a column by inferring dtype.
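On the `style_render.py` hunk above (the `read_json` hunk continues below): joining everything after the first `:` back together means CSS values may themselves contain colons, and a non-empty string without any `:` now fails fast. A sketch against the patched private helper (internal API, subject to change):

```python
from pandas.io.formats.style_render import maybe_convert_css_to_tuples

# The "https:" inside the value no longer truncates it under the patch.
css = "background: url(https://example.com/bg.png); color: red"
print(maybe_convert_css_to_tuples(css))
# [('background', 'url(https://example.com/bg.png)'), ('color', 'red')]
```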
""" + org_data = data # don't try to coerce, unless a force conversion if use_dtypes: if not self.dtype: @@ -1222,7 +1223,7 @@ def _try_convert_data( if len(data) and data.dtype in ("float", "object"): # coerce ints if we can try: - new_data = data.astype("int64") + new_data = org_data.astype("int64") if (new_data == data).all(): data = new_data converted = True diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 2916e4d98cce4..ffc2690a5efdf 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1648,7 +1648,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = T if keep_default_na: v = set(v) | STR_NA_VALUES - na_values[k] = v + na_values[k] = _stringify_na_values(v, floatify) na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} else: if not is_list_like(na_values): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index be7b8dc6640ba..618254fee9259 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3580,7 +3580,7 @@ def is_transposed(self) -> bool: @property def data_orientation(self) -> tuple[int, ...]: - """return a tuple of my permuted axes, non_indexable at the front""" + """return a tuple of my permutated axes, non_indexable at the front""" return tuple( itertools.chain( [int(a[0]) for a in self.non_index_axes], diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4be06f93689f2..6b988d8fed6bf 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2004,6 +2004,16 @@ def data_label(self) -> str: """ Return data label of Stata file. + The data label is a descriptive string associated with the dataset + stored in the Stata file. This property provides access to that + label, if one is present. + + See Also + -------- + io.stata.StataReader.variable_labels : Return a dict associating each variable + name with corresponding label. + DataFrame.to_stata : Export DataFrame object to Stata dta format. 
+ Examples -------- >>> df = pd.DataFrame([(1,)], columns=["variable"]) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3be3562d23cd6..f0ab01e9e960e 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -65,7 +65,6 @@ def test_apply(float_frame, engine, request): assert result.index is float_frame.index -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) @pytest.mark.parametrize("nopython", [True, False]) @@ -742,8 +741,9 @@ def test_apply_category_equalness(val): result = df.a.apply(lambda x: x == val) expected = Series( - [np.nan if pd.isnull(x) else x == val for x in df_values], name="a" + [False if pd.isnull(x) else x == val for x in df_values], name="a" ) + # False since behavior of NaN for categorical dtype has been changed (GH 59966) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index d86eeadbaa0fe..d6cd9c321ace6 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,10 +1,9 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, Index, @@ -19,7 +18,6 @@ def apply_axis(request): return request.param -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) @@ -29,11 +27,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis): def test_numba_vs_python_string_index(): # GH#56189 - pytest.importorskip("pyarrow") df = DataFrame( 1, - index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)), ) func = lambda x: x result = df.apply(func, engine="numba", axis=0) @@ -43,7 +40,6 @@ def test_numba_vs_python_string_index(): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 76704de6f2d10..9541b0b7495c7 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -236,10 +236,10 @@ def test_apply_categorical_with_nan_values(series, by_row): with pytest.raises(AttributeError, match=msg): s.apply(lambda x: x.split("-")[0], by_row=by_row) return - - result = s.apply(lambda x: x.split("-")[0], by_row=by_row) + # NaN for cat dtype fixed in (GH 59966) + result = s.apply(lambda x: x.split("-")[0] if pd.notna(x) else False, by_row=by_row) result = result.astype(object) - expected = Series(["1", "1", np.nan], dtype="category") + expected = Series(["1", "1", False], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index d7264c002c67f..74cc3e991bb76 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -28,10 +28,12 @@ 
def test_mixing_naive_tzaware_raises(self, meth): # GH#24569 arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]) - msg = ( - "Cannot mix tz-aware with tz-naive values|" - "Tz-aware datetime.datetime cannot be converted " - "to datetime64 unless utc=True" + msg = "|".join( + [ + "Cannot mix tz-aware with tz-naive values", + "Tz-aware datetime.datetime cannot be converted " + "to datetime64 unless utc=True", + ] ) for obj in [arr, arr[::-1]]: @@ -63,10 +65,10 @@ def test_bool_dtype_raises(self): def test_copy(self): data = np.array([1, 2, 3], dtype="M8[ns]") - arr = DatetimeArray._from_sequence(data, copy=False) + arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=False) assert arr._ndarray is data - arr = DatetimeArray._from_sequence(data, copy=True) + arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=True) assert arr._ndarray is not data def test_numpy_datetime_unit(self, unit): @@ -163,7 +165,9 @@ def test_from_arrow_from_empty(unit, tz): dtype = DatetimeTZDtype(unit=unit, tz=tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence(np.array(data, dtype=f"datetime64[{unit}]")) + expected = DatetimeArray._from_sequence( + np.array(data, dtype=f"datetime64[{unit}]"), dtype=np.dtype(f"M8[{unit}]") + ) expected = expected.tz_localize(tz=tz) tm.assert_extension_array_equal(result, expected) @@ -179,7 +183,9 @@ def test_from_arrow_from_integers(): dtype = DatetimeTZDtype(unit="ns", tz="UTC") result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]")) + expected = DatetimeArray._from_sequence( + np.array(data, dtype="datetime64[ns]"), dtype=np.dtype("M8[ns]") + ) expected = expected.tz_localize("UTC") tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ccf644b34051d..752ebe194ffcf 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -68,11 +68,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["0.1", "0.2", ""], dtype="U32") diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index fadd7ac67b58d..7972ba7b9fb0f 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -281,11 +281,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") diff --git a/pandas/tests/arrays/interval/test_interval.py 
diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py
index 58ba340441d86..8e13dcf25ceba 100644
--- a/pandas/tests/arrays/interval/test_interval.py
+++ b/pandas/tests/arrays/interval/test_interval.py
@@ -222,9 +222,10 @@ def test_min_max(self, left_right_dtypes, index_or_series_or_array):
         res = arr_na.max(skipna=False)
         assert np.isnan(res)

-        res = arr_na.min(skipna=True)
-        assert res == MIN
-        assert type(res) == type(MIN)
-        res = arr_na.max(skipna=True)
-        assert res == MAX
-        assert type(res) == type(MAX)
+        for kws in [{"skipna": True}, {}]:
+            res = arr_na.min(**kws)
+            assert res == MIN
+            assert type(res) == type(MIN)
+            res = arr_na.max(**kws)
+            assert res == MAX
+            assert type(res) == type(MAX)
diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py
index bd3298940ae3a..08bfd5b69fdd9 100644
--- a/pandas/tests/arrays/sparse/test_accessor.py
+++ b/pandas/tests/arrays/sparse/test_accessor.py
@@ -252,3 +252,7 @@ def test_with_column_named_sparse(self):
         # https://github.com/pandas-dev/pandas/issues/30758
         df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])})
         assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor)
+
+    def test_subclassing(self):
+        df = tm.SubclassedDataFrame({"sparse": pd.arrays.SparseArray([1, 2])})
+        assert isinstance(df.sparse.to_dense(), tm.SubclassedDataFrame)
diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py
index 83a507e679d46..e6e4a11a0f5ab 100644
--- a/pandas/tests/arrays/sparse/test_astype.py
+++ b/pandas/tests/arrays/sparse/test_astype.py
@@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype):
         ),
         (
             SparseArray([0, 1, 10]),
-            str,
-            SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")),
+            np.str_,
+            SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")),
         ),
         (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])),
         (
diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py
index 1819744d9a9ae..6143163735ab8 100644
--- a/pandas/tests/arrays/sparse/test_dtype.py
+++ b/pandas/tests/arrays/sparse/test_dtype.py
@@ -184,7 +184,7 @@ def test_construct_from_string_fill_value_raises(string):
     [
         (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
         (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
-        (SparseDtype(int, 1), str, SparseDtype(object, "1")),
+        (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")),
         (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
     ],
 )
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index dd87dbf8e9a43..33708be497f31 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -30,6 +30,12 @@ def dtype(string_dtype_arguments):
     return pd.StringDtype(storage=storage, na_value=na_value)


+@pytest.fixture
+def dtype2(string_dtype_arguments2):
+    storage, na_value = string_dtype_arguments2
+    return pd.StringDtype(storage=storage, na_value=na_value)
+
+
 @pytest.fixture
 def cls(dtype):
     """Fixture giving array type from parametrized 'dtype'"""
@@ -102,10 +108,7 @@ def test_setitem_validates(cls, dtype):
     with pytest.raises(TypeError, match=msg):
         arr[0] = 10

-    if dtype.storage == "python":
-        msg = "Must provide strings."
-    else:
-        msg = "Scalar must be NA or str"
+    msg = "Must provide strings"
     with pytest.raises(TypeError, match=msg):
         arr[:] = np.array([1, 2])
@@ -665,11 +668,7 @@ def test_isin(dtype, fixed_now_ts):
     tm.assert_series_equal(result, expected)

     result = s.isin(["a", pd.NA])
-    if dtype.storage == "python" and dtype.na_value is np.nan:
-        # TODO(infer_string) we should make this consistent
-        expected = pd.Series([True, False, False])
-    else:
-        expected = pd.Series([True, False, True])
+    expected = pd.Series([True, False, True])
     tm.assert_series_equal(result, expected)

     result = s.isin([])
@@ -680,6 +679,35 @@ def test_isin(dtype, fixed_now_ts):
     expected = pd.Series([True, False, False])
     tm.assert_series_equal(result, expected)

+    result = s.isin([fixed_now_ts])
+    expected = pd.Series([False, False, False])
+    tm.assert_series_equal(result, expected)
+
+
+def test_isin_string_array(dtype, dtype2):
+    s = pd.Series(["a", "b", None], dtype=dtype)
+
+    result = s.isin(pd.array(["a", "c"], dtype=dtype2))
+    expected = pd.Series([True, False, False])
+    tm.assert_series_equal(result, expected)
+
+    result = s.isin(pd.array(["a", None], dtype=dtype2))
+    expected = pd.Series([True, False, True])
+    tm.assert_series_equal(result, expected)
+
+
+def test_isin_arrow_string_array(dtype):
+    pa = pytest.importorskip("pyarrow")
+    s = pd.Series(["a", "b", None], dtype=dtype)
+
+    result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string())))
+    expected = pd.Series([True, False, False])
+    tm.assert_series_equal(result, expected)
+
+    result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string())))
+    expected = pd.Series([True, False, True])
+    tm.assert_series_equal(result, expected)
+

 def test_setitem_scalar_with_mask_validation(dtype):
     # https://github.com/pandas-dev/pandas/issues/47628
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index b042cf632288b..d4363171788d4 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises():
         arr[[0, 1]] = ["foo", "bar", "baz"]


-@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"])
-def test_pickle_roundtrip(dtype):
+@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
+def test_pickle_roundtrip(na_value):
     # GH 42600
     pytest.importorskip("pyarrow")
+    dtype = StringDtype("pyarrow", na_value=na_value)
     expected = pd.Series(range(10), dtype=dtype)
     expected_sliced = expected.head(2)
     full_pickled = pickle.dumps(expected)
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
index 4070a2844846f..3c0ef1e4d928b 100644
--- a/pandas/tests/arrays/test_array.py
+++ b/pandas/tests/arrays/test_array.py
@@ -370,11 +370,15 @@ def test_array_copy():
     ),
     (
         np.array([1, 2], dtype="m8[ns]"),
-        TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")),
+        TimedeltaArray._from_sequence(
+            np.array([1, 2], dtype="m8[ns]"), dtype=np.dtype("m8[ns]")
+        ),
     ),
     (
         np.array([1, 2], dtype="m8[us]"),
-        TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")),
+        TimedeltaArray._from_sequence(
+            np.array([1, 2], dtype="m8[us]"), dtype=np.dtype("m8[us]")
+        ),
     ),
     # integer
     ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
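
Note: the test_string.py hunks above make Series.isin consistent across the string dtypes — pd.NA/None in the values now matches missing entries for every storage and NA-value combination, including values held in a differently-parametrized string array or an Arrow-backed one. Sketch:

    import pandas as pd

    s = pd.Series(["a", "b", None], dtype="string")
    print(s.isin(["a", pd.NA]).tolist())  # [True, False, True] for all string dtypes
    print(s.isin(["a", "c"]).tolist())    # [True, False, False]
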
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 6dd1ef9d59ab4..0c8eefab95464 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -257,7 +257,8 @@ def test_fillna_method_doesnt_change_orig(self, method):
         if self.array_cls is PeriodArray:
             arr = self.array_cls(data, dtype="period[D]")
         else:
-            arr = self.array_cls._from_sequence(data)
+            dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]"
+            arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype))
         arr[4] = NaT

         fill_value = arr[3] if method == "pad" else arr[5]
@@ -273,7 +274,8 @@ def test_searchsorted(self):
         if self.array_cls is PeriodArray:
             arr = self.array_cls(data, dtype="period[D]")
         else:
-            arr = self.array_cls._from_sequence(data)
+            dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]"
+            arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype))

         # scalar
         result = arr.searchsorted(arr[1])
@@ -739,10 +741,10 @@ def test_array_i8_dtype(self, arr1d):
     def test_from_array_keeps_base(self):
         # Ensure that DatetimeArray._ndarray.base isn't lost.
         arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")
-        dta = DatetimeArray._from_sequence(arr)
+        dta = DatetimeArray._from_sequence(arr, dtype=arr.dtype)
         assert dta._ndarray is arr

-        dta = DatetimeArray._from_sequence(arr[:0])
+        dta = DatetimeArray._from_sequence(arr[:0], dtype=arr.dtype)
         assert dta._ndarray.base is arr

     def test_from_dti(self, arr1d):
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
index 8e348805de978..e3f49d04a0ff2 100644
--- a/pandas/tests/arrays/test_datetimes.py
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -499,7 +499,7 @@ def test_value_counts_preserves_tz(self):
     @pytest.mark.parametrize("method", ["pad", "backfill"])
     def test_fillna_preserves_tz(self, method):
         dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central")
-        arr = DatetimeArray._from_sequence(dti, copy=True)
+        arr = DatetimeArray._from_sequence(dti, dtype=dti.dtype, copy=True)
         arr[2] = pd.NaT
         fill_val = dti[1] if method == "pad" else dti[3]
@@ -665,7 +665,9 @@ def test_shift_fill_value(self):
         dti = pd.date_range("2016-01-01", periods=3)

         dta = dti._data
-        expected = DatetimeArray._from_sequence(np.roll(dta._ndarray, 1))
+        expected = DatetimeArray._from_sequence(
+            np.roll(dta._ndarray, 1), dtype=dti.dtype
+        )

         fv = dta[-1]
         for fill_value in [fv, fv.to_pydatetime(), fv.to_datetime64()]:
@@ -731,7 +733,11 @@ def test_iter_zoneinfo_fold(self, tz):
         )
         utc_vals *= 1_000_000_000

-        dta = DatetimeArray._from_sequence(utc_vals).tz_localize("UTC").tz_convert(tz)
+        dta = (
+            DatetimeArray._from_sequence(utc_vals, dtype=np.dtype("M8[ns]"))
+            .tz_localize("UTC")
+            .tz_convert(tz)
+        )

         left = dta[2]
         right = list(dta)[2]
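
Note: throughout the array tests above, the private _from_sequence constructors are now called with an explicit dtype instead of relying on inference. This is internal API, so the snippet below only mirrors the pattern the tests themselves use:

    import numpy as np
    from pandas.arrays import DatetimeArray, TimedeltaArray

    data = np.array([1, 2, 3], dtype="M8[ns]")
    dta = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=False)

    vals = np.array([0, 3_600_000_000_000], dtype="m8[ns]")
    tda = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)
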
diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py
index bcc52f197ee51..fb7c7afdc6ff9 100644
--- a/pandas/tests/arrays/test_timedeltas.py
+++ b/pandas/tests/arrays/test_timedeltas.py
@@ -263,10 +263,10 @@ def test_searchsorted_invalid_types(self, other, index):
 class TestUnaryOps:
     def test_abs(self):
         vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
-        arr = TimedeltaArray._from_sequence(vals)
+        arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)

         evals = np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
-        expected = TimedeltaArray._from_sequence(evals)
+        expected = TimedeltaArray._from_sequence(evals, dtype=evals.dtype)

         result = abs(arr)
         tm.assert_timedelta_array_equal(result, expected)
@@ -276,7 +276,7 @@ def test_abs(self):

     def test_pos(self):
         vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
-        arr = TimedeltaArray._from_sequence(vals)
+        arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)

         result = +arr
         tm.assert_timedelta_array_equal(result, arr)
@@ -288,7 +288,7 @@ def test_pos(self):

     def test_neg(self):
         vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
-        arr = TimedeltaArray._from_sequence(vals)
+        arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)

         evals = np.array([3600 * 10**9, "NaT", -7200 * 10**9], dtype="m8[ns]")
         expected = TimedeltaArray._from_sequence(evals)
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index 13a3ff048c79e..d8af7abe83084 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -333,7 +333,8 @@ def test_array_multiindex_raises():
     # Timedelta
     (
         TimedeltaArray._from_sequence(
-            np.array([0, 3600000000000], dtype="i8").view("m8[ns]")
+            np.array([0, 3600000000000], dtype="i8").view("m8[ns]"),
+            dtype=np.dtype("m8[ns]"),
         ),
         np.array([0, 3600000000000], dtype="m8[ns]"),
     ),
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index bbd9b150b88a8..7819b7b75f065 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -183,9 +183,7 @@ def test_access_by_position(index_flat):
     assert index[-1] == index[size - 1]

     msg = f"index {size} is out of bounds for axis 0 with size {size}"
-    if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
-        index.dtype, "string[pyarrow_numpy]"
-    ):
+    if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow":
         msg = "index out of bounds"
     with pytest.raises(IndexError, match=msg):
         index[size]
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
index 31d568d7c1e0c..3c0bf6c35866c 100644
--- a/pandas/tests/computation/test_eval.py
+++ b/pandas/tests/computation/test_eval.py
@@ -1998,3 +1998,11 @@ def test_validate_bool_args(value):
     msg = 'For argument "inplace" expected type bool, received type'
     with pytest.raises(ValueError, match=msg):
         pd.eval("2+2", inplace=value)
+
+
+@td.skip_if_no("numexpr")
+def test_eval_float_div_numexpr():
+    # GH 59736
+    result = pd.eval("1 / 2", engine="numexpr")
+    expected = 0.5
+    assert result == expected
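
Note: test_eval_float_div_numexpr above guards GH 59736, where division through the numexpr engine could return a truncated result; pd.eval is expected to perform true division. Usage (requires the optional numexpr package):

    import pandas as pd

    assert pd.eval("1 / 2", engine="numexpr") == 0.5
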
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index de56d5e4a07ee..80c30f2d0c26e 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -7,7 +7,6 @@
 from pandas.compat import HAS_PYARROW
 from pandas.compat.pyarrow import pa_version_under12p0
-import pandas.util._test_decorators as td

 from pandas import (
     DataFrame,
@@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype):
     tm.assert_frame_equal(df2, df_orig)


-def test_astype_string_copy_on_pickle_roundrip():
+def test_astype_str_copy_on_pickle_roundtrip():
+    # TODO(infer_string) this test can be removed after 3.0 (once str is the default)
     # https://github.com/pandas-dev/pandas/issues/54654
     # ensure_string_array may alter array inplace
     base = Series(np.array([(1, 2), None, 1], dtype="object"))
@@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip():
     tm.assert_series_equal(base, base_copy)


-@td.skip_if_no("pyarrow")
-def test_astype_string_read_only_on_pickle_roundrip():
+def test_astype_string_copy_on_pickle_roundtrip(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/issues/54654
+    # ensure_string_array may alter array inplace
+    base = Series(np.array([(1, 2), None, 1], dtype="object"))
+    base_copy = pickle.loads(pickle.dumps(base))
+    base_copy.astype(any_string_dtype)
+    tm.assert_series_equal(base, base_copy)
+
+
+def test_astype_string_read_only_on_pickle_roundtrip(any_string_dtype):
     # https://github.com/pandas-dev/pandas/issues/54654
     # ensure_string_array may alter read-only array inplace
     base = Series(np.array([(1, 2), None, 1], dtype="object"))
     base_copy = pickle.loads(pickle.dumps(base))
     base_copy._values.flags.writeable = False
-    base_copy.astype("string[pyarrow]")
+    base_copy.astype(any_string_dtype)
     tm.assert_series_equal(base, base_copy)
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
index 58c979fb05089..a8acd446ff5f5 100644
--- a/pandas/tests/copy_view/test_replace.py
+++ b/pandas/tests/copy_view/test_replace.py
@@ -286,6 +286,12 @@ def test_replace_list_none():
     assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

+    # replace multiple values that don't actually replace anything with None
+    # https://github.com/pandas-dev/pandas/issues/59770
+    df3 = df.replace(["d", "e", "f"], value=None)
+    tm.assert_frame_equal(df3, df_orig)
+    assert tm.shares_memory(get_array(df, "a"), get_array(df3, "a"))
+

 def test_replace_list_none_inplace_refs():
     df = DataFrame({"a": ["a", "b", "c"]})
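
Note: the test_replace_list_none addition covers GH 59770 — replacing a list of values that never occur, with value=None, must be a no-op that (under copy-on-write) keeps sharing memory with the original. Sketch:

    import pandas as pd

    df = pd.DataFrame({"a": ["a", "b", "c"]})
    df3 = df.replace(["d", "e", "f"], value=None)  # nothing matches
    assert df3.equals(df)  # unchanged; the backing array can stay shared
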
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 2c2dff7a957fe..e338fb1331734 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage):
             "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
         )

+    with pd.option_context("future.infer_string", True):
+        # with the default string_storage setting
+        result = pandas_dtype(str)
+        assert result == pd.StringDtype(
+            "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
+        )
+
     with pd.option_context("future.infer_string", True):
         with pd.option_context("string_storage", string_storage):
             result = pandas_dtype("str")
             assert result == pd.StringDtype(string_storage, na_value=np.nan)

+    with pd.option_context("future.infer_string", True):
+        with pd.option_context("string_storage", string_storage):
+            result = pandas_dtype(str)
+            assert result == pd.StringDtype(string_storage, na_value=np.nan)
+
     with pd.option_context("future.infer_string", False):
         with pd.option_context("string_storage", string_storage):
             result = pandas_dtype("str")
diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py
index 261f86bfb0326..2b90886a8d070 100644
--- a/pandas/tests/dtypes/test_generic.py
+++ b/pandas/tests/dtypes/test_generic.py
@@ -20,8 +20,8 @@ class TestABCClasses:
     df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index)
     sparse_array = pd.arrays.SparseArray(np.random.default_rng(2).standard_normal(10))

-    datetime_array = pd.core.arrays.DatetimeArray._from_sequence(datetime_index)
-    timedelta_array = pd.core.arrays.TimedeltaArray._from_sequence(timedelta_index)
+    datetime_array = datetime_index.array
+    timedelta_array = timedelta_index.array

     abc_pairs = [
         ("ABCMultiIndex", multi_index),
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index f86ed6f49759f..73c462d492d2d 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -1,4 +1,3 @@
-from contextlib import nullcontext
 from datetime import datetime
 from decimal import Decimal

@@ -7,7 +6,6 @@
 from pandas._libs import missing as libmissing
 from pandas._libs.tslibs import iNaT
-from pandas.compat.numpy import np_version_gte1p25

 from pandas.core.dtypes.common import (
     is_float,
@@ -458,15 +456,7 @@ def test_array_equivalent_dti(dtype_equal):
 )
 def test_array_equivalent_series(val):
     arr = np.array([1, 2])
-    msg = "elementwise comparison failed"
-    cm = (
-        # stacklevel is chosen to make sense when called from .equals
-        tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False)
-        if isinstance(val, str) and not np_version_gte1p25
-        else nullcontext()
-    )
-    with cm:
-        assert not array_equivalent(Series([arr, arr]), Series([arr, val]))
+    assert not array_equivalent(Series([arr, arr]), Series([arr, val]))


 def test_array_equivalent_array_mismatched_shape():
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
index e924e38ee5030..8e3f21e1a4f56 100644
--- a/pandas/tests/extension/base/casting.py
+++ b/pandas/tests/extension/base/casting.py
@@ -44,8 +44,8 @@ def test_tolist(self, data):
         assert result == expected

     def test_astype_str(self, data):
-        result = pd.Series(data[:5]).astype(str)
-        expected = pd.Series([str(x) for x in data[:5]], dtype=str)
+        result = pd.Series(data[:2]).astype(str)
+        expected = pd.Series([str(x) for x in data[:2]], dtype=str)
         tm.assert_series_equal(result, expected)

     @pytest.mark.parametrize(
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index dd2ed0bd62a02..fd9fec0cb490c 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -549,7 +549,7 @@ def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series):
         dtype = data_for_sorting.dtype
         data_for_sorting = pd.array([True, False], dtype=dtype)
         b, a = data_for_sorting
-        arr = type(data_for_sorting)._from_sequence([a, b])
+        arr = type(data_for_sorting)._from_sequence([a, b], dtype=dtype)

         if as_series:
             arr = pd.Series(arr)
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 3a4391edc99ef..4fa48023fbc95 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -208,9 +208,8 @@ def astype(self, dtype, copy=True):
                 return self.copy()
             return self
         elif isinstance(dtype, StringDtype):
-            value = self.astype(str)  # numpy doesn't like nested dicts
             arr_cls = dtype.construct_array_type()
-            return arr_cls._from_sequence(value, dtype=dtype, copy=False)
+            return arr_cls._from_sequence(self, dtype=dtype, copy=False)
         elif not copy:
             return np.asarray([dict(x) for x in self], dtype=dtype)
         else:
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index fc4f14882b9d7..f56094dfd47ca 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -43,7 +43,6 @@
     pa_version_under13p0,
     pa_version_under14p0,
 )
-import pandas.util._test_decorators as td

 from pandas.core.dtypes.dtypes import (
     ArrowDtype,
@@ -292,7 +291,7 @@ def test_map(self, data_missing, na_action):
         expected = data_missing.to_numpy()
         tm.assert_numpy_array_equal(result, expected)

-    def test_astype_str(self, data, request):
+    def test_astype_str(self, data, request, using_infer_string):
         pa_dtype = data.dtype.pyarrow_dtype
         if pa.types.is_binary(pa_dtype):
             request.applymarker(
@@ -300,9 +299,10 @@ def test_astype_str(self, data, request):
                     reason=f"For {pa_dtype} .astype(str) decodes.",
                 )
             )
-        elif (
-            pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
-        ) or pa.types.is_duration(pa_dtype):
+        elif not using_infer_string and (
+            (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None)
+            or pa.types.is_duration(pa_dtype)
+        ):
             request.applymarker(
                 pytest.mark.xfail(
                     reason="pd.Timestamp/pd.Timedelta repr different from numpy repr",
@@ -310,25 +310,6 @@ def test_astype_str(self, data, request):
                 )
             )
         super().test_astype_str(data)

-    @pytest.mark.parametrize(
-        "nullable_string_dtype",
-        [
-            "string[python]",
-            pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
-        ],
-    )
-    def test_astype_string(self, data, nullable_string_dtype, request):
-        pa_dtype = data.dtype.pyarrow_dtype
-        if (
-            pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
-        ) or pa.types.is_duration(pa_dtype):
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason="pd.Timestamp/pd.Timedelta repr different from numpy repr",
-                )
-            )
-        super().test_astype_string(data, nullable_string_dtype)
-
     def test_from_dtype(self, data, request):
         pa_dtype = data.dtype.pyarrow_dtype
         if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype):
@@ -2036,6 +2017,7 @@ def test_str_join_string_type():
         [None, 2, None, ["ab", None]],
         [None, 2, 1, ["ab", None]],
         [1, 3, 1, ["bc", None]],
+        (None, None, -1, ["dcba", None]),
     ],
 )
 def test_str_slice(start, stop, step, exp):
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index 56c023d99bb1c..b7685a61d4937 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -340,11 +340,16 @@ def test_argmin_argmax_all_na(self, method, data, na_value):
         self._check_unsupported(data)
         super().test_argmin_argmax_all_na(method, data, na_value)

+    @pytest.mark.fails_arm_wheels
     @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
     def test_equals(self, data, na_value, as_series, box):
         self._check_unsupported(data)
         super().test_equals(data, na_value, as_series, box)

+    @pytest.mark.fails_arm_wheels
+    def test_equals_same_data_different_object(self, data):
+        super().test_equals_same_data_different_object(data)
+
     @pytest.mark.parametrize(
         "func, na_action, expected",
         [
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 17f6eb8282b23..509ae653e4793 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -209,7 +209,6 @@ def test_compare_scalar(self, data, comparison_op):
         ser = pd.Series(data)
         self._compare_other(ser, data, comparison_op, "abc")

-    @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
     def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
         super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py
index abc3aab1c1492..1d4a2c0075e3e 100644
--- a/pandas/tests/frame/constructors/test_from_records.py
+++ b/pandas/tests/frame/constructors/test_from_records.py
@@ -469,3 +469,26 @@ def test_from_records_empty2(self):

         alt = DataFrame(arr)
         tm.assert_frame_equal(alt, expected)
+
+    def test_from_records_structured_array(self):
+        # GH 59717
+        data = np.array(
+            [
+                ("John", 25, "New York", 50000),
+                ("Jane", 30, "San Francisco", 75000),
+                ("Bob", 35, "Chicago", 65000),
+                ("Alice", 28, "Los Angeles", 60000),
+            ],
+            dtype=[("name", "U10"), ("age", "i4"), ("city", "U15"), ("salary", "i4")],
+        )
+
+        actual_result = DataFrame.from_records(data, columns=["name", "salary", "city"])
+
+        modified_data = {
+            "name": ["John", "Jane", "Bob", "Alice"],
+            "salary": np.array([50000, 75000, 65000, 60000], dtype="int32"),
+            "city": ["New York", "San Francisco", "Chicago", "Los Angeles"],
+        }
+        expected_result = DataFrame(modified_data)
+
+        tm.assert_frame_equal(actual_result, expected_result)
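
Note: test_from_records_structured_array exercises GH 59717 — DataFrame.from_records on a NumPy structured array with an explicit columns list should select and reorder the named fields. Sketch:

    import numpy as np
    import pandas as pd

    data = np.array(
        [("John", 25, "New York", 50000), ("Jane", 30, "San Francisco", 75000)],
        dtype=[("name", "U10"), ("age", "i4"), ("city", "U15"), ("salary", "i4")],
    )
    df = pd.DataFrame.from_records(data, columns=["name", "salary", "city"])
    print(list(df.columns))  # ['name', 'salary', 'city'] -- "age" is dropped
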
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index 8ce4e8725d632..0723c3c70091c 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -1864,13 +1864,11 @@ def test_adding_new_conditional_column() -> None:
     ("dtype", "infer_string"),
     [
         (object, False),
-        ("string[pyarrow_numpy]", True),
+        (pd.StringDtype(na_value=np.nan), True),
     ],
 )
 def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
     # https://github.com/pandas-dev/pandas/issues/56204
-    pytest.importorskip("pyarrow")
-
     df = DataFrame({"a": [1, 2], "b": [3, 4]})
     with pd.option_context("future.infer_string", infer_string):
         df.loc[df["a"] == 1, "c"] = "1"
@@ -1880,16 +1878,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
     tm.assert_frame_equal(df, expected)


-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_add_new_column_infer_string():
     # GH#55366
-    pytest.importorskip("pyarrow")
     df = DataFrame({"x": [1]})
     with pd.option_context("future.infer_string", True):
         df.loc[df["x"] == 1, "y"] = "1"
     expected = DataFrame(
-        {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
-        columns=Index(["x", "y"], dtype=object),
+        {"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))},
+        columns=Index(["x", "y"], dtype="str"),
     )
     tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 8647df0e8ad96..ab3743283ea13 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -168,21 +168,21 @@ def test_astype_str(self):
                 "d": list(map(str, d._values)),
                 "e": list(map(str, e._values)),
             },
-            dtype="object",
+            dtype="str",
         )

         tm.assert_frame_equal(result, expected)

-    def test_astype_str_float(self):
+    def test_astype_str_float(self, using_infer_string):
         # see GH#11302
         result = DataFrame([np.nan]).astype(str)
-        expected = DataFrame(["nan"], dtype="object")
+        expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str")

         tm.assert_frame_equal(result, expected)
         result = DataFrame([1.12345678901234567890]).astype(str)

         val = "1.1234567890123457"
-        expected = DataFrame([val], dtype="object")
+        expected = DataFrame([val], dtype="str")
         tm.assert_frame_equal(result, expected)

     @pytest.mark.parametrize("dtype_class", [dict, Series])
@@ -284,7 +284,7 @@ def test_astype_duplicate_col_series_arg(self):
         result = df.astype(dtypes)
         expected = DataFrame(
             {
-                0: Series(vals[:, 0].astype(str), dtype=object),
+                0: Series(vals[:, 0].astype(str), dtype="str"),
                 1: vals[:, 1],
                 2: pd.array(vals[:, 2], dtype="Float64"),
                 3: vals[:, 3],
@@ -647,9 +647,10 @@ def test_astype_dt64tz(self, timezone_frame):
             # dt64tz->dt64 deprecated
             timezone_frame.astype("datetime64[ns]")

-    def test_astype_dt64tz_to_str(self, timezone_frame):
+    def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string):
         # str formatting
         result = timezone_frame.astype(str)
+        na_value = np.nan if using_infer_string else "NaT"
         expected = DataFrame(
             [
                 [
@@ -657,7 +658,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
                     "2013-01-01 00:00:00-05:00",
                     "2013-01-01 00:00:00+01:00",
                 ],
-                ["2013-01-02", "NaT", "NaT"],
+                ["2013-01-02", na_value, na_value],
                 [
                     "2013-01-03",
                     "2013-01-03 00:00:00-05:00",
@@ -665,7 +666,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
                 ],
             ],
             columns=timezone_frame.columns,
-            dtype="object",
+            dtype="str",
         )
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
index 4b1435babe6b1..6c6c208ee0c78 100644
--- a/pandas/tests/frame/methods/test_rank.py
+++ b/pandas/tests/frame/methods/test_rank.py
@@ -6,13 +6,10 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas._libs.algos import (
     Infinity,
     NegInfinity,
 )
-from pandas.compat import HAS_PYARROW

 from pandas import (
     DataFrame,
@@ -466,23 +463,10 @@ def test_rank_inf_nans_na_option(
             ("top", False, [2.0, 3.0, 1.0, 4.0]),
         ],
     )
-    def test_rank_object_first(
-        self,
-        request,
-        frame_or_series,
-        na_option,
-        ascending,
-        expected,
-        using_infer_string,
-    ):
+    def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
         obj = frame_or_series(["foo", "foo", None, "foo"])
-        if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series):
-            request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
-
         result = obj.rank(method="first", na_option=na_option, ascending=ascending)
         expected = frame_or_series(expected)
-        if using_infer_string and isinstance(obj, Series):
-            expected = expected.astype("uint64")
         tm.assert_equal(result, expected)

     @pytest.mark.parametrize(
@@ -502,14 +486,15 @@ def test_rank_mixed_axis_zero(self, data, expected):
         result = df.rank(numeric_only=True)
         tm.assert_frame_equal(result, expected)

-    @pytest.mark.parametrize(
-        "dtype, exp_dtype",
-        [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
-    )
-    def test_rank_string_dtype(self, dtype, exp_dtype):
+    def test_rank_string_dtype(self, string_dtype_no_object):
         # GH#55362
-        pytest.importorskip("pyarrow")
-        obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
+        obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
         result = obj.rank(method="first")
+        exp_dtype = (
+            "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64"
+        )
+        if string_dtype_no_object.storage == "python":
+            # TODO nullable string[python] should also return nullable Int64
+            exp_dtype = "float64"
         expected = Series([1, 2, None, 3], dtype=exp_dtype)
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py
index 875dca321635f..0354e9df3d168 100644
--- a/pandas/tests/frame/methods/test_select_dtypes.py
+++ b/pandas/tests/frame/methods/test_select_dtypes.py
@@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string):
             ei = df[["a"]]
             tm.assert_frame_equal(ri, ei)

+            ri = df.select_dtypes(include=[str])
+            tm.assert_frame_equal(ri, ei)
+
     def test_select_dtypes_exclude_using_list_like(self):
         df = DataFrame(
             {
@@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self):
     @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"])
     @pytest.mark.parametrize("arg", ["include", "exclude"])
     def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string):
-        if using_infer_string and dtype == "str":
+        if using_infer_string and (dtype == "str" or dtype is str):
             # this is tested below
             pytest.skip("Selecting string columns works with future strings")
         df = DataFrame(
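
Note: the select_dtypes hunk above means the builtin str now behaves like the "str" alias when the future string dtype is active. A hedged sketch (assumes future.infer_string is honored at construction time):

    import pandas as pd

    with pd.option_context("future.infer_string", True):
        df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
        print(df.select_dtypes(include=[str]).columns.tolist())  # ['a']
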
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 0176a36fe78d7..0a924aa393be5 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -24,7 +24,6 @@
 from pandas._config import using_string_dtype

 from pandas._libs import lib
-from pandas.compat import HAS_PYARROW
 from pandas.compat.numpy import np_version_gt2
 from pandas.errors import IntCastingNaNError
@@ -82,7 +81,7 @@ def test_constructor_from_ndarray_with_str_dtype(self):
         # with an array of strings each of which is e.g. "[0 1 2]"
         arr = np.arange(12).reshape(4, 3)
         df = DataFrame(arr, dtype=str)
-        expected = DataFrame(arr.astype(str), dtype=object)
+        expected = DataFrame(arr.astype(str), dtype="str")
         tm.assert_frame_equal(df, expected)

     def test_constructor_from_2d_datetimearray(self):
@@ -300,18 +299,38 @@ def test_constructor_dtype_nocast_view_2d_array(self):
         df2 = DataFrame(df.values, dtype=df[0].dtype)
         assert df2._mgr.blocks[0].values.flags.c_contiguous

-    @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies")
-    def test_1d_object_array_does_not_copy(self):
+    def test_1d_object_array_does_not_copy(self, using_infer_string):
         # https://github.com/pandas-dev/pandas/issues/39272
         arr = np.array(["a", "b"], dtype="object")
         df = DataFrame(arr, copy=False)
+        if using_infer_string:
+            if df[0].dtype.storage == "pyarrow":
+                # object dtype strings are converted to arrow memory,
+                # no numpy arrays to compare
+                pass
+            else:
+                assert np.shares_memory(df[0].to_numpy(), arr)
+        else:
+            assert np.shares_memory(df.values, arr)
+
+        df = DataFrame(arr, dtype=object, copy=False)
         assert np.shares_memory(df.values, arr)

-    @pytest.mark.xfail(using_string_dtype(), reason="conversion copies")
-    def test_2d_object_array_does_not_copy(self):
+    def test_2d_object_array_does_not_copy(self, using_infer_string):
         # https://github.com/pandas-dev/pandas/issues/39272
         arr = np.array([["a", "b"], ["c", "d"]], dtype="object")
         df = DataFrame(arr, copy=False)
+        if using_infer_string:
+            if df[0].dtype.storage == "pyarrow":
+                # object dtype strings are converted to arrow memory,
+                # no numpy arrays to compare
+                pass
+            else:
+                assert np.shares_memory(df[0].to_numpy(), arr)
+        else:
+            assert np.shares_memory(df.values, arr)
+
+        df = DataFrame(arr, dtype=object, copy=False)
         assert np.shares_memory(df.values, arr)

     def test_constructor_dtype_list_data(self):
@@ -1766,12 +1785,18 @@ def test_constructor_column_duplicates(self):

         tm.assert_frame_equal(idf, edf)

-    def test_constructor_empty_with_string_dtype(self):
+    def test_constructor_empty_with_string_dtype(self, using_infer_string):
         # GH 9428
         expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)
+        expected_str = DataFrame(
+            index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan)
+        )

         df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str)
-        tm.assert_frame_equal(df, expected)
+        if using_infer_string:
+            tm.assert_frame_equal(df, expected_str)
+        else:
+            tm.assert_frame_equal(df, expected)
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_)
         tm.assert_frame_equal(df, expected)
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5")
@@ -2655,8 +2680,7 @@ def test_construct_with_strings_and_none(self):

     def test_frame_string_inference(self):
         # GH#54430
-        pytest.importorskip("pyarrow")
-        dtype = "string[pyarrow_numpy]"
+        dtype = pd.StringDtype(na_value=np.nan)
         expected = DataFrame(
             {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
         )
@@ -2690,8 +2714,7 @@ def test_frame_string_inference(self):

     def test_frame_string_inference_array_string_dtype(self):
         # GH#54496
-        pytest.importorskip("pyarrow")
"string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2715,7 +2738,6 @@ def test_frame_string_inference_array_string_dtype(self): def test_frame_string_inference_block_dim(self): # GH#55363 - pytest.importorskip("pyarrow") with pd.option_context("future.infer_string", True): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index fa71153d01157..a574989860957 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( NumExprClobberingError, UndefinedVariableError, @@ -762,7 +760,6 @@ def test_inf(self, op, f, engine, parser): result = df.query(q, engine=engine, parser=parser) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture @@ -775,6 +772,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): tm.assert_frame_equal(result, expected) expected = DataFrame(df_index) + expected.columns = expected.columns.astype(object) result = df.reset_index().query('"2018-01-03 00:00:00+00" < time') tm.assert_frame_equal(result, expected) @@ -1072,7 +1070,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine, using_infer_string): + def test_object_array_eq_ne(self, parser, engine): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1081,14 +1079,11 @@ def test_object_array_eq_ne(self, parser, engine, using_infer_string): "d": np.random.default_rng(2).integers(9, size=12), } ) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query("a == b", parser=parser, engine=engine) + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - with tm.assert_produces_warning(warning): - res = df.query("a != b", parser=parser, engine=engine) + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1128,15 +1123,13 @@ def test_query_with_nested_special_character(self, parser, engine): ], ) def test_query_lex_compare_strings( - self, parser, engine, op, func, using_infer_string + self, parser, engine, op, func ): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) @@ -1400,7 +1393,6 @@ def test_expr_with_column_name_with_backtick(self): expected = df[df["a`b"] < 2] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_string_with_backticks(self): # GH 59285 df = DataFrame(("`", "`````", 
"``````````"), columns=["#backticks"]) @@ -1408,7 +1400,6 @@ def test_expr_with_string_with_backticks(self): expected = df["```" < df["#backticks"]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_string_with_backticked_substring_same_as_column_name(self): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) @@ -1439,7 +1430,6 @@ def test_expr_with_column_names_with_special_characters(self, col1, col2, expr): expected = df[df[col1] < df[col2]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_no_backticks(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"]) @@ -1483,7 +1473,6 @@ def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self): ): df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) @@ -1491,7 +1480,6 @@ def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self expected = df[df["column-name"] < 'It`s that\'s "quote" #hash'] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index edeac642551a0..91200f53e36bd 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -3,8 +3,6 @@ from pandas._config import using_string_dtype -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Index, @@ -79,16 +77,9 @@ def test_size_series_masked_type_returns_Int64(dtype): @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): +def test_size_strings(any_string_dtype): # GH#55627 + dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 1c2f98c3701d5..8f3022fbe551c 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -7,8 +7,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, CategoricalIndex, @@ -373,14 +371,6 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -398,9 +388,10 @@ def test_compound( expected_rows, 
     expected_count,
     expected_group_size,
-    dtype,
+    any_string_dtype,
     using_infer_string,
 ):
+    dtype = any_string_dtype
     education_df = education_df.astype(dtype)
     education_df.columns = education_df.columns.astype(dtype)
     # Multiple groupby keys and as_index=False
@@ -417,6 +408,7 @@ def test_compound(
     expected["proportion"] = expected_count
     expected["proportion"] /= expected_group_size
     if dtype == "string[pyarrow]":
+        # TODO(nullable) also string[python] should return nullable dtypes
         expected["proportion"] = expected["proportion"].convert_dtypes()
     else:
         expected["count"] = expected_count
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 11b874d0b1608..6393468fb8ccd 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2466,20 +2466,13 @@ def test_rolling_wrong_param_min_period():
         test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()


-@pytest.mark.parametrize(
-    "dtype",
-    [
-        object,
-        pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
-    ],
-)
-def test_by_column_values_with_same_starting_value(dtype):
+def test_by_column_values_with_same_starting_value(any_string_dtype):
     # GH29635
     df = DataFrame(
         {
             "Name": ["Thomas", "Thomas", "Thomas John"],
             "Credit": [1200, 1300, 900],
-            "Mood": Series(["sad", "happy", "happy"], dtype=dtype),
+            "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype),
         }
     )
     aggregate_details = {"Mood": Series.mode, "Credit": "sum"}
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index fc2a8a970010a..6bb2eaf89b5d7 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -1180,3 +1180,15 @@ def test_grouping_by_key_is_in_axis():
     result = gb.sum()
     expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_any_with_timedelta():
+    # GH#59712
+    df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]})
+
+    result = df.groupby(np.array([0, 1], dtype=np.int64))["value"].any()
+
+    expected = Series({0: True, 1: False}, name="value", dtype=bool)
+    expected.index = expected.index.astype(np.int64)
+
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index 8a421654cdf9b..a6ea1502103c5 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -714,10 +714,9 @@ def test_groupby_min_max_categorical(func):

 @pytest.mark.parametrize("func", ["min", "max"])
-def test_min_empty_string_dtype(func):
+def test_min_empty_string_dtype(func, string_dtype_no_object):
     # GH#55619
-    pytest.importorskip("pyarrow")
-    dtype = "string[pyarrow_numpy]"
+    dtype = string_dtype_no_object
     df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
     result = getattr(df.groupby("a"), func)()
     expected = DataFrame(
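
Note: test_groupby_any_with_timedelta pins GH 59712 — any() on a grouped timedelta column treats NaT as falsey rather than erroring. Sketch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"value": [pd.Timedelta(1), pd.NaT]})
    out = df.groupby(np.array([0, 1], dtype=np.int64))["value"].any()
    print(out.tolist())  # [True, False]
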
pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index e17e39a334acc..56cdca49cb2b0 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -57,12 +57,11 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) - def test_insert_none_into_string_numpy(self): + def test_insert_none_into_string_numpy(self, string_dtype_no_object): # GH#55365 - pytest.importorskip("pyarrow") - index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b", "c"], dtype=string_dtype_no_object) result = index.insert(-1, None) - expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index f9636ec19f2ec..0e9fb77d6e8dd 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Index, @@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("first_list", [["b", "a"], []]) @pytest.mark.parametrize("second_list", [["a", "b"], []]) @pytest.mark.parametrize( @@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): + expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -253,7 +251,7 @@ def test_union_name_preservation( expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: - expected = Index(vals, name=expected_name) + expected = Index(vals, name=expected_name, dtype=expected_dtype) tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 81dc3b3ecc45e..62be8903da206 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -101,13 +101,16 @@ def test_astype_tznaive_to_tzaware(self): # dt64->dt64tz deprecated idx._data.astype("datetime64[ns, US/Eastern]") - def test_astype_str_nat(self): + def test_astype_str_nat(self, using_infer_string): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) def 
diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py
index 81dc3b3ecc45e..62be8903da206 100644
--- a/pandas/tests/indexes/datetimes/methods/test_astype.py
+++ b/pandas/tests/indexes/datetimes/methods/test_astype.py
@@ -101,13 +101,16 @@ def test_astype_tznaive_to_tzaware(self):
             # dt64->dt64tz deprecated
             idx._data.astype("datetime64[ns, US/Eastern]")

-    def test_astype_str_nat(self):
+    def test_astype_str_nat(self, using_infer_string):
         # GH 13149, GH 13209
         # verify that we are returning NaT as a string (and not unicode)

         idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan])
         result = idx.astype(str)
-        expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object)
+        if using_infer_string:
+            expected = Index(["2016-05-16", None, None, None], dtype="str")
+        else:
+            expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object)
         tm.assert_index_equal(result, expected)

     def test_astype_str(self):
@@ -117,7 +120,7 @@ def test_astype_str(self):
         expected = Index(
             ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"],
             name="test_name",
-            dtype=object,
+            dtype="str",
         )
         tm.assert_index_equal(result, expected)

@@ -132,7 +135,7 @@ def test_astype_str_tz_and_name(self):
                 "2012-01-03 00:00:00-05:00",
             ],
             name="test_name",
-            dtype=object,
+            dtype="str",
         )
         tm.assert_index_equal(result, expected)

@@ -143,7 +146,7 @@ def test_astype_str_freq_and_name(self):
         expected = Index(
             ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"],
             name="test_name",
-            dtype=object,
+            dtype="str",
         )
         tm.assert_index_equal(result, expected)

@@ -155,7 +158,7 @@ def test_astype_str_freq_and_tz(self):
         result = dti.astype(str)
         expected = Index(
             ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"],
-            dtype=object,
+            dtype="str",
             name="test_name",
         )
         tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py
index 9c1ef302c5b51..ce05b5e9f2238 100644
--- a/pandas/tests/indexes/object/test_astype.py
+++ b/pandas/tests/indexes/object/test_astype.py
@@ -15,12 +15,12 @@ def test_astype_str_from_bytes():
     # ensure_string_array which does f"{val}"
     idx = Index(["あ", b"a"], dtype="object")
     result = idx.astype(str)
-    expected = Index(["あ", "a"], dtype="object")
+    expected = Index(["あ", "a"], dtype="str")
     tm.assert_index_equal(result, expected)

     # while we're here, check that Series.astype behaves the same
     result = Series(idx).astype(str)
-    expected = Series(expected, dtype=object)
+    expected = Series(expected, dtype="str")
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py
index 2e9ba007a45c1..ea3d068a673e8 100644
--- a/pandas/tests/indexes/object/test_indexing.py
+++ b/pandas/tests/indexes/object/test_indexing.py
@@ -7,7 +7,6 @@
     NA,
     is_matching_na,
 )
-import pandas.util._test_decorators as td

 import pandas as pd
 from pandas import Index
@@ -160,14 +159,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2):


 class TestSliceLocs:
-    # TODO(infer_string) parametrize over multiple string dtypes
-    @pytest.mark.parametrize(
-        "dtype",
-        [
-            "object",
-            pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
-        ],
-    )
     @pytest.mark.parametrize(
         "in_slice,expected",
         [
@@ -191,24 +182,22 @@ class TestSliceLocs:
             (pd.IndexSlice["m":"m":-1], ""),  # type: ignore[misc]
         ],
     )
-    def test_slice_locs_negative_step(self, in_slice, expected, dtype):
-        index = Index(list("bcdxy"), dtype=dtype)
+    def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype):
+        index = Index(list("bcdxy"), dtype=any_string_dtype)

         s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step)
         result = index[s_start : s_stop : in_slice.step]
-        expected = Index(list(expected), dtype=dtype)
+        expected = Index(list(expected), dtype=any_string_dtype)
         tm.assert_index_equal(result, expected)

-    # TODO(infer_string) parametrize over multiple string dtypes
-    @td.skip_if_no("pyarrow")
-    def test_slice_locs_negative_step_oob(self):
-        index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]")
+    def test_slice_locs_negative_step_oob(self, any_string_dtype):
+        index = Index(list("bcdxy"), dtype=any_string_dtype)

         result = index[-10:5:1]
         tm.assert_index_equal(result, index)

         result = index[4:-10:-1]
-        expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]")
+        expected = Index(list("yxdcb"), dtype=any_string_dtype)
         tm.assert_index_equal(result, expected)

     def test_slice_locs_dup(self):
diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py
index d545bfd2fae0f..af3c2667f51b4 100644
--- a/pandas/tests/indexes/period/methods/test_astype.py
+++ b/pandas/tests/indexes/period/methods/test_astype.py
@@ -22,7 +22,7 @@ def test_astype_raises(self, dtype):
         with pytest.raises(TypeError, match=msg):
             idx.astype(dtype)

-    def test_astype_conversion(self):
+    def test_astype_conversion(self, using_infer_string):
         # GH#13149, GH#13209
         idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx")

@@ -41,7 +41,12 @@ def test_astype_conversion(self):
         tm.assert_index_equal(result, expected)

         result = idx.astype(str)
-        expected = Index([str(x) for x in idx], name="idx", dtype=object)
+        if using_infer_string:
+            expected = Index(
+                [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str"
+            )
+        else:
+            expected = Index([str(x) for x in idx], name="idx", dtype=object)
         tm.assert_index_equal(result, expected)

         idx = period_range("1990", "2009", freq="Y", name="idx")
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 7ec66100b7291..2b62b384930d6 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -76,9 +76,6 @@ def test_constructor_casting(self, index):
         tm.assert_contains_all(arr, new_index)
         tm.assert_index_equal(index, new_index)

-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-    )
     def test_constructor_copy(self, using_infer_string):
         index = Index(list("abc"), name="name")
         arr = np.array(index)
@@ -343,11 +340,6 @@ def test_constructor_empty_special(self, empty, klass):
     def test_view_with_args(self, index):
         index.view("i8")

-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW,
-        reason="TODO(infer_string)",
-        strict=False,
-    )
     @pytest.mark.parametrize(
         "index",
         [
@@ -364,7 +356,8 @@ def test_view_with_args_object_array_raises(self, index):
             msg = "When changing to a larger dtype"
             with pytest.raises(ValueError, match=msg):
                 index.view("i8")
-        elif index.dtype == "string":
+        elif index.dtype == "str" and not index.dtype.storage == "python":
+            # TODO(infer_string): Make the errors consistent
             with pytest.raises(NotImplementedError, match="i8"):
                 index.view("i8")
         else:
@@ -940,10 +933,9 @@ def test_isin_empty(self, empty):
         result = index.isin(empty)
         tm.assert_numpy_array_equal(expected, result)

-    @td.skip_if_no("pyarrow")
-    def test_isin_arrow_string_null(self):
+    def test_isin_string_null(self, string_dtype_no_object):
         # GH#55821
-        index = Index(["a", "b"], dtype="string[pyarrow_numpy]")
+        index = Index(["a", "b"], dtype=string_dtype_no_object)
         result = index.isin([None])
         expected = np.array([False, False])
         tm.assert_numpy_array_equal(result, expected)
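
Note: the period/datetime astype hunks encode a shared rule — under the future string dtype, astype(str) yields a "str"-dtype Index in which NaT becomes a missing value rather than the literal string "NaT". A hedged sketch (assumes the option is honored at call time, as the tests' using_infer_string fixture implies):

    import pandas as pd

    with pd.option_context("future.infer_string", True):
        idx = pd.DatetimeIndex(["2016-05-16", pd.NaT])
        print(idx.astype(str))  # ['2016-05-16', NaN] -- NaT is missing, not "NaT"
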
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py
index b41871ee921fd..cd3d599abd30e 100644
--- a/pandas/tests/indexes/test_old_base.py
+++ b/pandas/tests/indexes/test_old_base.py
@@ -6,10 +6,7 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
-from pandas.compat import HAS_PYARROW

 from pandas.core.dtypes.common import (
     is_integer_dtype,
@@ -28,6 +25,7 @@
     PeriodIndex,
     RangeIndex,
     Series,
+    StringDtype,
     TimedeltaIndex,
     isna,
     period_range,
@@ -229,7 +227,6 @@ def test_logical_compat(self, simple_index):
         with pytest.raises(TypeError, match=msg):
             idx.any()

-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_repr_roundtrip(self, simple_index):
         if isinstance(simple_index, IntervalIndex):
             pytest.skip(f"Not a valid repr for {type(simple_index).__name__}")
@@ -246,11 +243,6 @@ def test_repr_max_seq_item_setting(self, simple_index):
             repr(idx)
             assert "..." not in str(idx)

-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW,
-        reason="TODO(infer_string)",
-        strict=False,
-    )
     @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
     def test_ensure_copied_data(self, index):
         # Check the "copy" argument of each Index.__new__ is honoured
@@ -296,12 +288,17 @@ def test_ensure_copied_data(self, index):
             tm.assert_numpy_array_equal(
                 index._values._mask, result._values._mask, check_same="same"
             )
-        elif index.dtype == "string[python]":
+        elif (
+            isinstance(index.dtype, StringDtype) and index.dtype.storage == "python"
+        ):
             assert np.shares_memory(index._values._ndarray, result._values._ndarray)
             tm.assert_numpy_array_equal(
                 index._values._ndarray, result._values._ndarray, check_same="same"
             )
-        elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"):
+        elif (
+            isinstance(index.dtype, StringDtype)
+            and index.dtype.storage == "pyarrow"
+        ):
             assert tm.shares_memory(result._values, index._values)
         else:
             raise NotImplementedError(index.dtype)
@@ -444,11 +441,7 @@ def test_insert_base(self, index):
         result = trimmed.insert(0, index[0])
         assert index[0:4].equals(result)

-    @pytest.mark.skipif(
-        using_string_dtype(),
-        reason="completely different behavior, tested elsewher",
-    )
-    def test_insert_out_of_bounds(self, index):
+    def test_insert_out_of_bounds(self, index, using_infer_string):
         # TypeError/IndexError matches what np.insert raises in these cases

         if len(index) > 0:
@@ -460,6 +453,12 @@ def test_insert_out_of_bounds(self, index):
             msg = "index (0|0.5) is out of bounds for axis 0 with size 0"
         else:
             msg = "slice indices must be integers or None or have an __index__ method"
+
+        if using_infer_string and (
+            index.dtype == "string" or index.dtype == "category"  # noqa: PLR1714
+        ):
+            msg = "loc must be an integer between"
+
         with pytest.raises(err, match=msg):
             index.insert(0.5, "foo")
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index 8fd349dacf9e9..e5dc47be20677 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -246,9 +246,6 @@ def test_intersection_base(self, index):
         with pytest.raises(TypeError, match=msg):
             first.intersection([1, 2, 3])

-    @pytest.mark.filterwarnings(
-        "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
-    )
     @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
     def test_union_base(self, index):
         index = index.unique()
@@ -276,9 +273,6 @@ def test_union_base(self, index):
             first.union([1, 2, 3])

     @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
-    @pytest.mark.filterwarnings(
-        "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
-    )
     def test_difference_base(self, sort, index):
         first = index[2:]
         second = index[:4]
@@ -305,9 +299,6 @@ def test_difference_base(self, sort, index):
         first.difference([1, 2, 3], sort)

     @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
-    @pytest.mark.filterwarnings(
-        "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
-    )
     def test_symmetric_difference(self, index):
isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") @@ -529,9 +520,6 @@ def test_intersection_difference_match_empty(self, index, sort): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -@pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" -) @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 311f2b5c9aa59..5166cadae499e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_object_with_nat(self): tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - def test_astype(self): + def test_astype(self, using_infer_string): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") @@ -61,7 +61,12 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index b05b5d3dea2dc..dc95e1bb1b8a0 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas import ( @@ -1198,22 +1196,25 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_iloc_setitem_multicolumn_to_datetime(self): + def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) - df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) - expected = DataFrame( - { - "A": [ - Timestamp("2021-01-01 00:00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - "B": ["2021", "2022"], - } - ) - tm.assert_frame_equal(df, expected, check_dtype=False) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + else: + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + expected = DataFrame( + { + "A": [ + Timestamp("2021-01-01 00:00:00"), + Timestamp("2022-01-01 00:00:00"), + ], + "B": ["2021", "2022"], + } + ) + tm.assert_frame_equal(df, expected, check_dtype=False) class TestILocErrors: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index f7ada06e3ecb2..fb7e6649c534f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -528,12 +526,12 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] - 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_astype_assignment(self, using_infer_string):
         # GH4312 (iloc)
         df_orig = DataFrame(
             [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
         )
+        df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object)
 
         df = df_orig.copy()
@@ -543,9 +541,9 @@ def test_astype_assignment(self, using_infer_string):
         expected = DataFrame(
             [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
         )
-        if not using_infer_string:
-            expected["A"] = expected["A"].astype(object)
-            expected["B"] = expected["B"].astype(object)
+        expected[list("CDG")] = expected[list("CDG")].astype(object)
+        expected["A"] = expected["A"].astype(object)
+        expected["B"] = expected["B"].astype(object)
         tm.assert_frame_equal(df, expected)
 
         # GH5702 (loc)
@@ -554,18 +552,16 @@ def test_astype_assignment(self, using_infer_string):
         expected = DataFrame(
             [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
         )
-        if not using_infer_string:
-            expected["A"] = expected["A"].astype(object)
+        expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
         tm.assert_frame_equal(df, expected)
 
         df = df_orig.copy()
+        df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
         expected = DataFrame(
             [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
         )
-        if not using_infer_string:
-            expected["B"] = expected["B"].astype(object)
-            expected["C"] = expected["C"].astype(object)
+        expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
         tm.assert_frame_equal(df, expected)
 
     def test_astype_assignment_full_replacements(self):
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index e007b8c4e97ac..36b08ee1df790 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1,6 +1,7 @@
 """test label based indexing with loc"""
 
 from collections import namedtuple
+import contextlib
 from datetime import (
     date,
     datetime,
@@ -13,10 +14,7 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import index as libindex
-from pandas.compat import HAS_PYARROW
 from pandas.errors import IndexingError
 
 import pandas as pd
@@ -615,8 +613,7 @@ def test_loc_setitem_consistency_empty(self):
         expected["x"] = expected["x"].astype(np.int64)
         tm.assert_frame_equal(df, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_loc_setitem_consistency_slice_column_len(self):
+    def test_loc_setitem_consistency_slice_column_len(self, using_infer_string):
         # .loc[:,column] setting with slice == len of the column
         # GH10408
         levels = [
@@ -640,12 +637,23 @@ def test_loc_setitem_consistency_slice_column_len(self):
         ]
         df = DataFrame(values, index=mi, columns=cols)
 
-        df.loc[:, ("Respondent", "StartDate")] = to_datetime(
-            df.loc[:, ("Respondent", "StartDate")]
-        )
-        df.loc[:, ("Respondent", "EndDate")] = to_datetime(
-            df.loc[:, ("Respondent", "EndDate")]
-        )
+        ctx = contextlib.nullcontext()
+        if using_infer_string:
+            ctx = pytest.raises(TypeError, match="Invalid value")
+
+        with ctx:
+            df.loc[:, ("Respondent", "StartDate")] = to_datetime(
+                df.loc[:, ("Respondent", "StartDate")]
+            )
+        with ctx:
+            df.loc[:, ("Respondent", "EndDate")] = to_datetime(
+                df.loc[:, ("Respondent", "EndDate")]
+            )
+
+        if using_infer_string:
+            # infer-objects won't infer stuff anymore
+            return
+
         df = df.infer_objects()
 
         # Adding a new key
@@ -1211,20 +1219,23 @@ def test_loc_reverse_assignment(self):
 
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string")
-    def test_loc_setitem_str_to_small_float_conversion_type(self):
+    def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string):
         # GH#20388
         col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)]
         result = DataFrame(col_data, columns=["A"])
-        expected = DataFrame(col_data, columns=["A"], dtype=object)
+        expected = DataFrame(col_data, columns=["A"])
         tm.assert_frame_equal(result, expected)
 
         # assigning with loc/iloc attempts to set the values inplace, which
         # in this case is successful
-        result.loc[result.index, "A"] = [float(x) for x in col_data]
-        expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
-        tm.assert_frame_equal(result, expected)
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Must provide strings"):
+                result.loc[result.index, "A"] = [float(x) for x in col_data]
+        else:
+            result.loc[result.index, "A"] = [float(x) for x in col_data]
+            expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
+            tm.assert_frame_equal(result, expected)
 
         # assigning the entire column using __setitem__ swaps in the new array
         # GH#???
@@ -1389,9 +1400,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self):
         df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
         df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
 
-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-    )
     def test_loc_setitem_single_row_categorical(self, using_infer_string):
         # GH#25495
         df = DataFrame({"Alpha": ["a"], "Numeric": [0]})
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index 76910db941d36..29ce9d0c03111 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -401,6 +401,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
     pd.api.interchange.from_dataframe(df)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_empty_string_column():
     # https://github.com/pandas-dev/pandas/issues/56703
     df = pd.DataFrame({"a": []}, dtype=str)
@@ -465,7 +466,7 @@ def test_non_str_names_w_duplicates():
         ([1.0, 2.25, None], "Float32[pyarrow]", "float32"),
         ([True, False, None], "boolean", "bool"),
         ([True, False, None], "boolean[pyarrow]", "bool"),
-        (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"),
+        (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"),
         (["much ado", "about", None], "string[pyarrow]", "large_string"),
         (
             [datetime(2020, 1, 1), datetime(2020, 1, 2), None],
@@ -528,7 +529,11 @@ def test_pandas_nullable_with_missing_values(
         ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"),
         ([True, False, False], "boolean", "bool"),
         ([True, False, False], "boolean[pyarrow]", "bool"),
-        (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"),
+        (
+            ["much ado", "about", "nothing"],
+            pd.StringDtype(na_value=np.nan),
+            "large_string",
+        ),
         (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"),
         (
             [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index b831ec3bb2c6a..3989e022dbbd2 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -587,7 +587,7 @@ def test_reader_dtype(self, read_ext):
 
         expected["a"] = expected["a"].astype("float64")
         expected["b"] = expected["b"].astype("float32")
-        expected["c"] = Series(["001", "002", "003", "004"], dtype=object)
+        expected["c"] = Series(["001", "002", "003", "004"], dtype="str")
         tm.assert_frame_equal(actual, expected)
 
         msg = "Unable to convert column d to type int64"
@@ -611,8 +611,8 @@ def test_reader_dtype(self, read_ext):
                 {
                     "a": Series([1, 2, 3, 4], dtype="float64"),
                     "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
-                    "c": Series(["001", "002", "003", "004"], dtype=object),
-                    "d": Series(["1", "2", np.nan, "4"], dtype=object),
+                    "c": Series(["001", "002", "003", "004"], dtype="str"),
+                    "d": Series(["1", "2", np.nan, "4"], dtype="str"),
                 },
             ),
         ],
diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py
index 89addbbbc1ded..e9fc2b2d27afd 100644
--- a/pandas/tests/io/formats/style/test_style.py
+++ b/pandas/tests/io/formats/style/test_style.py
@@ -886,8 +886,19 @@ def test_maybe_convert_css_to_tuples(self):
         expected = []
         assert maybe_convert_css_to_tuples("") == expected
 
+        # issue #59623
+        expected = [("a", "b"), ("c", "url('data:123')")]
+        assert maybe_convert_css_to_tuples("a:b;c: url('data:123');") == expected
+
+        # if no value, return attr and empty string
+        expected = [("a", ""), ("c", "")]
+        assert maybe_convert_css_to_tuples("a:;c: ") == expected
+
     def test_maybe_convert_css_to_tuples_err(self):
-        msg = "Styles supplied as string must follow CSS rule formats"
+        msg = (
+            "Styles supplied as string must follow CSS rule formats, "
+            "for example 'attr: val;'. 'err' was given."
+        )
         with pytest.raises(ValueError, match=msg):
             maybe_convert_css_to_tuples("err")
 
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index af7b04d66096a..0dc16e1ebc723 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -368,6 +368,40 @@ def test_repr_min_rows(self):
             assert ".." not in repr(df)
            assert ".." not in df._repr_html_()
 
+    @pytest.mark.parametrize(
+        "data, format_option, expected_values",
+        [
+            (12345.6789, "{:12.3f}", "12345.679"),
+            (None, "{:.3f}", "None"),
+            ("", "{:.2f}", ""),
+            (112345.6789, "{:6.3f}", "112345.679"),
+            ("foo foo", None, "foo      foo"),
+            (" foo", None, "foo"),
+            (
+                "foo foo foo",
+                None,
+                "foo foo       foo",
+            ),  # odd no. of spaces
+            (
+                "foo foo foo",
+                None,
+                "foo foo    foo",
+            ),  # even no. of spaces
+        ],
+    )
+    def test_repr_float_formatting_html_output(
+        self, data, format_option, expected_values
+    ):
+        if format_option is not None:
+            with option_context("display.float_format", format_option.format):
+                df = DataFrame({"A": [data]})
+                html_output = df._repr_html_()
+                assert expected_values in html_output
+        else:
+            df = DataFrame({"A": [data]})
+            html_output = df._repr_html_()
+            assert expected_values in html_output
+
     def test_str_max_colwidth(self):
         # GH 7856
         df = DataFrame(
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 3d07c0219691e..d3328d1dfcaef 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -2245,18 +2245,18 @@ def test_pyarrow_engine_lines_false():
 
 
 def test_json_roundtrip_string_inference(orient):
-    pytest.importorskip("pyarrow")
     df = DataFrame(
         [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
     )
     out = df.to_json()
     with pd.option_context("future.infer_string", True):
         result = read_json(StringIO(out))
+    dtype = pd.StringDtype(na_value=np.nan)
     expected = DataFrame(
         [["a", "b"], ["c", "d"]],
-        dtype="string[pyarrow_numpy]",
-        index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"),
-        columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"),
+        dtype=dtype,
+        index=Index(["row 1", "row 2"], dtype=dtype),
+        columns=Index(["col 1", "col 2"], dtype=dtype),
     )
     tm.assert_frame_equal(result, expected)
 
@@ -2286,3 +2286,15 @@ def test_read_json_lines_rangeindex():
     result = read_json(StringIO(data), lines=True).index
     expected = RangeIndex(2)
     tm.assert_index_equal(result, expected, exact=True)
+
+
+def test_large_number():
+    # GH#20608
+    result = read_json(
+        StringIO('["9999999999999999"]'),
+        orient="values",
+        typ="series",
+        convert_dates=False,
+    )
+    expected = Series([9999999999999999])
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 07f29518b7881..e02562ac8d93d 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -31,7 +31,7 @@
 @pytest.mark.parametrize("dtype", [str, object])
 @pytest.mark.parametrize("check_orig", [True, False])
 @pytest.mark.usefixtures("pyarrow_xfail")
-def test_dtype_all_columns(all_parsers, dtype, check_orig):
+def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
     # see gh-3795, gh-6607
     parser = all_parsers
 
@@ -49,8 +49,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig):
     if check_orig:
         expected = df.copy()
         result = result.astype(float)
-    else:
+    elif using_infer_string and dtype is str:
         expected = df.astype(str)
+    else:
+        expected = df.astype(str).astype(object)
 
     tm.assert_frame_equal(result, expected)
 
@@ -300,7 +302,6 @@ def test_true_values_cast_to_bool(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
 def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
@@ -316,7 +317,6 @@
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     # GH#42022
@@ -547,8 +547,7 @@ def test_ea_int_avoid_overflow(all_parsers):
 
 def test_string_inference(all_parsers):
     # GH#54430
-    pytest.importorskip("pyarrow")
-    dtype = "string[pyarrow_numpy]"
+    dtype = pd.StringDtype(na_value=np.nan)
 
     data = """a,b
 x,1
@@ -566,10 +565,8 @@
 
 
 @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
-def test_string_inference_object_dtype(all_parsers, dtype):
+def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
     # GH#56047
-    pytest.importorskip("pyarrow")
-
     data = """a,b
x,a
y,a
@@ -578,12 +575,13 @@
     with pd.option_context("future.infer_string", True):
         result = parser.read_csv(StringIO(data), dtype=dtype)
 
+    expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object
     expected = DataFrame(
         {
-            "a": pd.Series(["x", "y", "z"], dtype=object),
-            "b": pd.Series(["a", "a", "a"], dtype=object),
+            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
+            "b": pd.Series(["a", "a", "a"], dtype=expected_dtype),
         },
-        columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+        columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
     )
     tm.assert_frame_equal(result, expected)
 
@@ -592,10 +590,10 @@
 
     expected = DataFrame(
         {
-            "a": pd.Series(["x", "y", "z"], dtype=object),
-            "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"),
+            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
+            "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)),
         },
-        columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+        columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 360a5feebe073..89645b526f2ee 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -667,7 +667,6 @@ def test_inf_na_values_with_int_index(all_parsers):
     tm.assert_frame_equal(out, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # mismatched shape
 @pytest.mark.parametrize("na_filter", [True, False])
 def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
@@ -719,7 +718,6 @@
 # TODO: this test isn't about the na_values keyword, it is about the empty entries
 # being returned with NaN entries, whereas the pyarrow engine returns "nan"
 @xfail_pyarrow  # mismatched shapes
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_str_nan_dropped(all_parsers):
     # see gh-21131
     parser = all_parsers
@@ -812,3 +810,21 @@ def test_bool_and_nan_to_float(all_parsers):
     result = parser.read_csv(StringIO(data), dtype="float")
     expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+@pytest.mark.parametrize(
+    "na_values",
+    [[-99.0, -99], [-99, -99.0]],
+)
+def test_na_values_dict_without_dtype(all_parsers, na_values):
+    parser = all_parsers
+    data = """A
+-99
+-99
+-99.0
+-99.0"""
+
+    result = parser.read_csv(StringIO(data), na_values=na_values)
+    expected = DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 26480010fc687..a5bb151e84f47 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -18,8 +18,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import (
     ParserError,
     ParserWarning,
@@ -499,7 +497,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
 )
@@ -524,10 +521,11 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d
             "c": [0, 4000, 131],
         }
     )
+    if dtype["a"] == object:
+        expected["a"] = expected["a"].astype(object)
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype,expected",
     [
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index dd3a0eabe95ae..8ae87d4bab52d 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -310,7 +310,6 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path):
 
 def test_read_infer_string(tmp_path, setup_path):
     # GH#54431
-    pytest.importorskip("pyarrow")
     df = DataFrame({"a": ["a", "b", None]})
     path = tmp_path / setup_path
     df.to_hdf(path, key="data", format="table")
@@ -318,8 +317,8 @@ def test_read_infer_string(tmp_path, setup_path):
         result = read_hdf(path, key="data", mode="r")
     expected = DataFrame(
         {"a": ["a", "b", None]},
-        dtype="string[pyarrow_numpy]",
-        columns=Index(["a"], dtype="string[pyarrow_numpy]"),
+        dtype=pd.StringDtype(na_value=np.nan),
+        columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index a1f3babb1ae3b..9721d045b7b91 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -243,5 +243,7 @@ def test_string_inference(self, tmp_path):
         df.to_feather(path)
         with pd.option_context("future.infer_string", True):
             result = read_feather(path)
-        expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]")
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
+        )
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 90133344fdfc9..efb3dffecd856 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -436,7 +436,7 @@ def test_string_inference(tmp_path):
         result = read_orc(path)
     expected = pd.DataFrame(
         data={"a": ["x", "y"]},
-        dtype="string[pyarrow_numpy]",
-        columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
+        dtype=pd.StringDtype(na_value=np.nan),
+        columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index a29e479b7c9f1..4c2ea036f08dc 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1109,8 +1109,8 @@ def test_string_inference(self, tmp_path, pa):
             result = read_parquet(path, engine="pyarrow")
         expected = pd.DataFrame(
             data={"a": ["x", "y"]},
-            dtype="string[pyarrow_numpy]",
-            index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+            dtype=pd.StringDtype(na_value=np.nan),
+            index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
         )
         tm.assert_frame_equal(result, expected)
 
@@ -1140,8 +1140,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
         result = read_parquet(path)
         expected = pd.DataFrame(
             data={"a": [None, "b", "c"]},
-            dtype="string[pyarrow_numpy]",
-            columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
+            dtype=pd.StringDtype(na_value=np.nan),
+            columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
         )
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 980c88f070b89..c28a33069d23f 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -3809,7 +3809,6 @@ class Test(BaseModel):
 def test_read_sql_string_inference(sqlite_engine):
     conn = sqlite_engine
     # GH#54430
-    pytest.importorskip("pyarrow")
     table = "test"
 
     df = DataFrame({"a": ["x", "y"]})
     df.to_sql(table, con=conn, index=False, if_exists="replace")
@@ -3817,7 +3816,7 @@ def test_read_sql_string_inference(sqlite_engine):
     with pd.option_context("future.infer_string", True):
         result = read_sql_table(table, conn)
 
-    dtype = "string[pyarrow_numpy]"
+    dtype = pd.StringDtype(na_value=np.nan)
     expected = DataFrame(
         {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
     )
diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py
index 8583d8bcc052c..17dae1879f3b8 100644
--- a/pandas/tests/libs/test_lib.py
+++ b/pandas/tests/libs/test_lib.py
@@ -1,3 +1,5 @@
+import pickle
+
 import numpy as np
 import pytest
 
@@ -283,3 +285,15 @@ def test_no_default_pickle():
     # GH#40397
     obj = tm.round_trip_pickle(lib.no_default)
     assert obj is lib.no_default
+
+
+def test_ensure_string_array_copy():
+    # ensure the original array is not modified in case of copy=False with
+    # pickle-roundtripped object dtype array
+    # https://github.com/pandas-dev/pandas/issues/54654
+    arr = np.array(["a", None], dtype=object)
+    arr = pickle.loads(pickle.dumps(arr))
+    result = lib.ensure_string_array(arr, copy=False)
+    assert not np.shares_memory(arr, result)
+    assert arr[1] is None
+    assert result[1] is np.nan
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 8af224f1ad64f..d3edee17366f7 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -10,8 +10,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import InvalidIndexError
 
 import pandas as pd
@@ -47,18 +45,11 @@ def test_append_concat(self):
         assert isinstance(result.index, PeriodIndex)
         assert result.index[0] == s1.index[0]
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_concat_copy(self):
         df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
         df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1))
         df3 = DataFrame({5: "foo"}, index=range(4))
 
-        # These are actual copies.
-        result = concat([df, df2, df3], axis=1)
-        for block in result._mgr.blocks:
-            assert block.values.base is not None
-
-        # These are the same.
         result = concat([df, df2, df3], axis=1)
         for block in result._mgr.blocks:
@@ -69,6 +60,8 @@ def test_concat_copy(self):
                 assert arr.base is df2._mgr.blocks[0].values.base
             elif arr.dtype == object:
                 assert arr.base is not None
+            elif arr.dtype == "string":
+                tm.shares_memory(arr, df3._mgr.blocks[0].values)
 
         # Float block was consolidated.
         df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1)))
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index 8d972087b0dff..f7b0876c5a605 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -3064,12 +3062,8 @@ def test_on_float_by_int(self):
 
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_merge_datatype_error_raises(self, using_infer_string):
-        if using_infer_string:
-            msg = "incompatible merge keys"
-        else:
-            msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
+    def test_merge_datatype_error_raises(self):
+        msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
 
         left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]})
         right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]})
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
index 27a34decae7b0..9ce2c925a368b 100644
--- a/pandas/tests/reshape/test_get_dummies.py
+++ b/pandas/tests/reshape/test_get_dummies.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import is_integer_dtype
@@ -216,11 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
 
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_dataframe_dummies_string_dtype(self, df, using_infer_string):
+    def test_dataframe_dummies_string_dtype(self, df, any_string_dtype):
         # GH44965
         df = df[["A", "B"]]
-        df = df.astype({"A": "object", "B": "string"})
+        df = df.astype({"A": "str", "B": any_string_dtype})
         result = get_dummies(df)
         expected = DataFrame(
             {
@@ -231,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, any_string_dtype):
             },
             dtype=bool,
         )
-        if not using_infer_string:
-            # infer_string returns numpy bools
+        if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA:
             expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean")
         tm.assert_frame_equal(result, expected)
 
@@ -712,19 +708,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype):
         )
         tm.assert_frame_equal(result, expected)
 
-    @td.skip_if_no("pyarrow")
-    def test_get_dummies_ea_dtype(self):
+    @pytest.mark.parametrize("dtype_type", ["string", "category"])
+    def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object):
         # GH#56273
-        for dtype, exp_dtype in [
-            ("string[pyarrow]", "boolean"),
-            ("string[pyarrow_numpy]", "bool"),
-            (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"),
-            (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"),
-        ]:
-            df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1})
-            result = get_dummies(df)
-            expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)})
-            tm.assert_frame_equal(result, expected)
+        dtype = string_dtype_no_object
+        exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool"
+        if dtype_type == "category":
+            dtype = CategoricalDtype(Index(["a"], dtype))
+        df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1})
+        result = get_dummies(df)
+        expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)})
+        tm.assert_frame_equal(result, expected)
 
     @td.skip_if_no("pyarrow")
     def test_get_dummies_arrow_dtype(self):
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index be4f2ab4d183d..95aa5291cb45a 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -21,7 +19,7 @@ def df():
     res = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     res["id1"] = (res["A"] > 0).astype(np.int64)
@@ -83,7 +81,6 @@ def test_default_col_names(self, df):
         result2 = df.melt(id_vars=["id1", "id2"])
         assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_value_vars(self, df):
         result3 = df.melt(id_vars=["id1", "id2"], value_vars="A")
         assert len(result3) == 10
@@ -100,7 +97,6 @@
         )
         tm.assert_frame_equal(result4, expected4)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize("type_", (tuple, list, np.array))
     def test_value_vars_types(self, type_, df):
         # GH 15348
@@ -178,7 +174,6 @@
         with pytest.raises(ValueError, match=msg):
             df1.melt(id_vars=id_vars, value_vars=value_vars)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_custom_var_name(self, df, var_name):
         result5 = df.melt(var_name=var_name)
         assert result5.columns.tolist() == ["var", "value"]
@@ -206,7 +201,6 @@
         )
         tm.assert_frame_equal(result9, expected9)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_custom_value_name(self, df, value_name):
         result10 = df.melt(value_name=value_name)
         assert result10.columns.tolist() == ["variable", "val"]
@@ -236,7 +230,6 @@
         )
         tm.assert_frame_equal(result14, expected14)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_custom_var_and_value_name(self, df, value_name, var_name):
         result15 = df.melt(var_name=var_name, value_name=value_name)
         assert result15.columns.tolist() == ["var", "val"]
@@ -361,7 +354,6 @@ def test_melt_missing_columns_raises(self):
         with pytest.raises(KeyError, match=msg):
             df.melt(["A"], ["F"], col_level=0)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_melt_mixed_int_str_id_vars(self):
         # GH 29718
         df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]})
@@ -369,6 +361,8 @@
         expected = DataFrame(
             {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]}
         )
+        # the df's columns are mixed type and thus object -> preserves object dtype
+        expected["variable"] = expected["variable"].astype(object)
         tm.assert_frame_equal(result, expected)
 
    def test_melt_mixed_int_str_value_vars(self):
@@ -1222,12 +1216,10 @@ def test_raise_of_column_name_value(self):
         ):
             df.melt(id_vars="value", value_name="value")
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-    @pytest.mark.parametrize("dtype", ["O", "string"])
-    def test_missing_stubname(self, dtype):
+    def test_missing_stubname(self, any_string_dtype):
         # GH46044
         df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]})
-        df = df.astype({"id": dtype})
+        df = df.astype({"id": any_string_dtype})
         result = wide_to_long(
             df,
             stubnames=["a", "b"],
@@ -1243,15 +1235,16 @@
             {"a": [100, 200, 300, 400], "b": [np.nan] * 4},
             index=index,
         )
-        new_level = expected.index.levels[0].astype(dtype)
+        new_level = expected.index.levels[0].astype(any_string_dtype)
+        if any_string_dtype == "object":
+            new_level = expected.index.levels[0].astype("str")
         expected.index = expected.index.set_levels(new_level, level=0)
 
         tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_wide_to_long_pyarrow_string_columns():
+def test_wide_to_long_string_columns(string_storage):
     # GH 57066
-    pytest.importorskip("pyarrow")
+    string_dtype = pd.StringDtype(string_storage, na_value=np.nan)
     df = DataFrame(
         {
             "ID": {0: 1},
             "R_test1": {0: 1},
             "R_test2": {0: 1},
             "R_test3": {0: 2},
             "D": {0: 1},
         }
     )
-    df.columns = df.columns.astype("string[pyarrow_numpy]")
+    df.columns = df.columns.astype(string_dtype)
     result = wide_to_long(
         df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*"
     )
     expected = DataFrame(
         [[1, 1], [1, 1], [1, 2]],
-        columns=Index(["D", "R"], dtype=object),
+        columns=Index(["D", "R"]),
         index=pd.MultiIndex.from_arrays(
             [
                 [1, 1, 1],
-                Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"),
+                Index(["test1", "test2", "test3"], dtype=string_dtype),
             ],
             names=["ID", "UNPIVOTED"],
         ),
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 8cfe565ebdd65..eccf676b87f89 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1068,7 +1068,6 @@ def test_margins_dtype_len(self, data):
 
         tm.assert_frame_equal(expected, result)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)])
     def test_pivot_table_multiindex_only(self, cols):
         # GH 17038
@@ -1078,7 +1077,7 @@
         expected = DataFrame(
             [[4.0, 5.0, 6.0]],
             columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols),
-            index=Index(["v"], dtype=object),
+            index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"),
         )
 
         tm.assert_frame_equal(result, expected)
@@ -2570,13 +2569,16 @@ def test_pivot_empty(self):
         expected = DataFrame(index=[], columns=[])
         tm.assert_frame_equal(result, expected, check_names=False)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-    @pytest.mark.parametrize("dtype", [object, "string"])
-    def test_pivot_integer_bug(self, dtype):
-        df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype)
+    def test_pivot_integer_bug(self, any_string_dtype):
+        df = DataFrame(
+            data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype
+        )
 
         result = df.pivot(index=1, columns=0, values=2)
-        tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype))
+        expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype)
+        if any_string_dtype == "object":
+            expected_columns = expected_columns.astype("str")
+        tm.assert_index_equal(result.columns, expected_columns)
 
     def test_pivot_index_none(self):
         # GH#3962
@@ -2658,7 +2660,9 @@ def test_pivot_columns_not_given(self):
         with pytest.raises(TypeError, match="missing 1 required keyword-only argument"):
             df.pivot()
 
-    @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN")
+    @pytest.mark.xfail(
+        using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
+    )
     def test_pivot_columns_is_none(self):
         # GH#48293
         df = DataFrame({None: [1], "b": 2, "c": 3})
@@ -2674,7 +2678,9 @@ def test_pivot_columns_is_none(self):
         expected = DataFrame({1: 3}, index=Index([2], name="b"))
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN")
+    @pytest.mark.xfail(
+        using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
+    )
     def test_pivot_index_is_none(self):
         # GH#48293
         df = DataFrame({None: [1], "b": 2, "c": 3})
@@ -2688,7 +2694,9 @@ def test_pivot_index_is_none(self):
         expected = DataFrame(3, index=[1], columns=Index([2], name="b"))
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN")
+    @pytest.mark.xfail(
+        using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
+    )
     def test_pivot_values_is_none(self):
         # GH#48293
         df = DataFrame({None: [1], "b": 2, "c": 3})
diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py
index 9b9a8ea3600ae..885adb3543b46 100644
--- a/pandas/tests/series/accessors/test_dt_accessor.py
+++ b/pandas/tests/series/accessors/test_dt_accessor.py
@@ -790,7 +790,8 @@ def test_end_time_timevalues(self, input_vals):
         # GH#17157
         # Check that the time part of the Period is adjusted by end_time
         # when using the dt accessor on a Series
-        input_vals = PeriodArray._from_sequence(np.asarray(input_vals))
+        dtype = pd.PeriodDtype(input_vals[0].freq)
+        input_vals = PeriodArray._from_sequence(np.asarray(input_vals), dtype=dtype)
 
         ser = Series(input_vals)
         result = ser.dt.end_time
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
index 71ba2dab671ef..789e3ac752097 100644
--- a/pandas/tests/series/indexing/test_setitem.py
+++ b/pandas/tests/series/indexing/test_setitem.py
@@ -4,13 +4,17 @@
     datetime,
 )
 from decimal import Decimal
+import os
 
 import numpy as np
 import pytest
 
 from pandas._config import using_string_dtype
 
-from pandas.compat import HAS_PYARROW
+from pandas.compat import (
+    HAS_PYARROW,
+    WASM,
+)
 from pandas.compat.numpy import np_version_gte1p24
 from pandas.errors import IndexingError
 
@@ -1446,7 +1450,11 @@ def obj(self):
             marks=pytest.mark.xfail(
                 (
                     not np_version_gte1p24
-                    or (np_version_gte1p24 and np._get_promotion_state() != "weak")
+                    or (
+                        np_version_gte1p24
+                        and os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak"
+                    )
+                    or WASM
                 ),
                 reason="np.float32(1.1) ends up as 1.100000023841858, so "
                 "np_can_hold_element raises and we cast to float64",
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index 579d41f964df0..4a7e204ee4161 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class):
 
         dt1 = dtype_class({"abc": str})
         result = ser.astype(dt1)
-        expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object)
+        expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str")
         tm.assert_series_equal(result, expected)
 
         dt2 = dtype_class({"abc": "float64"})
@@ -173,10 +173,14 @@ def test_astype_empty_constructor_equality(self, dtype):
     def test_astype_str_map(self, dtype, data, using_infer_string):
         # see GH#4405
         series = Series(data)
+        using_string_dtype = using_infer_string and dtype is str
         result = series.astype(dtype)
-        expected = series.map(str)
-        if using_infer_string:
-            expected = expected.astype(object)
+        if using_string_dtype:
+            expected = series.map(lambda val: str(val) if val is not np.nan else np.nan)
+        else:
+            expected = series.map(str)
+            if using_infer_string:
+                expected = expected.astype(object)
         tm.assert_series_equal(result, expected)
 
     def test_astype_float_to_period(self):
@@ -213,7 +217,7 @@ def test_astype_dt64_to_str(self):
         # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
         dti = date_range("2012-01-01", periods=3)
         result = Series(dti).astype(str)
-        expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object)
+        expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str")
         tm.assert_series_equal(result, expected)
 
     def test_astype_dt64tz_to_str(self):
@@ -226,7 +230,7 @@ def test_astype_dt64tz_to_str(self):
                 "2012-01-02 00:00:00-05:00",
                 "2012-01-03 00:00:00-05:00",
             ],
-            dtype=object,
+            dtype="str",
         )
         tm.assert_series_equal(result, expected)
 
@@ -286,13 +290,13 @@ def test_astype_str_cast_dt64(self):
         ts = Series([Timestamp("2010-01-04 00:00:00")])
 
         res = ts.astype(str)
 
-        expected = Series(["2010-01-04"], dtype=object)
+        expected = Series(["2010-01-04"], dtype="str")
         tm.assert_series_equal(res, expected)
 
         ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
 
         res = ts.astype(str)
 
-        expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object)
+        expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str")
         tm.assert_series_equal(res, expected)
 
     def test_astype_str_cast_td64(self):
@@ -301,7 +305,7 @@ def test_astype_str_cast_td64(self):
         td = Series([Timedelta(1, unit="D")])
 
         ser = td.astype(str)
 
-        expected = Series(["1 days"], dtype=object)
+        expected = Series(["1 days"], dtype="str")
         tm.assert_series_equal(ser, expected)
 
     def test_dt64_series_astype_object(self):
@@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, any_float_dtype):
         # https://github.com/pandas-dev/pandas/issues/36451
         ser = Series([0.1], dtype=any_float_dtype)
         result = ser.astype(str)
-        expected = Series(["0.1"], dtype=object)
+        expected = Series(["0.1"], dtype="str")
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
@@ -358,11 +362,13 @@
             (NA, ""),
         ],
     )
-    def test_astype_to_str_preserves_na(self, value, string_value):
+    def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string):
         # https://github.com/pandas-dev/pandas/issues/36904
         ser = Series(["a", "b", value], dtype=object)
         result = ser.astype(str)
-        expected = Series(["a", "b", string_value], dtype=object)
+        expected = Series(
+            ["a", "b", None if using_infer_string else string_value], dtype="str"
+        )
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"])
diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py
index b94723b7cbddf..0c52eacd7e516 100644
--- a/pandas/tests/series/methods/test_equals.py
+++ b/pandas/tests/series/methods/test_equals.py
@@ -1,11 +1,9 @@
-from contextlib import nullcontext
 import copy
 
 import numpy as np
 import pytest
 
 from pandas._libs.missing import is_matching_na
-from pandas.compat.numpy import np_version_gte1p25
 
 from pandas.core.dtypes.common import is_float
 
@@ -14,7 +12,6 @@
     MultiIndex,
     Series,
 )
-import pandas._testing as tm
 
 
 @pytest.mark.parametrize(
@@ -48,14 +45,7 @@ def test_equals_list_array(val):
     assert s1.equals(s2)
 
     s1[1] = val
-
-    cm = (
-        tm.assert_produces_warning(FutureWarning, check_stacklevel=False)
-        if isinstance(val, str) and not np_version_gte1p25
-        else nullcontext()
-    )
-    with cm:
-        assert not s1.equals(s2)
+    assert not s1.equals(s2)
 
 
 def test_equals_false_negative():
diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py
index fe84ffafa70b4..84b60a2afe6eb 100644
--- a/pandas/tests/series/methods/test_map.py
+++ b/pandas/tests/series/methods/test_map.py
@@ -549,13 +549,11 @@ def f(x):
         (list(range(3)), {0: 42}, [42] + [np.nan] * 3),
     ],
 )
-def test_map_missing_mixed(vals, mapping, exp, using_infer_string):
+def test_map_missing_mixed(vals, mapping, exp):
     # GH20495
     s = Series(vals + [np.nan])
     result = s.map(mapping)
     exp = Series(exp)
-    if using_infer_string and mapping == {np.nan: "not NaN"}:
-        exp.iloc[-1] = np.nan
 
     tm.assert_series_equal(result, exp)
 
@@ -599,3 +597,10 @@ def test_map_type():
     result = s.map(type)
     expected = Series([int, str, type], index=["a", "b", "c"])
     tm.assert_series_equal(result, expected)
+
+
+def test_map_kwargs():
+    # GH 59814
+    result = Series([2, 4, 5]).map(lambda x, y: x + y, y=2)
+    expected = Series([4, 6, 7])
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py
index 2d7fde130ce70..7c6a7893ba3a0 100644
--- a/pandas/tests/series/methods/test_rank.py
+++ b/pandas/tests/series/methods/test_rank.py
@@ -33,7 +33,8 @@ def ser():
         ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])],
         ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])],
         ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])],
-    ]
+    ],
+    ids=lambda x: x[0],
 )
 def results(request):
     return request.param
@@ -48,12 +49,29 @@ def results(request):
         "Int64",
         pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
         pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        "string[python]",
+        "str",
     ]
 )
 def dtype(request):
     return request.param
 
 
+def expected_dtype(dtype, method, pct=False):
+    exp_dtype = "float64"
+    # elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]:
+    if dtype in ["string[pyarrow]"]:
+        exp_dtype = "Float64"
+    elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]:
+        if method == "average" or pct:
+            exp_dtype = "double[pyarrow]"
+        else:
+            exp_dtype = "uint64[pyarrow]"
+
+    return exp_dtype
+
+
 class TestSeriesRank:
     def test_rank(self, datetime_series):
         sp_stats = pytest.importorskip("scipy.stats")
@@ -251,12 +269,14 @@ def test_rank_signature(self):
         with pytest.raises(ValueError, match=msg):
             s.rank("average")
 
-    @pytest.mark.parametrize("dtype", [None, object])
-    def test_rank_tie_methods(self, ser, results, dtype):
+    def test_rank_tie_methods(self, ser, results, dtype, using_infer_string):
         method, exp = results
+        if dtype == "int64" or (not using_infer_string and dtype == "str"):
+            pytest.skip("int64/str does not support NaN")
+
         ser = ser if dtype is None else ser.astype(dtype)
 
         result = ser.rank(method=method)
-        tm.assert_series_equal(result, Series(exp))
+        tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method)))
 
     @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"])
     @pytest.mark.parametrize(
@@ -357,25 +377,35 @@ def test_rank_methods_series(self, rank_method, op, value):
         ],
     )
     def test_rank_dense_method(self, dtype, ser, exp):
+        if ser[0] < 0 and dtype.startswith("str"):
+            exp = exp[::-1]
         s = Series(ser).astype(dtype)
         result = s.rank(method="dense")
-        expected = Series(exp).astype(result.dtype)
+        expected = Series(exp).astype(expected_dtype(dtype, "dense"))
         tm.assert_series_equal(result, expected)
 
-    def test_rank_descending(self, ser, results, dtype):
+    def test_rank_descending(self, ser, results, dtype, using_infer_string):
         method, _ = results
-        if "i" in dtype:
+        if dtype == "int64" or (not using_infer_string and dtype == "str"):
             s = ser.dropna()
         else:
             s = ser.astype(dtype)
 
         res = s.rank(ascending=False)
-        expected = (s.max() - s).rank()
-        tm.assert_series_equal(res, expected)
+        if dtype.startswith("str"):
+            expected = (s.astype("float64").max() - s.astype("float64")).rank()
+        else:
+            expected = (s.max() - s).rank()
+        tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average")))
 
-        expected = (s.max() - s).rank(method=method)
+        if dtype.startswith("str"):
+            expected = (s.astype("float64").max() - s.astype("float64")).rank(
+                method=method
+            )
+        else:
+            expected = (s.max() - s).rank(method=method)
         res2 = s.rank(method=method, ascending=False)
-        tm.assert_series_equal(res2, expected)
+        tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method)))
 
     def test_rank_int(self, ser, results):
         method, exp = results
@@ -432,9 +462,11 @@ def test_rank_ea_small_values(self):
     ],
 )
 def test_rank_dense_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="dense", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True))
     tm.assert_series_equal(result, expected)
 
@@ -453,9 +485,11 @@
     ],
 )
 def test_rank_min_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="min", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True))
     tm.assert_series_equal(result, expected)
 
@@ -474,9 +508,11 @@
     ],
 )
 def test_rank_max_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="max", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True))
     tm.assert_series_equal(result, expected)
 
@@ -495,9 +531,11 @@
     ],
 )
 def test_rank_average_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="average", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True))
     tm.assert_series_equal(result, expected)
 
@@ -516,9 +554,11 @@
     ],
 )
 def test_rank_first_pct(dtype, ser, exp):
+    if ser[0] < 0 and dtype.startswith("str"):
+        exp = exp[::-1]
     s = Series(ser).astype(dtype)
     result = s.rank(method="first", pct=True)
-    expected = Series(exp).astype(result.dtype)
+    expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True))
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/test_arrow_interface.py b/pandas/tests/series/test_arrow_interface.py
index 34a2a638e4185..e73cf9bee6aeb 100644
--- a/pandas/tests/series/test_arrow_interface.py
+++ b/pandas/tests/series/test_arrow_interface.py
@@ -21,3 +21,41 @@ def test_series_arrow_interface():
     ca = pa.chunked_array(s)
     expected = pa.chunked_array([[1, 4, 2]])
     assert ca.equals(expected)
+    ca = pa.chunked_array(s, type=pa.int32())
+    expected = pa.chunked_array([[1, 4, 2]], type=pa.int32())
+    assert ca.equals(expected)
+
+
+def test_series_arrow_interface_arrow_dtypes():
+    s = pd.Series([1, 4, 2], dtype="Int64[pyarrow]")
+
+    capsule = s.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    ca = pa.chunked_array(s)
+    expected = pa.chunked_array([[1, 4, 2]])
+    assert ca.equals(expected)
+    ca = pa.chunked_array(s, type=pa.int32())
+    expected = pa.chunked_array([[1, 4, 2]], type=pa.int32())
+    assert ca.equals(expected)
+
+
+def test_series_arrow_interface_stringdtype():
+    s = pd.Series(["foo", "bar"], dtype="string[pyarrow]")
+
+    capsule = s.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    ca = pa.chunked_array(s)
+    expected = pa.chunked_array([["foo", "bar"]], type=pa.large_string())
+    assert ca.equals(expected)
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 1771a4dfdb71f..69f42b5e42878 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -229,7 +229,7 @@ def test_constructor_empty(self, input_class, using_infer_string):
         # GH 19853 : with empty string, index and dtype str
         empty = Series("", dtype=str, index=range(3))
         if using_infer_string:
-            empty2 = Series("", index=range(3), dtype=object)
+            empty2 = Series("", index=range(3), dtype="str")
         else:
             empty2 = Series("", index=range(3))
         tm.assert_series_equal(empty, empty2)
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py
index 1586195e79a9d..8516018e8aa93 100644
--- a/pandas/tests/series/test_logical_ops.py
+++ b/pandas/tests/series/test_logical_ops.py
@@ -9,6 +9,7 @@
     DataFrame,
     Index,
     Series,
+    StringDtype,
     bdate_range,
 )
 import pandas._testing as tm
@@ -514,7 +515,7 @@ def test_pyarrow_numpy_string_invalid(self):
         # GH#56008
         pa = pytest.importorskip("pyarrow")
         ser = Series([False, True])
-        ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]")
+        ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan))
         result = ser == ser2
 
         expected_eq = Series(False, index=ser.index)
         tm.assert_series_equal(result, expected_eq)
diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py
index 36a2afb2162c2..a5976bb2518c9 100644
--- a/pandas/tests/series/test_ufunc.py
+++ b/pandas/tests/series/test_ufunc.py
@@ -16,7 +16,10 @@ def ufunc(request):
     return request.param
 
 
-@pytest.fixture(params=[True, False], ids=["sparse", "dense"])
+@pytest.fixture(
+    params=[pytest.param(True, marks=pytest.mark.fails_arm_wheels), False],
+    ids=["sparse", "dense"],
+)
 def sparse(request):
     return request.param
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index bf01c4996bb32..f3698a2ea33cf 100644
--- a/pandas/tests/strings/test_find_replace.py
b/pandas/tests/strings/test_find_replace.py @@ -21,10 +21,6 @@ # -------------------------------------------------------------------------------------- -def using_pyarrow(dtype): - return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -458,13 +454,10 @@ def test_replace_mixed_object(): tm.assert_series_equal(result, expected) -def test_replace_unicode(any_string_dtype, performance_warning): +def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) + result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -478,16 +471,13 @@ def test_replace_wrong_repl_type_raises(any_string_dtype, index_or_series, repl, obj.str.replace("a", repl) -def test_replace_callable(any_string_dtype, performance_warning): +def test_replace_callable(any_string_dtype): # GH 15055 ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) + result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -495,7 +485,7 @@ def test_replace_callable(any_string_dtype, performance_warning): @pytest.mark.parametrize( "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None] ) -def test_replace_callable_raises(any_string_dtype, performance_warning, repl): +def test_replace_callable_raises(any_string_dtype, repl): # GH 15055 values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) @@ -504,43 +494,31 @@ def test_replace_callable_raises(any_string_dtype, performance_warning, repl): r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " r"(?(3)required )positional arguments?" 
) - if not using_pyarrow(any_string_dtype): - performance_warning = False with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(performance_warning): - values.str.replace("a", repl, regex=True) + values.str.replace("a", repl, regex=True) -def test_replace_callable_named_groups(any_string_dtype, performance_warning): +def test_replace_callable_named_groups(any_string_dtype): # test regex named groups ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, repl, regex=True) + result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) -def test_replace_compiled_regex(any_string_dtype, performance_warning): +def test_replace_compiled_regex(any_string_dtype): # GH 15446 ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, "", regex=True) + result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, "", n=1, regex=True) + result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -557,14 +535,11 @@ def test_replace_compiled_regex_mixed_object(): tm.assert_series_equal(result, expected) -def test_replace_compiled_regex_unicode(any_string_dtype, performance_warning): +def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, ", ", regex=True) + result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -586,15 +561,12 @@ def test_replace_compiled_regex_raises(any_string_dtype): ser.str.replace(pat, "", case=True, regex=True) -def test_replace_compiled_regex_callable(any_string_dtype, performance_warning): +def test_replace_compiled_regex_callable(any_string_dtype): # test with callable ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, repl, n=2, regex=True) + result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -626,7 +598,7 @@ def test_replace_literal_compiled_raises(any_string_dtype): ser.str.replace(pat, "", regex=False) -def test_replace_moar(any_string_dtype, performance_warning): +def test_replace_moar(any_string_dtype): # PR #1179 ser = Series( ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], @@ -640,10 +612,7 @@ )
tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("A", "YYY", case=False) + result = ser.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", @@ -661,10 +630,7 @@ def test_replace_moar(any_string_dtype, performance_warning): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) + result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A", @@ -683,21 +649,15 @@ def test_replace_moar(any_string_dtype, performance_warning): tm.assert_series_equal(result, expected) -def test_replace_not_case_sensitive_not_regex(any_string_dtype, performance_warning): +def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("a", "c", case=False, regex=False) + result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("a.", "c.", case=False, regex=False) + result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -853,7 +813,7 @@ def test_fullmatch_na_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) -def test_fullmatch_case_kwarg(any_string_dtype, performance_warning): +def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" @@ -869,10 +829,7 @@ def test_fullmatch_case_kwarg(any_string_dtype, performance_warning): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.fullmatch("ab", flags=re.IGNORECASE) + result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -1046,7 +1003,7 @@ def test_translate_mixed_object(): # -------------------------------------------------------------------------------------- -def test_flags_kwarg(any_string_dtype, performance_warning): +def test_flags_kwarg(any_string_dtype): data = { "Dave": "dave@google.com", "Steve": "steve@gmail.com", @@ -1057,17 +1014,13 @@ def test_flags_kwarg(any_string_dtype, performance_warning): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - use_pyarrow = using_pyarrow(any_string_dtype) - result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(performance_warning, use_pyarrow): - result = data.str.match(pat, flags=re.IGNORECASE) + result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(performance_warning, use_pyarrow): - result = data.str.fullmatch(pat, flags=re.IGNORECASE) + result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert 
result.iloc[0] result = data.str.findall(pat, flags=re.IGNORECASE) @@ -1077,8 +1030,6 @@ def test_flags_kwarg(any_string_dtype, performance_warning): assert result.iloc[0] == 1 msg = "has match groups" - with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow - ): + with tm.assert_produces_warning(UserWarning, match=msg): result = data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 31386e4e342ae..3b989e284ca25 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,4 +1,9 @@ import numpy as np +import pytest + +from pandas._config import using_string_dtype + +import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -8,6 +13,11 @@ _testing as tm, ) +try: + import pyarrow as pa +except ImportError: + pa = None + def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) @@ -32,22 +42,86 @@ def test_get_dummies_index(): tm.assert_index_equal(result, expected) -def test_get_dummies_with_name_dummy(any_string_dtype): - # GH 12180 - # Dummies named 'name' should work as expected - s = Series(["a", "b,name", "b"], dtype=any_string_dtype) - result = s.str.get_dummies(",") - expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"]) +# GH#47872 +@pytest.mark.parametrize( + "dtype", + [ + np.uint8, + np.int16, + np.uint16, + np.int32, + np.uint32, + np.int64, + np.uint64, + bool, + "Int8", + "Int16", + "Int32", + "Int64", + "boolean", + ], +) +def test_get_dummies_with_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype + ) tm.assert_frame_equal(result, expected) -def test_get_dummies_with_name_dummy_index(): - # GH 12180 - # Dummies named 'name' should work as expected - idx = Index(["a|b", "name|c", "b|name"]) - result = idx.str.get_dummies("|") +# GH#47872 +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "dtype", + [ + "int8[pyarrow]", + "uint8[pyarrow]", + "int16[pyarrow]", + "uint16[pyarrow]", + "int32[pyarrow]", + "uint32[pyarrow]", + "int64[pyarrow]", + "uint64[pyarrow]", + "bool[pyarrow]", + ], +) +def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=dtype, + ) + tm.assert_frame_equal(result, expected) - expected = MultiIndex.from_tuples( - [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") + +# GH#47872 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +def test_get_dummies_with_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=str) + expected = DataFrame( + [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], + columns=list("abc"), + dtype=str, ) - tm.assert_index_equal(result, expected) + tm.assert_frame_equal(result, expected) + + +# GH#47872 +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pa_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype="str[pyarrow]") + expected = DataFrame( + [ + ["true", 
"true", "false"], + ["true", "false", "true"], + ["false", "false", "false"], + ], + columns=list("abc"), + dtype="str[pyarrow]", + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 0b3f368afea5e..517ddb164985c 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -12,7 +12,6 @@ ) -@pytest.mark.filterwarnings("ignore:Falling back") def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1ce46497c3c22..4995b448f7e94 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -394,6 +394,7 @@ def test_pipe_failures(any_string_dtype): (2, 5, None, ["foo", "bar", np.nan, "baz"]), (0, 3, -1, ["", "", np.nan, ""]), (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), + (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]), (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 06fd81ed722d9..dac74a0e32a42 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1877,13 +1877,16 @@ def test_strobj_mode(self): tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) - def test_strobj_multi_char(self, dt): + def test_strobj_multi_char(self, dt, using_infer_string): exp = ["bar"] data = ["foo"] * 2 + ["bar"] * 3 ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + if using_infer_string and dt is str: + tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) + else: + tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 585b7ca94f730..f3645bf0649bd 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -384,6 +384,21 @@ def test_timedelta(transform_assert_equal): assert_equal(result, expected) +@pytest.mark.parametrize( + "scalar", + [ + pd.Timedelta(1, "D"), + pd.Timestamp("2017-01-01T12"), + pd.Timestamp("2017-01-01T12", tz="US/Pacific"), + ], +) +def test_timedelta_timestamp_scalar(scalar): + # GH#59944 + result = to_numeric(scalar) + expected = to_numeric(Series(scalar))[0] + assert result == expected + + def test_period(request, transform_assert_equal): transform, assert_equal = transform_assert_equal diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 9b64beaf09273..07425af8ed37a 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -37,10 +37,13 @@ ) def test_parsing_tzlocal_deprecated(): # GH#50791 - msg = ( - r"Parsing 'EST' as tzlocal \(dependent on system timezone\) " - r"is no longer supported\. " - "Pass the 'tz' keyword or call tz_localize after construction instead" + msg = "|".join( + [ + r"Parsing 'EST' as tzlocal \(dependent on system timezone\) " + r"is no longer supported\. 
" + "Pass the 'tz' keyword or call tz_localize after construction instead", + ".*included an un-recognized timezone", + ] ) dtstr = "Jan 15 2004 03:00 EST" diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index bcc2e4e03f367..091670ed69f11 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -534,6 +534,10 @@ def test_assert_almost_equal_iterable_values_mismatch(): np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object), np.array([[1, 2, 3], [4, 5]], dtype=object), ), + ( + np.array([np.array([], dtype=object), None], dtype=object), + np.array([[], None], dtype=object), + ), ( np.array( [ diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 00a897d574a07..8f1ac93b40247 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas.util._test_decorators as td import pandas as pd @@ -20,10 +22,10 @@ def test_shares_memory_string(): # GH#55823 import pyarrow as pa - obj = pd.array(["a", "b"], dtype="string[pyarrow]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA)) assert tm.shares_memory(obj, obj) - obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan)) assert tm.shares_memory(obj, obj) obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string())) diff --git a/pyproject.toml b/pyproject.toml index 645ded35f3d18..d0fcdc4b21b33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,16 +156,23 @@ test-command = """ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ free-threaded-support = true -before-build = "bash {package}/scripts/cibw_before_build.sh" -before-test = "bash {package}/scripts/cibw_before_test.sh" +before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh" [tool.cibuildwheel.windows] before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" +[[tool.cibuildwheel.overrides]] +select = "*-manylinux_aarch64*" +test-command = """ + PANDAS_CI='1' python -c 'import pandas as pd; \ + pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db and not fails_arm_wheels", "-n 2", "--no-strict-data-files"]); \ + pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ + """ + [[tool.cibuildwheel.overrides]] select = "*-musllinux*" -before-test = "apk update && apk add musl-locales && bash {package}/scripts/cibw_before_test.sh" +before-test = "apk update && apk add musl-locales" [[tool.cibuildwheel.overrides]] select = "*-win*" @@ -478,6 +485,10 @@ markers = [ "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", "skip_ubsan: Tests known to fail UBSAN check", + # TODO: someone should investigate this ... 
+ # these tests only fail in the wheel builder and don't fail in regular + # ARM CI + "fails_arm_wheels: Tests that fail in the ARM wheel build only", ] [tool.mypy] diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index f3049b27ed5d1..679b91e3280ec 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -1,8 +1,11 @@ -# TODO: Delete when there's PyPI NumPy/Cython releases the support Python 3.13. -# If free-threading support is not included in those releases, this script will have -# to whether this runs for a free-threaded build instead. -PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")" -if [[ $PYTHON_VERSION == "313" ]]; then +# Add 3rd party licenses, like numpy does +for file in $PACKAGE_DIR/LICENSES/*; do + cat $file >> $PACKAGE_DIR/LICENSE +done + +# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. +FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" +if [[ $FREE_THREADED_BUILD == "True" ]]; then python -m pip install -U pip python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython python -m pip install ninja meson-python versioneer[toml] diff --git a/scripts/cibw_before_test.sh b/scripts/cibw_before_test.sh deleted file mode 100644 index 7d1b143881ced..0000000000000 --- a/scripts/cibw_before_test.sh +++ /dev/null @@ -1,8 +0,0 @@ -# TODO: Delete when there's PyPI NumPy/Cython releases the support Python 3.13. -# If free-threading support is not included in those releases, this script will have -# to whether this runs for a free-threaded build instead. -PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")" -if [[ $PYTHON_VERSION == "313" ]]; then - python -m pip install -U pip - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy -fi diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 35f6ffb4980df..076acc359f933 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -29,8 +29,6 @@ "_shared_docs", "_new_Index", "_new_PeriodIndex", - "_agg_template_series", - "_agg_template_frame", "_pipe_template", "_apply_groupings_depr", "__main__", diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 73a3cb6429790..2ea10954fc929 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -367,6 +367,97 @@ pandas-gbq provides high performance reads and writes to and from these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`. Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq`, instead. + +### [ArcticDB](https://github.com/man-group/ArcticDB) + +ArcticDB is a serverless DataFrame database engine designed for the Python Data Science ecosystem. ArcticDB enables you to store, retrieve, and process pandas DataFrames at scale. It is a storage engine designed for object storage and also supports local-disk storage using LMDB. ArcticDB requires zero additional infrastructure beyond a running Python environment and access to object storage and can be installed in seconds. Please find full documentation [here](https://docs.arcticdb.io/latest/). 
+ +#### ArcticDB Terminology + +ArcticDB is structured to provide a scalable and efficient way to manage and retrieve DataFrames, organized into several key components: + +- `Object Store` Collections of libraries. Used to separate logical environments from each other. Analogous to a database server. +- `Library` Contains multiple symbols which are grouped in a certain way (different users, markets, etc.). Analogous to a database. +- `Symbol` Atomic unit of data storage. Identified by a string name. Data stored under a symbol strongly resembles a pandas DataFrame. Analogous to tables. +- `Version` Every modifying action (write, append, update) performed on a symbol creates a new version of that object. + +#### Installation + +To install, simply run: + +```console +pip install arcticdb +``` + +To get started, we can import ArcticDB and instantiate it: + +```python +import arcticdb as adb +import numpy as np +import pandas as pd +# this will set up the storage using the local file system +arctic = adb.Arctic("lmdb://arcticdb_test") +``` + +> **Note:** ArcticDB supports any S3 API-compatible storage, including AWS. ArcticDB also supports Azure Blob storage. +> ArcticDB also supports LMDB for local/file-based storage - to use LMDB, pass an LMDB path as the URI: `adb.Arctic('lmdb://path/to/desired/database')`. + +#### Library Setup + +ArcticDB is geared towards storing a large number of tables (potentially millions). Individual tables (DataFrames) are called symbols and are stored in collections called libraries. A single library can store many symbols. Libraries must be initialized prior to use: + +```python +lib = arctic.get_library('sample', create_if_missing=True) +``` + +#### Writing Data to ArcticDB + +Now that we have a library set up, we can read and write data. ArcticDB has a set of simple functions for DataFrame storage. Let's write a DataFrame to storage. + +```python +df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3) + } +) + +df +df.dtypes +``` + +Write to ArcticDB: + +```python +write_record = lib.write("test", df) +``` + +> **Note:** When writing pandas DataFrames, ArcticDB supports the following index types: +> +> - `pandas.Index` containing int64 (or the corresponding dedicated types Int64Index, UInt64Index) +> - `RangeIndex` +> - `DatetimeIndex` +> - `MultiIndex` composed of the above supported types +> +> The "row" concept in `head`/`tail` refers to the row number ('iloc'), not the value in the `pandas.Index` ('loc'). + +#### Reading Data from ArcticDB + +Read the data back from storage: + +```python +read_record = lib.read("test") +read_record.data +read_record.data.dtypes +``` + +ArcticDB also supports appending, updating, and querying stored data as pandas DataFrames; a short sketch follows, and you can find more information [here](https://docs.arcticdb.io/latest/api/query_builder/).
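+
+Below is a minimal, hypothetical sketch of those operations, reusing the `adb` import and the `lib` library from the snippets above. The symbol name `"test_ts"` and the column `"x"` are illustrative only; see the linked documentation for the authoritative API.
+
+```python
+# Write a DatetimeIndex-backed symbol, since append/update rely on index ordering.
+tdf = pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=pd.date_range("20130101", periods=3))
+lib.write("test_ts", tdf)
+
+# Append rows whose index starts after the existing data.
+more = pd.DataFrame({"x": [4.0, 5.0]}, index=pd.date_range("20130104", periods=2))
+lib.append("test_ts", more)
+
+# Update overwrites the stored rows that fall within the new data's index range;
+# like write and append, this creates a new version of the symbol.
+patch = pd.DataFrame({"x": [99.0]}, index=pd.date_range("20130102", periods=1))
+lib.update("test_ts", patch)
+
+# Filter on the storage side with a QueryBuilder before materializing a DataFrame.
+q = adb.QueryBuilder()
+q = q[q["x"] > 2.0]
+filtered = lib.read("test_ts", query_builder=q).data
+
+# Earlier versions stay readable, e.g. version 1 is the state before the update.
+previous = lib.read("test_ts", as_of=1).data
+```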
+ + ## Out-of-core ### [Bodo](https://bodo.ai/) diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 74e7fda2e7983..a49aadd45204a 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -89,7 +89,6 @@ maintainers: - phofl - attack68 - fangchenli - - twoertwein - lithomas1 - lukemanley - noatamir @@ -108,6 +107,7 @@ maintainers: - wesm - gfyoung - mzeitlin11 + - twoertwein workgroups: coc: name: Code of Conduct diff --git a/web/pandas/getting_started.md b/web/pandas/getting_started.md index 0c4219e1ae12e..801081a9ef391 100644 --- a/web/pandas/getting_started.md +++ b/web/pandas/getting_started.md @@ -2,33 +2,8 @@ ## Installation instructions -The next steps provides the easiest and recommended way to set up your -environment to use pandas. Other installation options can be found in -the [advanced installation page]({{ base_url}}docs/getting_started/install.html). - -1. Download [Anaconda](https://www.anaconda.com/download/) for your operating system and - the latest Python version, run the installer, and follow the steps. Please note: - - - It is not needed (and discouraged) to install Anaconda as root or administrator. - - When asked if you wish to initialize Anaconda3, answer yes. - - Restart the terminal after completing the installation. - - Detailed instructions on how to install Anaconda can be found in the - [Anaconda documentation](https://docs.anaconda.com/anaconda/install/). - -2. In the Anaconda prompt (or terminal in Linux or macOS), start JupyterLab: - - - -3. In JupyterLab, create a new (Python 3) notebook: - - - -4. In the first cell of the notebook, you can import pandas and check the version with: - - - -5. Now you are ready to use pandas, and you can write your code in the next cells. +To install pandas, please reference the [installation page]({{ base_url}}docs/getting_started/install.html) +from the pandas documentation. ## Tutorials diff --git a/web/pandas/static/img/install/anaconda_prompt.png b/web/pandas/static/img/install/anaconda_prompt.png deleted file mode 100644 index 7b547e4ebb02a..0000000000000 Binary files a/web/pandas/static/img/install/anaconda_prompt.png and /dev/null differ diff --git a/web/pandas/static/img/install/jupyterlab_home.png b/web/pandas/static/img/install/jupyterlab_home.png deleted file mode 100644 index c62d33a5e0fc6..0000000000000 Binary files a/web/pandas/static/img/install/jupyterlab_home.png and /dev/null differ diff --git a/web/pandas/static/img/install/pandas_import_and_version.png b/web/pandas/static/img/install/pandas_import_and_version.png deleted file mode 100644 index 64c1303ac495c..0000000000000 Binary files a/web/pandas/static/img/install/pandas_import_and_version.png and /dev/null differ