Merge branch 'master' into dev23122

pandas-dev · Dec 21, 2021 · 59c0043 · 59c0043
2 parents 6566435 + d228a78
commit 59c0043
Show file tree

Hide file tree

Showing 266 changed files with 6,377 additions and 4,280 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml
@@ -7,6 +7,7 @@ body:
   - type: checkboxes
     id: checks
     attributes:
+      label: Pandas version checks
       options:
         - label: >
             I have checked that this issue has not already been reported.

diff --git a/.github/ISSUE_TEMPLATE/documentation_improvement.yaml b/.github/ISSUE_TEMPLATE/documentation_improvement.yaml
@@ -6,6 +6,7 @@ labels: [Docs, Needs Triage]
 body:
   - type: checkboxes
     attributes:
+      label: Pandas version checks
       options:
         - label: >
             I have checked that the issue still exists on the latest versions of the docs

diff --git a/.github/ISSUE_TEMPLATE/installation_issue.yaml b/.github/ISSUE_TEMPLATE/installation_issue.yaml
@@ -7,6 +7,7 @@ body:
   - type: checkboxes
     id: checks
     attributes:
+      label: Installation check
       options:
         - label: >
             I have read the [installation guide](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-pandas).

diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yaml b/.github/ISSUE_TEMPLATE/performance_issue.yaml
@@ -7,6 +7,7 @@ body:
   - type: checkboxes
     id: checks
     attributes:
+      label: Pandas version checks
       options:
         - label: >
             I have checked that this issue has not already been reported.

diff --git a/.github/ISSUE_TEMPLATE/submit_question.yml b/.github/ISSUE_TEMPLATE/submit_question.yml
@@ -11,6 +11,7 @@ body:
         usage questions, we ask that all usage questions are first asked on StackOverflow.
   - type: checkboxes
     attributes:
+      label: Research
       options:
         - label: >
             I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -78,6 +78,40 @@ jobs:
       run: pytest scripts
       if: always()
 
+  benchmarks:
+    name: Benchmarks
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    concurrency:
+      # https://github.community/t/concurrecy-not-work-for-push/183068/7
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-benchmarks
+      cancel-in-progress: true
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+
+    - name: Cache conda
+      uses: actions/cache@v2
+      with:
+        path: ~/conda_pkgs_dir
+        key: ${{ runner.os }}-conda-${{ hashFiles(env.ENV_FILE) }}  # pass the env context value directly; expression syntax cannot be nested
+
+    - uses: conda-incubator/setup-miniconda@v2
+      with:
+        activate-environment: pandas-dev
+        channel-priority: strict
+        environment-file: ${{ env.ENV_FILE }}
+        use-only-tar-bz2: true
+
+    - name: Build Pandas
+      uses: ./.github/actions/build_pandas
+
     - name: Running benchmarks
       run: |
         cd asv_bench

diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml
@@ -31,12 +31,12 @@ jobs:
           [actions-38-slow.yaml, "slow", "", "", "", "", ""],
           [actions-38-locale.yaml, "not slow and not network", "language-pack-zh-hans xsel", "zh_CN.utf8", "zh_CN.utf8", "", ""],
           [actions-39-slow.yaml, "slow", "", "", "", "", ""],
+          [actions-pypy-38.yaml, "not slow and not clipboard", "", "", "", "", "--max-worker-restart 0"],
           [actions-39-numpydev.yaml, "not slow and not network", "xsel", "", "", "deprecate", "-W error"],
           [actions-39.yaml, "not slow and not clipboard", "", "", "", "", ""]
         ]
       fail-fast: false
     env:
-      COVERAGE: true
       ENV_FILE: ci/deps/${{ matrix.settings[0] }}
       PATTERN: ${{ matrix.settings[1] }}
       EXTRA_APT: ${{ matrix.settings[2] }}
@@ -45,6 +45,9 @@ jobs:
       PANDAS_TESTING_MODE: ${{ matrix.settings[5] }}
       TEST_ARGS: ${{ matrix.settings[6] }}
       PYTEST_TARGET:  pandas
+      IS_PYPY: ${{ contains(matrix.settings[0], 'pypy') }}
+      # TODO: re-enable coverage on pypy, it's slow
+      COVERAGE: ${{ !contains(matrix.settings[0], 'pypy') }}
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
       group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.settings[0] }}
@@ -82,12 +85,29 @@ jobs:
         channel-priority: flexible
         environment-file: ${{ env.ENV_FILE }}
         use-only-tar-bz2: true
+      if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support
+
+    - name: Setup PyPy
+      uses: actions/setup-python@v2.3.1
+      with:
+        python-version: "pypy-3.8"
+      if: ${{ env.IS_PYPY == 'true' }}
+
+    - name: Setup PyPy dependencies
+      shell: bash
+      run: |
+        # TODO: re-enable cov, it's slowing the tests down though
+        # TODO: Unpin Cython, the new Cython 0.29.26 is causing compilation errors
+        pip install Cython==0.29.25 numpy python-dateutil pytz "pytest>=6.0" "pytest-xdist>=1.31.0" "hypothesis>=5.5.3"  # quoted: a bare > is a shell redirect
+      if: ${{ env.IS_PYPY == 'true' }}
 
     - name: Build Pandas
       uses: ./.github/actions/build_pandas
 
     - name: Test
       run: ci/run_tests.sh
+      # TODO: Don't continue on error for PyPy
+      continue-on-error: ${{ env.IS_PYPY == 'true' }}
       if: always()
 
     - name: Build Version

diff --git a/.gitignore b/.gitignore
@@ -50,6 +50,8 @@ dist
 *.egg-info
 .eggs
 .pypirc
+# type checkers
+pandas/py.typed
 
 # tox testing tool
 .tox

diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
@@ -144,7 +144,7 @@ def setup(self, op, shape):
         # should already be the case, but just to be sure
         df._consolidate_inplace()
 
-        # TODO: GH#33198 the setting here shoudlnt need two steps
+        # TODO: GH#33198 the setting here shouldn't need two steps
         arr1 = np.random.randn(n_rows, max(n_cols // 4, 3)).astype("f8")
         arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8")
         arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8")

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
@@ -55,6 +55,26 @@ def time_frame(self, kind):
         self.df.to_csv(self.fname)
 
 
+class ToCSVMultiIndexUnusedLevels(BaseIO):
+
+    fname = "__test__.csv"
+
+    def setup(self):
+        df = DataFrame({"a": np.random.randn(100_000), "b": 1, "c": 1})
+        self.df = df.set_index(["a", "b"])
+        self.df_unused_levels = self.df.iloc[:10_000]
+        self.df_single_index = df.set_index(["a"]).iloc[:10_000]
+
+    def time_full_frame(self):
+        self.df.to_csv(self.fname)
+
+    def time_sliced_frame(self):
+        self.df_unused_levels.to_csv(self.fname)
+
+    def time_single_index_frame(self):
+        self.df_single_index.to_csv(self.fname)
+
+
 class ToCSVDatetime(BaseIO):
 
     fname = "__test__.csv"
@@ -67,6 +87,21 @@ def time_frame_date_formatting(self):
         self.data.to_csv(self.fname, date_format="%Y%m%d")
 
 
+class ToCSVDatetimeIndex(BaseIO):
+
+    fname = "__test__.csv"
+
+    def setup(self):
+        rng = date_range("2000", periods=100_000, freq="S")
+        self.data = DataFrame({"a": 1}, index=rng)
+
+    def time_frame_date_formatting_index(self):
+        self.data.to_csv(self.fname, date_format="%Y-%m-%d %H:%M:%S")
+
+    def time_frame_date_no_format_index(self):
+        self.data.to_csv(self.fname)
+
+
 class ToCSVDatetimeBig(BaseIO):
 
     fname = "__test__.csv"

diff --git a/ci/deps/actions-38-db.yaml b/ci/deps/actions-38-db.yaml
@@ -12,7 +12,7 @@ dependencies:
   - pytest-cov>=2.10.1  # this is only needed in the coverage build, ref: GH 35737
 
   # pandas dependencies
-  - aiobotocore<2.0.0
+  - aiobotocore<2.0.0  # GH#44311 pinned to fix docbuild
   - beautifulsoup4
   - boto3
   - botocore>=1.11

diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-38.yaml
@@ -0,0 +1,20 @@
+name: pandas-dev
+channels:
+  - conda-forge
+dependencies:
+  # TODO: Add the rest of the dependencies in here
+  # once the other plentiful failures/segfaults
+  # with base pandas have been dealt with
+  - python=3.8[build=*_pypy] # TODO: use this once pypy3.8 is available
+
+  # tools
+  - cython>=0.29.24
+  - pytest>=6.0
+  - pytest-cov
+  - pytest-xdist>=1.31
+  - hypothesis>=5.5.3
+
+  # required
+  - numpy
+  - python-dateutil
+  - pytz
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
@@ -5,12 +5,17 @@
 # https://github.com/pytest-dev/pytest/issues/1075
 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
 
+# May help reproduce flaky CI builds if set in subsequent runs
+echo PYTHONHASHSEED=$PYTHONHASHSEED
+
 if [[ "not network" == *"$PATTERN"* ]]; then
     export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4;
 fi
 
-if [ "$COVERAGE" ]; then
+if [[ "$COVERAGE" == "true" ]]; then
     COVERAGE="-s --cov=pandas --cov-report=xml --cov-append"
+else
+    COVERAGE="" # We need to reset this for COVERAGE="false" case
 fi
 
 # If no X server is found, we use xvfb to emulate it

diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
@@ -303,7 +303,7 @@ pandas strongly encourages the use of :pep:`484` style type hints. New developme
 Style guidelines
 ~~~~~~~~~~~~~~~~
 
-Types imports should follow the ``from typing import ...`` convention. So rather than
+Type imports should follow the ``from typing import ...`` convention. Some types do not need to be imported: since :pep:`585`, some builtin constructs, such as ``list`` and ``tuple``, can be used directly for type annotations. So rather than
 
 .. code-block:: python
 
@@ -315,21 +315,31 @@ You should write
 
 .. code-block:: python
 
-   from typing import List, Optional, Union
+   primes: list[int] = []
 
-   primes: List[int] = []
+``Optional`` should be avoided in favor of the shorter ``| None``, so instead of
 
-``Optional`` should be used where applicable, so instead of
+.. code-block:: python
+
+   from typing import Union
+
+   maybe_primes: list[Union[int, None]] = []
+
+or
 
 .. code-block:: python
 
-   maybe_primes: List[Union[int, None]] = []
+   from typing import Optional
+
+   maybe_primes: list[Optional[int]] = []
 
 You should write
 
 .. code-block:: python
 
-   maybe_primes: List[Optional[int]] = []
+   from __future__ import annotations  # noqa: F404
+
+   maybe_primes: list[int | None] = []
 
 In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 <https://github.com/python/mypy/issues/1775#issuecomment-310969854>`_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like
 
@@ -410,6 +420,26 @@ A recent version of ``numpy`` (>=1.21.0) is required for type validation.
 
 .. _contributing.ci:
 
+Testing type hints in code using pandas
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+    * Pandas is not yet a py.typed library (:pep:`561`)!
+      The primary purpose of locally declaring pandas as a py.typed library is to test and
+      improve the pandas-builtin type annotations.
+
+Until pandas becomes a py.typed library, it is possible to easily experiment with the type
+annotations shipped with pandas by creating an empty file named "py.typed" in the pandas
+installation folder:
+
+.. code-block:: none
+
+   python -c "import pandas; import pathlib; (pathlib.Path(pandas.__path__[0]) / 'py.typed').touch()"
+
+The existence of the py.typed file signals to type checkers that pandas is already a py.typed
+library. This makes type checkers aware of the type annotations shipped with pandas.
+
 Testing with continuous integration
 -----------------------------------
 

diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
@@ -180,7 +180,7 @@ As an example of fully-formed metadata:
          'numpy_type': 'int64',
          'metadata': None}
     ],
-    'pandas_version': '0.20.0',
+    'pandas_version': '1.4.0',
     'creator': {
       'library': 'pyarrow',
       'version': '0.13.0'

diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
@@ -122,6 +122,7 @@ application to columns of a specific data type.
    DataFrameGroupBy.skew
    DataFrameGroupBy.take
    DataFrameGroupBy.tshift
+   DataFrameGroupBy.value_counts
 
 The following methods are available only for ``SeriesGroupBy`` objects.
 

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -1903,6 +1903,7 @@ with optional parameters:
      ``index``; dict like {index -> {column -> value}}
      ``columns``; dict like {column -> {index -> value}}
      ``values``; just the values array
+     ``table``; adhering to the JSON `Table Schema`_
 
 * ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601.
 * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10.
@@ -2477,7 +2478,6 @@ A few notes on the generated table schema:
     * For ``MultiIndex``, ``mi.names`` is used. If any level has no name,
       then ``level_<i>`` is used.
 
-
 ``read_json`` also accepts ``orient='table'`` as an argument. This allows for
 the preservation of metadata such as dtypes and index names in a
 round-trippable manner.
@@ -2519,8 +2519,18 @@ indicate missing values and the subsequent read cannot distinguish the intent.
 
    os.remove("test.json")
 
+When using ``orient='table'`` along with user-defined ``ExtensionArray``,
+the generated schema will contain an additional ``extDtype`` key in the respective
+``fields`` element. This extra key is not standard but does enable JSON roundtrips
+for extension types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``).
+
+The ``extDtype`` key carries the name of the extension. If you have properly registered
+the ``ExtensionDtype``, pandas will use said name to perform a lookup into the registry
+and re-convert the serialized data into your custom dtype.
+
 .. _Table Schema: https://specs.frictionlessdata.io/table-schema/
 
+
 HTML
 ----
 

diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
@@ -2424,7 +2424,7 @@ you can use the ``tz_convert`` method.
 
     For ``pytz`` time zones, it is incorrect to pass a time zone object directly into
     the ``datetime.datetime`` constructor
-    (e.g., ``datetime.datetime(2011, 1, 1, tz=pytz.timezone('US/Eastern'))``.
+    (e.g., ``datetime.datetime(2011, 1, 1, tzinfo=pytz.timezone('US/Eastern'))``).
     Instead, the datetime needs to be localized using the ``localize`` method
     on the ``pytz`` time zone object.