Merge branch 'main' into triage-doc-changes

chris-caballero committed Oct 4, 2023
2 parents bd096c0 + 6a83910 commit 6119771
Showing 58 changed files with 520 additions and 119 deletions.
18 changes: 17 additions & 1 deletion asv_bench/benchmarks/algorithms.py
@@ -1,6 +1,7 @@
from importlib import import_module

import numpy as np
import pyarrow as pa

import pandas as pd

@@ -72,7 +73,16 @@ class Duplicated:
    params = [
        [True, False],
        ["first", "last", False],
        ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
        [
            "int",
            "uint",
            "float",
            "string",
            "datetime64[ns]",
            "datetime64[ns, tz]",
            "timestamp[ms][pyarrow]",
            "duration[s][pyarrow]",
        ],
    ]
    param_names = ["unique", "keep", "dtype"]

@@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype):
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"timestamp[ms][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
),
"duration[s][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
),
}[dtype]
if not unique:
data = data.repeat(5)
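For context, the two pyarrow-backed dtypes added to this benchmark can be built exactly as the ``setup`` method does; a minimal sketch (assuming pandas 2.x with pyarrow installed, values chosen for illustration):

    import numpy as np
    import pyarrow as pa
    import pandas as pd

    # Millisecond timestamps backed by pyarrow, as in the benchmark's setup()
    idx = pd.Index(np.arange(3), dtype=pd.ArrowDtype(pa.timestamp("ms")))
    idx.repeat(2).duplicated(keep="first")
    # array([False,  True, False,  True, False,  True])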
1 change: 1 addition & 0 deletions doc/source/reference/extensions.rst
@@ -49,6 +49,7 @@ objects.
api.extensions.ExtensionArray.copy
api.extensions.ExtensionArray.view
api.extensions.ExtensionArray.dropna
api.extensions.ExtensionArray.duplicated
api.extensions.ExtensionArray.equals
api.extensions.ExtensionArray.factorize
api.extensions.ExtensionArray.fillna
2 changes: 1 addition & 1 deletion doc/source/user_guide/10min.rst
@@ -451,7 +451,7 @@ Merge
Concat
~~~~~~

pandas provides various facilities for easily combining together :class:`Series`` and
pandas provides various facilities for easily combining together :class:`Series` and
:class:`DataFrame` objects with various kinds of set logic for the indexes
and relational algebra functionality in the case of join / merge-type
operations.
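A quick illustration of the combining facilities this paragraph introduces (a sketch, not part of the diff):

    import pandas as pd

    pieces = [pd.Series([1, 2]), pd.Series([3, 4])]
    # concat stacks the pieces along the index, keeping each piece's labels;
    # pass ignore_index=True to get a fresh 0..n-1 index instead
    pd.concat(pieces)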
2 changes: 1 addition & 1 deletion doc/source/user_guide/enhancingperf.rst
@@ -184,7 +184,7 @@ can be improved by passing an ``np.ndarray``.
   ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b,
   ...:                                            np.ndarray col_N):
   ...:     assert (col_a.dtype == np.float64
   ...:             and col_b.dtype == np.float64 and col_N.dtype == np.int_)
   ...:             and col_b.dtype == np.float64 and col_N.dtype == np.dtype(int))
   ...:     cdef Py_ssize_t i, n = len(col_N)
   ...:     assert (len(col_a) == len(col_b) == n)
   ...:     cdef np.ndarray[double] res = np.empty(n)
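The switch from ``np.int_`` to ``np.dtype(int)`` in the assertion is presumably about NumPy 2.0 compatibility: ``np.dtype(int)`` always resolves Python's ``int`` to the platform's default integer dtype, while the meaning of the ``np.int_`` alias shifted with NumPy 2.0. A quick check (illustrative only):

    import numpy as np

    # Python's int maps to the platform default integer dtype
    np.dtype(int)  # e.g. dtype('int64') on common 64-bit builds
    np.arange(3).dtype == np.dtype(int)  # True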
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v2.1.2.rst
@@ -22,8 +22,15 @@ Fixed regressions

Bug fixes
~~~~~~~~~
- Fixed bug in :meth:`Categorical.equals` when the other array has an Arrow-backed string dtype (:issue:`55364`)
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
- Fixed bug in :meth:`DataFrame.interpolate` raising an incorrect error message (:issue:`55347`)
- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`)
- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
-

.. ---------------------------------------------------------------------------
36 changes: 33 additions & 3 deletions doc/source/whatsnew/v2.2.0.rst
@@ -76,6 +76,7 @@ Other enhancements

- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method, as sketched after this list (:issue:`55255`)
- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
-
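A minimal sketch of the new :meth:`ExtensionArray.duplicated` hook in action, using the pyarrow-backed implementation added later in this commit (assuming pandas 2.2+ with pyarrow; the data is illustrative):

    import pandas as pd
    import pyarrow as pa

    arr = pd.array([1, 1, 2], dtype=pd.ArrowDtype(pa.int64()))
    arr.duplicated(keep="first")
    # array([False,  True, False])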
@@ -132,10 +133,36 @@ and ``sort=False``:
    result
.. _whatsnew_220.notable_bug_fixes.notable_bug_fix2:
.. _whatsnew_220.notable_bug_fixes.multiindex_join_different_levels:

notable_bug_fix2
^^^^^^^^^^^^^^^^
:func:`merge` and :meth:`DataFrame.join` no longer reorder levels when levels differ
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` would reorder
index levels when joining on two indexes with different levels (:issue:`34133`).

.. ipython:: python

    left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"]))
    right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"]))
    result = left.join(right)

*Old Behavior*

.. code-block:: ipython

    In [5]: result
    Out[5]:
           left  right
    B A C
    1 x 1     1      2
    2 x 2     1      2

*New Behavior*

.. ipython:: python

    result

.. ---------------------------------------------------------------------------
.. _whatsnew_220.api_breaking:
@@ -241,6 +268,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
- Performance improvement when localizing time to UTC (:issue:`55241`)

@@ -252,6 +280,7 @@ Bug fixes
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
- Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime dtype (:issue:`55254`)
- Bug in :meth:`pandas.read_excel` with an ODS file without cached formatted cells for float values (:issue:`55219`)

Categorical
@@ -339,6 +368,7 @@ Reshaping
^^^^^^^^^
- Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`)
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
-

Sparse
^^^^^^
15 changes: 15 additions & 0 deletions pandas/compat/numpy/__init__.py
@@ -21,6 +21,21 @@
)


np_long: type
np_ulong: type

if _nlv >= Version("2.0.0.dev0"):
    try:
        np_long = np.long  # type: ignore[attr-defined]
        np_ulong = np.ulong  # type: ignore[attr-defined]
    except AttributeError:
        np_long = np.int_
        np_ulong = np.uint
else:
    np_long = np.int_
    np_ulong = np.uint


__all__ = [
    "np",
    "_np_version",
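A hedged usage sketch of the new shim (import path as in the file above; the behavior comments are inferred from the branches shown):

    import numpy as np
    from pandas.compat.numpy import np_long, np_ulong

    # On NumPy 2.x these resolve to np.long/np.ulong (the C long types);
    # on NumPy 1.x they fall back to np.int_/np.uint, which were C longs there.
    np.empty(3, dtype=np_long)
    np.empty(3, dtype=np_ulong)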
21 changes: 8 additions & 13 deletions pandas/core/algorithms.py
@@ -55,7 +55,6 @@
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
    ArrowDtype,
    BaseMaskedDtype,
    CategoricalDtype,
    ExtensionDtype,
@@ -979,36 +978,32 @@ def value_counts_arraylike(


def duplicated(
    values: ArrayLike, keep: Literal["first", "last", False] = "first"
    values: ArrayLike,
    keep: Literal["first", "last", False] = "first",
    mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : nd.array, ExtensionArray or Series
    values : np.ndarray or ExtensionArray
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.
    mask : ndarray[bool], optional
        array indicating which elements to exclude from checking

    Returns
    -------
    duplicated : ndarray[bool]
    """
    if hasattr(values, "dtype"):
        if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub":
            values = values._to_masked()  # type: ignore[union-attr]

        if isinstance(values.dtype, BaseMaskedDtype):
            values = cast("BaseMaskedArray", values)
            return htable.duplicated(values._data, keep=keep, mask=values._mask)

    values = _ensure_data(values)
    return htable.duplicated(values, keep=keep)
    return htable.duplicated(values, keep=keep, mask=mask)
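For illustration, exercising the refactored helper directly (private API; a sketch only, not a stable interface):

    import numpy as np
    from pandas.core import algorithms as algos

    values = np.array([1, 1, 2, 3, 3])
    algos.duplicated(values, keep="first")  # array([False,  True, False, False,  True])
    algos.duplicated(values, keep=False)    # array([ True,  True, False,  True,  True])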


def mode(
Expand Down Expand Up @@ -1641,7 +1636,7 @@ def safe_sort(
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer = np.empty(len(sorter), dtype=int)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `-1` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
78 changes: 68 additions & 10 deletions pandas/core/arrays/arrow/array.py
@@ -30,8 +30,12 @@
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.cast import can_hold_element
from pandas.core.dtypes.cast import (
    can_hold_element,
    infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
    CategoricalDtype,
    is_array_like,
    is_bool_dtype,
    is_integer,
@@ -42,6 +46,7 @@
from pandas.core.dtypes.missing import isna

from pandas.core import (
    algorithms as algos,
    missing,
    roperator,
)
@@ -627,7 +632,9 @@ def __setstate__(self, state) -> None:

    def _cmp_method(self, other, op):
        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)):
        if isinstance(
            other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
        ) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
            result = pc_func(self._pa_array, self._box_pa(other))
        elif is_scalar(other):
            try:
@@ -1289,6 +1296,30 @@ def to_numpy(
            result[~mask] = data[~mask]._pa_array.to_numpy()
        return result

    @doc(ExtensionArray.duplicated)
    def duplicated(
        self, keep: Literal["first", "last", False] = "first"
    ) -> npt.NDArray[np.bool_]:
        pa_type = self._pa_array.type
        if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
            values = self.to_numpy(na_value=0)
        elif pa.types.is_boolean(pa_type):
            values = self.to_numpy(na_value=False)
        elif pa.types.is_temporal(pa_type):
            if pa_type.bit_width == 32:
                pa_type = pa.int32()
            else:
                pa_type = pa.int64()
            arr = self.astype(ArrowDtype(pa_type))
            values = arr.to_numpy(na_value=0)
        else:
            # factorize the values to avoid the performance penalty of
            # converting to object dtype
            values = self.factorize()[0]

        mask = self.isna() if self._hasna else None
        return algos.duplicated(values, keep=keep, mask=mask)

    def unique(self) -> Self:
        """
        Compute the ArrowExtensionArray of unique values.
@@ -1599,13 +1630,21 @@ def _reduce(
        pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)

        if keepdims:
            result = pa.array([pa_result.as_py()], type=pa_result.type)
            if isinstance(pa_result, pa.Scalar):
                result = pa.array([pa_result.as_py()], type=pa_result.type)
            else:
                result = pa.array(
                    [pa_result],
                    type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]),
                )
            return type(self)(result)

        if pc.is_null(pa_result).as_py():
            return self.dtype.na_value
        else:
        elif isinstance(pa_result, pa.Scalar):
            return pa_result.as_py()
        else:
            return pa_result

    def _explode(self):
        """
@@ -1708,7 +1747,7 @@ def __setitem__(self, key, value) -> None:
            data = pa.chunked_array([data])
        self._pa_array = data

    def _rank(
    def _rank_calc(
        self,
        *,
        axis: AxisInt = 0,
@@ -1717,9 +1756,6 @@ def _rank(
        ascending: bool = True,
        pct: bool = False,
    ):
        """
        See Series.rank.__doc__.
        """
        if pa_version_under9p0 or axis != 0:
            ranked = super()._rank(
                axis=axis,
@@ -1734,7 +1770,7 @@ else:
            else:
                pa_type = pa.uint64()
            result = pa.array(ranked, type=pa_type, from_pandas=True)
            return type(self)(result)
            return result

        data = self._pa_array.combine_chunks()
        sort_keys = "ascending" if ascending else "descending"
@@ -1773,7 +1809,29 @@ def _rank(
            divisor = pc.count(result)
            result = pc.divide(result, divisor)

        return type(self)(result)
        return result

    def _rank(
        self,
        *,
        axis: AxisInt = 0,
        method: str = "average",
        na_option: str = "keep",
        ascending: bool = True,
        pct: bool = False,
    ):
        """
        See Series.rank.__doc__.
        """
        return type(self)(
            self._rank_calc(
                axis=axis,
                method=method,
                na_option=na_option,
                ascending=ascending,
                pct=pct,
            )
        )
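One plausible reading of the ``_rank``/``_rank_calc`` split (an assumption; the diff does not state the motivation): subclasses can reuse the raw pyarrow ranking result and post-process it before boxing. A hypothetical subclass sketch (``LoggingArrowArray`` is invented for illustration; the base class lives in a private module):

    from pandas.core.arrays.arrow import ArrowExtensionArray  # private API

    class LoggingArrowArray(ArrowExtensionArray):  # hypothetical subclass
        def _rank(self, *, axis=0, method="average", na_option="keep",
                  ascending=True, pct=False):
            raw = self._rank_calc(axis=axis, method=method,
                                  na_option=na_option, ascending=ascending,
                                  pct=pct)
            # inspect or transform the pyarrow result here, then box it
            return type(self)(raw)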

    def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self:
        """