Skip to content

Commit

Permalink
REF: merge_asof dont use values_for_argsort (pandas-dev#45475)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and yehoshuadimarsky committed Jul 13, 2022
1 parent 21ac7a9 commit a89b4a3
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 8 deletions.
32 changes: 24 additions & 8 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
from pandas.core import groupby
import pandas.core.algorithms as algos
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.frame import _merge_doc
Expand Down Expand Up @@ -1906,14 +1907,27 @@ def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]

def flip(xs) -> np.ndarray:
"""unlike np.transpose, this returns an array of tuples"""
# error: Item "ndarray" of "Union[Any, Union[ExtensionArray, ndarray]]" has
# no attribute "_values_for_argsort"
xs = [
x
if not is_extension_array_dtype(x)
else extract_array(x)._values_for_argsort() # type: ignore[union-attr]
for x in xs
]

def injection(obj):
    """
    Reduce one key array to something ndarray-based.

    Objects that do not carry an extension dtype are handed back untouched.
    Extension-typed objects are first unwrapped with ``extract_array``; if
    the result is an ``NDArrayBackedExtensionArray`` its backing ``_ndarray``
    is returned directly, otherwise it is converted with ``np.asarray``.
    """
    if is_extension_array_dtype(obj):
        unwrapped = extract_array(obj)
        if isinstance(unwrapped, NDArrayBackedExtensionArray):
            # fastpath for e.g. dt64tz, categorical
            return unwrapped._ndarray
        # FIXME: returning unwrapped._values_for_argsort() here doesn't
        #  break in any existing test cases, but i (@jbrockmendel)
        #  am pretty sure it should!
        #  e.g.
        #  arr = pd.array([0, pd.NA, 255], dtype="UInt8")
        #  will have values_for_argsort (before GH#45434)
        #  np.array([0, 255, 255], dtype=np.uint8)
        #  and the non-injectivity should make a difference somehow
        #  shouldn't it?
        return np.asarray(unwrapped)
    # no extension dtype: already an ndarray, nothing to unwrap
    return obj

xs = [injection(x) for x in xs]
labels = list(string.ascii_lowercase[: len(xs)])
dtypes = [x.dtype for x in xs]
labeled_dtypes = list(zip(labels, dtypes))
Expand Down Expand Up @@ -1966,6 +1980,8 @@ def flip(xs) -> np.ndarray:
left_by_values = left_by_values[0]
right_by_values = right_by_values[0]
else:
# We get here with non-ndarrays in test_merge_by_col_tz_aware
# and test_merge_groupby_multiple_column_with_categorical_column
left_by_values = flip(left_by_values)
right_by_values = flip(right_by_values)

Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/reshape/merge/test_merge_asof.py
Original file line number Diff line number Diff line change
Expand Up @@ -1224,6 +1224,50 @@ def test_merge_on_nans(self, func, side):
else:
merge_asof(df, df_null, on="a")

def test_by_nullable(self, any_numeric_ea_dtype):
# Note: this test passes if instead of using pd.array we use
# np.array([np.nan, 1]). Other than that, I (@jbrockmendel)
# have NO IDEA what the expected behavior is.
# TODO(GH#32306): may be relevant to the expected behavior here.

arr = pd.array([pd.NA, 0, 1], dtype=any_numeric_ea_dtype)
if arr.dtype.kind in ["i", "u"]:
max_val = np.iinfo(arr.dtype.numpy_dtype).max
else:
max_val = np.finfo(arr.dtype.numpy_dtype).max
# set value s.t. (at least for integer dtypes) arr._values_for_argsort
# is not an injection
arr[2] = max_val

left = pd.DataFrame(
{
"by_col1": arr,
"by_col2": ["HELLO", "To", "You"],
"on_col": [2, 4, 6],
"value": ["a", "c", "e"],
}
)
right = pd.DataFrame(
{
"by_col1": arr,
"by_col2": ["WORLD", "Wide", "Web"],
"on_col": [1, 2, 6],
"value": ["b", "d", "f"],
}
)

result = merge_asof(left, right, by=["by_col1", "by_col2"], on="on_col")
expected = pd.DataFrame(
{
"by_col1": arr,
"by_col2": ["HELLO", "To", "You"],
"on_col": [2, 4, 6],
"value_x": ["a", "c", "e"],
}
)
expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object)
tm.assert_frame_equal(result, expected)

def test_merge_by_col_tz_aware(self):
# GH 21184
left = pd.DataFrame(
Expand Down Expand Up @@ -1309,6 +1353,7 @@ def test_timedelta_tolerance_nearest(self):

tm.assert_frame_equal(result, expected)

# TODO: any_int_dtype; causes failures in _get_join_indexers
def test_int_type_tolerance(self, any_int_numpy_dtype):
# GH #28870

Expand Down

0 comments on commit a89b4a3

Please sign in to comment.