diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 9690efb57e87b..d2d844c05dc71 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -76,6 +76,7 @@
 from pandas.core import groupby
 import pandas.core.algorithms as algos
 from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
 import pandas.core.common as com
 from pandas.core.construction import extract_array
 from pandas.core.frame import _merge_doc
@@ -1906,14 +1907,27 @@ def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]
 
         def flip(xs) -> np.ndarray:
             """unlike np.transpose, this returns an array of tuples"""
-            # error: Item "ndarray" of "Union[Any, Union[ExtensionArray, ndarray]]" has
-            # no attribute "_values_for_argsort"
-            xs = [
-                x
-                if not is_extension_array_dtype(x)
-                else extract_array(x)._values_for_argsort()  # type: ignore[union-attr]
-                for x in xs
-            ]
+
+            def injection(obj):
+                if not is_extension_array_dtype(obj):
+                    # ndarray
+                    return obj
+                obj = extract_array(obj)
+                if isinstance(obj, NDArrayBackedExtensionArray):
+                    # fastpath for e.g. dt64tz, categorical
+                    return obj._ndarray
+                # FIXME: returning obj._values_for_argsort() here doesn't
+                # break any existing test case, but I (@jbrockmendel)
+                # am pretty sure it should!
+                # e.g.
+                #     arr = pd.array([0, pd.NA, 255], dtype="UInt8")
+                # will have values_for_argsort (before GH#45434)
+                #     np.array([0, 255, 255], dtype=np.uint8)
+                # and that non-injectivity should make a difference somehow,
+                # shouldn't it?
+                return np.asarray(obj)
+
+            xs = [injection(x) for x in xs]
             labels = list(string.ascii_lowercase[: len(xs)])
             dtypes = [x.dtype for x in xs]
             labeled_dtypes = list(zip(labels, dtypes))
@@ -1966,6 +1980,8 @@ def flip(xs) -> np.ndarray:
                     left_by_values = left_by_values[0]
                     right_by_values = right_by_values[0]
                 else:
+                    # We get here with non-ndarrays in test_merge_by_col_tz_aware
+                    # and test_merge_groupby_multiple_column_with_categorical_column
                     left_by_values = flip(left_by_values)
                     right_by_values = flip(right_by_values)
 
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index 2b02b7ad39d6f..ea2f16eae6411 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -1224,6 +1224,50 @@ def test_merge_on_nans(self, func, side):
             else:
                 merge_asof(df, df_null, on="a")
 
+    def test_by_nullable(self, any_numeric_ea_dtype):
+        # Note: this test passes if, instead of using pd.array, we use
+        # np.array([np.nan, 1]). Other than that, I (@jbrockmendel)
+        # have NO IDEA what the expected behavior is.
+        # TODO(GH#32306): may be relevant to the expected behavior here.
+
+        arr = pd.array([pd.NA, 0, 1], dtype=any_numeric_ea_dtype)
+        if arr.dtype.kind in ["i", "u"]:
+            max_val = np.iinfo(arr.dtype.numpy_dtype).max
+        else:
+            max_val = np.finfo(arr.dtype.numpy_dtype).max
+        # set value such that (at least for integer dtypes)
+        # arr._values_for_argsort is not an injection
+        arr[2] = max_val
+
+        left = pd.DataFrame(
+            {
+                "by_col1": arr,
+                "by_col2": ["HELLO", "To", "You"],
+                "on_col": [2, 4, 6],
+                "value": ["a", "c", "e"],
+            }
+        )
+        right = pd.DataFrame(
+            {
+                "by_col1": arr,
+                "by_col2": ["WORLD", "Wide", "Web"],
+                "on_col": [1, 2, 6],
+                "value": ["b", "d", "f"],
+            }
+        )
+
+        result = merge_asof(left, right, by=["by_col1", "by_col2"], on="on_col")
+        expected = pd.DataFrame(
+            {
+                "by_col1": arr,
+                "by_col2": ["HELLO", "To", "You"],
+                "on_col": [2, 4, 6],
+                "value_x": ["a", "c", "e"],
+            }
+        )
+        expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object)
+        tm.assert_frame_equal(result, expected)
+
     def test_merge_by_col_tz_aware(self):
         # GH 21184
         left = pd.DataFrame(
@@ -1309,6 +1353,7 @@ def test_timedelta_tolerance_nearest(self):
 
         tm.assert_frame_equal(result, expected)
 
+    # TODO: any_int_dtype; causes failures in _get_join_indexers
     def test_int_type_tolerance(self, any_int_numpy_dtype):
         # GH #28870
 