diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 8ead78a17e9c2..603cfc38bbbbd 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -30,6 +30,7 @@ Bug fixes - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) - Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) +- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2ce3f2d9a7bfa..50d1810fee30d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -440,7 +440,12 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception if len(comps) > 1_000_000 and not is_object_dtype(comps): - f = np.in1d + # If the the values include nan we need to check for nan explicitly + # since np.nan it not equal to np.nan + if np.isnan(values).any(): + f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) + else: + f = np.in1d elif is_integer_dtype(comps): try: values = values.astype("int64", copy=False) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a2c2ae22a0b62..6102f43f4db6a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -801,7 +801,6 @@ def test_i8(self): tm.assert_numpy_array_equal(result, expected) def test_large(self): - s = pd.date_range("20000101", periods=2000000, freq="s").values result = algos.isin(s, s[0:2]) expected = np.zeros(len(s), dtype=bool) @@ -841,6 +840,23 @@ def test_same_nan_is_in(self): result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) + def test_same_nan_is_in_large(self): + # https://github.com/pandas-dev/pandas/issues/22205 + s = np.tile(1.0, 1_000_001) + s[0] = np.nan + result = algos.isin(s, [np.nan, 1]) + expected = np.ones(len(s), dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + def test_same_nan_is_in_large_series(self): + # https://github.com/pandas-dev/pandas/issues/22205 + s = np.tile(1.0, 1_000_001) + series = pd.Series(s) + s[0] = np.nan + result = series.isin([np.nan, 1]) + expected = pd.Series(np.ones(len(s), dtype=bool)) + tm.assert_series_equal(result, expected) + def test_same_object_is_in(self): # GH 22160 # there could be special treatment for nans