From 656b6b49e41efcf8c194770e0b51c1796954c54a Mon Sep 17 00:00:00 2001 From: Hans Pagh Date: Thu, 10 Sep 2020 13:33:06 +0200 Subject: [PATCH 1/7] fix isin with nans and large arrays --- pandas/core/algorithms.py | 7 ++++++- pandas/tests/test_algos.py | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2ce3f2d9a7bfa..372223b38efe7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -440,7 +440,12 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception if len(comps) > 1_000_000 and not is_object_dtype(comps): - f = np.in1d + # If the values include nan we need to check for nan explicitly + # since np.nan is not equal to np.nan + if any(np.isnan(values)): + f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) + else: + f = np.in1d elif is_integer_dtype(comps): try: values = values.astype("int64", copy=False) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a2c2ae22a0b62..6a82fe94e6605 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -801,7 +801,6 @@ def test_i8(self): tm.assert_numpy_array_equal(result, expected) def test_large(self): - s = pd.date_range("20000101", periods=2000000, freq="s").values result = algos.isin(s, s[0:2]) expected = np.zeros(len(s), dtype=bool) @@ -841,6 +840,13 @@ def test_same_nan_is_in(self): result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) + def test_same_nan_is_in_large(self): + s = np.tile(1.0, 1_000_001) + s[0] = np.nan + result = algos.isin(s, [np.nan, 1]) + expected = np.ones(len(s), dtype=bool) + tm.assert_numpy_array_equal(result, expected) + def test_same_object_is_in(self): # GH 22160 # there could be special treatment for nans From 246cab5e7604cee271050b0fa69733c2e0277fd3 Mon Sep 17 00:00:00 2001 From: Hans Pagh Date: Fri, 11 Sep 2020 
09:14:44 +0200 Subject: [PATCH 2/7] use .any() instead of any() + whatsnew entry --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/algorithms.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 8ead78a17e9c2..dafbfa057b3f8 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -30,6 +30,7 @@ Bug fixes - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) - Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) +- Bug in :meth:`isin()` when using NaN and a row length above 1,000,000 (:issue:`22205`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 372223b38efe7..50d1810fee30d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -442,7 +442,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if len(comps) > 1_000_000 and not is_object_dtype(comps): # If the values include nan we need to check for nan explicitly # since np.nan is not equal to np.nan - if any(np.isnan(values)): + if np.isnan(values).any(): f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d From 25d48c0f6ec764668f03ef6b4114b645347cab71 Mon Sep 17 00:00:00 2001 From: Hans Pagh Date: Mon, 14 Sep 2020 09:12:14 +0200 Subject: [PATCH 3/7] test series.isin --- pandas/tests/test_algos.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6a82fe94e6605..acc9bb21ad41c 100644 --- 
a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -840,6 +840,7 @@ def test_same_nan_is_in(self): result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) + # issue:`22205` def test_same_nan_is_in_large(self): s = np.tile(1.0, 1_000_001) s[0] = np.nan @@ -847,6 +848,15 @@ def test_same_nan_is_in_large(self): expected = np.ones(len(s), dtype=bool) tm.assert_numpy_array_equal(result, expected) + # issue:`#25395` + def test_same_nan_is_in_large_series(self): + s = np.tile(1.0, 1_000_001) + series = pd.Series(s) + s[0] = np.nan + result = series.isin([np.nan, 1]) + expected = pd.Series(np.ones(len(s), dtype=bool)) + tm.assert_series_equal(result, expected) + def test_same_object_is_in(self): # GH 22160 # there could be special treatment for nans From 859cbf652ebad04faa8d2cf6b4d7355386762206 Mon Sep 17 00:00:00 2001 From: Hans Pagh Date: Mon, 14 Sep 2020 09:12:29 +0200 Subject: [PATCH 4/7] update whats new --- doc/source/whatsnew/v1.1.3.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index dafbfa057b3f8..5b1e77d917cf0 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -31,6 +31,7 @@ Bug fixes - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) - Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) - Bug in :meth:`isin()` when using NaN and a row length above 1,000,000 (:issue:`22205`) +- Bug in :meth:`Series.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) .. 
--------------------------------------------------------------------------- From 3679c143d7de98ac79ba2268b6e2e83ba1922edc Mon Sep 17 00:00:00 2001 From: Hans Pagh Date: Wed, 16 Sep 2020 13:40:31 +0200 Subject: [PATCH 5/7] docs --- doc/source/whatsnew/v1.1.3.rst | 3 +-- pandas/tests/test_algos.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 5b1e77d917cf0..603cfc38bbbbd 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -30,8 +30,7 @@ Bug fixes - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) - Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) -- Bug in :meth:`isin()` when using NaN and a row length above 1,000,000 (:issue:`22205`) -- Bug in :meth:`Series.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) +- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index acc9bb21ad41c..deedac49b8591 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -840,16 +840,16 @@ def test_same_nan_is_in(self): result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) - # issue:`22205` def test_same_nan_is_in_large(self): + # issue:`22205` s = np.tile(1.0, 1_000_001) s[0] = np.nan result = algos.isin(s, [np.nan, 1]) expected = np.ones(len(s), dtype=bool) tm.assert_numpy_array_equal(result, expected) - # issue:`#25395` def test_same_nan_is_in_large_series(self): + # issue:`#25395` s = np.tile(1.0, 1_000_001) series = pd.Series(s) s[0] = np.nan From 4e4359b086f2cb105f6765047198c8e58bc05d29 Mon Sep 17 00:00:00 2001 From: Hans Date: Thu, 17 Sep 2020 19:10:40 +0200 Subject: [PATCH 6/7] Update pandas/tests/test_algos.py Co-authored-by: Simon Hawkins --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index deedac49b8591..0adbfdd193226 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -841,7 +841,7 @@ def test_same_nan_is_in(self): tm.assert_numpy_array_equal(expected, result) def test_same_nan_is_in_large(self): - # issue:`22205` + # https://github.com/pandas-dev/pandas/issues/22205 s = np.tile(1.0, 1_000_001) s[0] = np.nan result = algos.isin(s, [np.nan, 1]) From 53ab2406479e7eda198f788de669294f277d4ba7 Mon Sep 17 00:00:00 2001 From: Hans Date: Thu, 17 Sep 2020 19:10:48 +0200 Subject: [PATCH 7/7] Update pandas/tests/test_algos.py Co-authored-by: Simon Hawkins --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 0adbfdd193226..6102f43f4db6a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -849,7 
+849,7 @@ def test_same_nan_is_in_large(self): tm.assert_numpy_array_equal(result, expected) def test_same_nan_is_in_large_series(self): - # issue:`#25395` + # https://github.com/pandas-dev/pandas/issues/22205 s = np.tile(1.0, 1_000_001) series = pd.Series(s) s[0] = np.nan