diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index 6cb728800dc68..277edd15a9322 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`read_csv` raising a ``ValueError`` when ``names`` was of type ``dict_keys`` (:issue:`36928`) +- Fixed regression in :func:`read_csv` with more than 1M rows and specifying a ``index_col`` argument (:issue:`37094`) - Fixed regression where attempting to mutate a :class:`DateOffset` object would no longer raise an ``AttributeError`` (:issue:`36940`) - Fixed regression where :meth:`DataFrame.agg` would fail with :exc:`TypeError` when passed positional arguments to be passed on to the aggregation function (:issue:`36948`). - Fixed regression in :class:`RollingGroupby` with ``sort=False`` not being respected (:issue:`36889`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9a3144d1ccbaa..a310ec5312cf4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -441,7 +441,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if len(comps) > 1_000_000 and not is_object_dtype(comps): # If the the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan - if np.isnan(values).any(): + if isna(values).any(): f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 4d64f2bf411bd..9c6cad4b41949 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -207,3 +207,18 @@ def test_header_with_index_col(all_parsers): result = parser.read_csv(StringIO(data), index_col="I11", header=0) tm.assert_frame_equal(result, expected) + + +@pytest.mark.slow +def test_index_col_large_csv(all_parsers): + # https://github.com/pandas-dev/pandas/issues/37094 + parser = all_parsers + + N = 1_000_001 + df = DataFrame({"a": range(N), "b": np.random.randn(N)}) + + with tm.ensure_clean() as path: + df.to_csv(path, index=False) + result = parser.read_csv(path, index_col=[0]) + + tm.assert_frame_equal(result, df.set_index("a")) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 62766c692f4df..86ea2b2f02a4d 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -89,3 +89,13 @@ def test_isin_read_only(self): result = s.isin(arr) expected = Series([True, True, True]) tm.assert_series_equal(result, expected) + + +@pytest.mark.slow +def test_isin_large_series_mixed_dtypes_and_nan(): + # https://github.com/pandas-dev/pandas/issues/37094 + # combination of object dtype for the values and > 1_000_000 elements + ser = Series([1, 2, np.nan] * 1_000_000) + result = ser.isin({"foo", "bar"}) + expected = Series([False] * 3 * 1_000_000) + tm.assert_series_equal(result, expected)