From 3a1181a864e0b4b3f6c715aee90ad92b25bef3c3 Mon Sep 17 00:00:00 2001 From: Hendrik Makait Date: Mon, 1 Jul 2024 09:48:00 -0400 Subject: [PATCH] Drop support for pandas 1.X --- distributed/shuffle/tests/test_merge.py | 10 ++-- distributed/shuffle/tests/test_shuffle.py | 69 +++++++++++------------ 2 files changed, 37 insertions(+), 42 deletions(-) diff --git a/distributed/shuffle/tests/test_merge.py b/distributed/shuffle/tests/test_merge.py index 42d1f443a0..8c196af3c0 100644 --- a/distributed/shuffle/tests/test_merge.py +++ b/distributed/shuffle/tests/test_merge.py @@ -17,7 +17,7 @@ pd = pytest.importorskip("pandas") import dask import dask.dataframe as dd -from dask.dataframe._compat import PANDAS_GE_200, tm +from dask.dataframe._compat import tm from dask.dataframe.utils import assert_eq from distributed import get_client @@ -293,7 +293,7 @@ async def test_merge_by_multiple_columns(c, s, a, b, how): # FIXME: There's an discrepancy with an empty index for # pandas=2.0 (xref https://github.com/dask/dask/issues/9957). # Temporarily avoid index check until the discrepancy is fixed. - check_index=not (PANDAS_GE_200 and expected.index.empty), + check_index=not expected.index.empty, ) expected = pdr.join(pdl, how=how) @@ -303,7 +303,7 @@ async def test_merge_by_multiple_columns(c, s, a, b, how): # FIXME: There's an discrepancy with an empty index for # pandas=2.0 (xref https://github.com/dask/dask/issues/9957). # Temporarily avoid index check until the discrepancy is fixed. - check_index=not (PANDAS_GE_200 and expected.index.empty), + check_index=not expected.index.empty, ) expected = pd.merge( @@ -323,7 +323,7 @@ async def test_merge_by_multiple_columns(c, s, a, b, how): # FIXME: There's an discrepancy with an empty index for # pandas=2.0 (xref https://github.com/dask/dask/issues/9957). # Temporarily avoid index check until the discrepancy is fixed. - check_index=not (PANDAS_GE_200 and expected.index.empty), + check_index=not expected.index.empty, ) expected = pd.merge( @@ -343,7 +343,7 @@ async def test_merge_by_multiple_columns(c, s, a, b, how): # FIXME: There's an discrepancy with an empty index for # pandas=2.0 (xref https://github.com/dask/dask/issues/9957). # Temporarily avoid index check until the discrepancy is fixed. - check_index=not (PANDAS_GE_200 and expected.index.empty), + check_index=not expected.index.empty, ) # hash join diff --git a/distributed/shuffle/tests/test_shuffle.py b/distributed/shuffle/tests/test_shuffle.py index 3d94ef81be..24e199d1b6 100644 --- a/distributed/shuffle/tests/test_shuffle.py +++ b/distributed/shuffle/tests/test_shuffle.py @@ -28,7 +28,6 @@ pd = pytest.importorskip("pandas") import dask.dataframe as dd -from dask.dataframe._compat import PANDAS_GE_150, PANDAS_GE_200 from dask.typing import Key from distributed import ( @@ -1145,41 +1144,38 @@ def __init__(self, value: int) -> None: } ) - if PANDAS_GE_150: - columns.update( - { - # PyArrow dtypes - f"col{next(counter)}": pd.array( - [True, False] * 50, dtype="bool[pyarrow]" - ), - f"col{next(counter)}": pd.array(range(100), dtype="int8[pyarrow]"), - f"col{next(counter)}": pd.array(range(100), dtype="int16[pyarrow]"), - f"col{next(counter)}": pd.array(range(100), dtype="int32[pyarrow]"), - f"col{next(counter)}": pd.array(range(100), dtype="int64[pyarrow]"), - f"col{next(counter)}": pd.array(range(100), dtype="uint8[pyarrow]"), - f"col{next(counter)}": pd.array(range(100), dtype="uint16[pyarrow]"), - f"col{next(counter)}": pd.array(range(100), dtype="uint32[pyarrow]"), - f"col{next(counter)}": pd.array(range(100), dtype="uint64[pyarrow]"), - f"col{next(counter)}": pd.array(range(100), dtype="float32[pyarrow]"), - f"col{next(counter)}": pd.array(range(100), dtype="float64[pyarrow]"), - f"col{next(counter)}": pd.array( - [pd.Timestamp.fromtimestamp(1641034800 + i) for i in range(100)], - dtype=pd.ArrowDtype(pa.timestamp("ms")), - ), - f"col{next(counter)}": pd.array( - ["lorem ipsum"] * 100, - dtype="string[pyarrow]", - ), - f"col{next(counter)}": pd.array( - ["lorem ipsum"] * 100, - dtype=pd.StringDtype("pyarrow"), - ), - f"col{next(counter)}": pd.array( - ["lorem ipsum"] * 100, - dtype="string[python]", - ), - } - ) + columns.update( + { + # PyArrow dtypes + f"col{next(counter)}": pd.array([True, False] * 50, dtype="bool[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="int8[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="int16[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="int32[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="int64[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="uint8[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="uint16[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="uint32[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="uint64[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="float32[pyarrow]"), + f"col{next(counter)}": pd.array(range(100), dtype="float64[pyarrow]"), + f"col{next(counter)}": pd.array( + [pd.Timestamp.fromtimestamp(1641034800 + i) for i in range(100)], + dtype=pd.ArrowDtype(pa.timestamp("ms")), + ), + f"col{next(counter)}": pd.array( + ["lorem ipsum"] * 100, + dtype="string[pyarrow]", + ), + f"col{next(counter)}": pd.array( + ["lorem ipsum"] * 100, + dtype=pd.StringDtype("pyarrow"), + ), + f"col{next(counter)}": pd.array( + ["lorem ipsum"] * 100, + dtype="string[python]", + ), + } + ) df = pd.DataFrame(columns) df["_partitions"] = df.col4 % npartitions @@ -2399,7 +2395,6 @@ async def test_replace_stale_shuffle(c, s, a, b): await check_scheduler_cleanup(s) -@pytest.mark.skipif(not PANDAS_GE_200, reason="requires pandas >=2.0") @gen_cluster(client=True) async def test_handle_null_partitions(c, s, a, b): data = [