Merge pull request #66 from anmyachev/compare-columns

Add functions to compare Column objects with iterable references and to compare DataFrame objects with mapping references
data-apis · Jan 24, 2024 · commit 107969c
2 parents: d8b7766 + a4c4aee
Show file tree

Hide file tree

Showing 31 changed files with 335 additions and 386 deletions.
diff --git a/dataframe_api_compat/pandas_standard/__init__.py b/dataframe_api_compat/pandas_standard/__init__.py
@@ -104,12 +104,16 @@ def map_pandas_dtype_to_standard_dtype(dtype: Any) -> DType:
         return Namespace.Float32()
     if dtype == "Float32":
         return Namespace.Float32()
-    if dtype == "bool":
+    if dtype in ("bool", "boolean"):
+        # Also for `pandas.core.arrays.boolean.BooleanDtype`
         return Namespace.Bool()
     if dtype == "object":
         return Namespace.String()
     if dtype == "string":
         return Namespace.String()
+    if hasattr(dtype, "name"):
+        # For types like `numpy.dtypes.DateTime64DType`
+        dtype = dtype.name
     if dtype.startswith("datetime64["):
         match = re.search(r"datetime64\[(\w{1,2})", dtype)
         assert match is not None

diff --git a/dataframe_api_compat/pandas_standard/column_object.py b/dataframe_api_compat/pandas_standard/column_object.py
@@ -35,6 +35,8 @@
     "UInt16": "uint16",
     "UInt8": "uint8",
     "boolean": "bool",
+    "Float64": "float64",
+    "Float32": "float32",
 }
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -75,6 +75,9 @@ ignore = [
 [tool.ruff.isort]
 force-single-line = true
 
+[tool.black]
+line-length = 90
+
 [tool.pytest.ini_options]
 filterwarnings = [
   "error",

diff --git a/tests/column/and_or_test.py b/tests/column/and_or_test.py
@@ -1,46 +1,44 @@
 from __future__ import annotations
 
-import pandas as pd
-
 from tests.utils import bool_dataframe_1
-from tests.utils import interchange_to_pandas
+from tests.utils import compare_column_with_reference
 
 
 def test_column_and(library: str) -> None:
     df = bool_dataframe_1(library, api_version="2023.09-beta")
+    ns = df.__dataframe_namespace__()
     ser = df.col("a")
     other = df.col("b")
     result = df.assign((ser & other).rename("result"))
-    result_pd = interchange_to_pandas(result)["result"]
-    expected = pd.Series([True, True, False], name="result")
-    pd.testing.assert_series_equal(result_pd, expected)
+    expected = [True, True, False]
+    compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)
 
 
 def test_column_or(library: str) -> None:
     df = bool_dataframe_1(library)
+    ns = df.__dataframe_namespace__()
     ser = df.col("a")
     other = df.col("b")
     result = df.assign((ser | other).rename("result"))
-    result_pd = interchange_to_pandas(result)["result"]
-    expected = pd.Series([True, True, True], name="result")
-    pd.testing.assert_series_equal(result_pd, expected)
+    expected = [True, True, True]
+    compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)
 
 
 def test_column_and_with_scalar(library: str) -> None:
     df = bool_dataframe_1(library)
+    ns = df.__dataframe_namespace__()
     ser = df.col("a")
     other = True
     result = df.assign((other & ser).rename("result"))
-    result_pd = interchange_to_pandas(result)["result"]
-    expected = pd.Series([True, True, False], name="result")
-    pd.testing.assert_series_equal(result_pd, expected)
+    expected = [True, True, False]
+    compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)
 
 
 def test_column_or_with_scalar(library: str) -> None:
     df = bool_dataframe_1(library)
+    ns = df.__dataframe_namespace__()
     ser = df.col("a")
     other = True
     result = df.assign((other | ser).rename("result"))
-    result_pd = interchange_to_pandas(result)["result"]
-    expected = pd.Series([True, True, True], name="result")
-    pd.testing.assert_series_equal(result_pd, expected)
+    expected = [True, True, True]
+    compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)
diff --git a/tests/column/cast_test.py b/tests/column/cast_test.py
@@ -1,15 +1,11 @@
-import pandas as pd
-
+from tests.utils import compare_dataframe_with_reference
 from tests.utils import integer_dataframe_1
-from tests.utils import interchange_to_pandas
 
 
 def test_cast_integers(library: str) -> None:
     df = integer_dataframe_1(library)
-    pdx = df.__dataframe_namespace__()
-    result = df.assign(df.col("a").cast(pdx.Int32()))
-    expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).astype(
-        {"a": "int32", "b": "int64"},
-    )
-    result_pd = interchange_to_pandas(result)
-    pd.testing.assert_frame_equal(result_pd, expected)
+    ns = df.__dataframe_namespace__()
+    result = df.assign(df.col("a").cast(ns.Int32()))
+    expected = {"a": [1, 2, 3], "b": [4, 5, 6]}
+    expected_dtype = {"a": ns.Int32, "b": ns.Int64}
+    compare_dataframe_with_reference(result, expected, expected_dtype)
diff --git a/tests/column/col_sorted_indices_test.py b/tests/column/col_sorted_indices_test.py
@@ -1,66 +1,42 @@
 from __future__ import annotations
 
-import pandas as pd
-
+from tests.utils import compare_dataframe_with_reference
 from tests.utils import integer_dataframe_6
-from tests.utils import interchange_to_pandas
 
 
 def test_expression_sorted_indices_ascending(library: str) -> None:
     df = integer_dataframe_6(library)
-    df.__dataframe_namespace__()
+    ns = df.__dataframe_namespace__()
     col = df.col
     sorted_indices = col("b").sorted_indices()
     result = df.take(sorted_indices)
-    result_pd = interchange_to_pandas(result)
-    expected = pd.DataFrame(
-        {
-            "a": [2, 2, 1, 1, 1],
-            "b": [1, 2, 3, 4, 4],
-        },
-    )
-    pd.testing.assert_frame_equal(result_pd, expected)
+    expected = {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}
+    compare_dataframe_with_reference(result, expected, dtype=ns.Int64)
 
 
 def test_expression_sorted_indices_descending(library: str) -> None:
     df = integer_dataframe_6(library)
-    df.__dataframe_namespace__()
+    ns = df.__dataframe_namespace__()
     col = df.col
     sorted_indices = col("b").sorted_indices(ascending=False)
     result = df.take(sorted_indices)
-    result_pd = interchange_to_pandas(result)
-    expected = pd.DataFrame(
-        {
-            "a": [1, 1, 1, 2, 2],
-            "b": [4, 4, 3, 2, 1],
-        },
-    )
-    pd.testing.assert_frame_equal(result_pd, expected)
+    expected = {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}
+    compare_dataframe_with_reference(result, expected, dtype=ns.Int64)
 
 
 def test_column_sorted_indices_ascending(library: str) -> None:
-    df = integer_dataframe_6(library).persist()
+    df = integer_dataframe_6(library)
+    ns = df.__dataframe_namespace__()
     sorted_indices = df.col("b").sorted_indices()
     result = df.take(sorted_indices)
-    result_pd = interchange_to_pandas(result)
-    expected = pd.DataFrame(
-        {
-            "a": [2, 2, 1, 1, 1],
-            "b": [1, 2, 3, 4, 4],
-        },
-    )
-    pd.testing.assert_frame_equal(result_pd, expected)
+    expected = {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}
+    compare_dataframe_with_reference(result, expected, dtype=ns.Int64)
 
 
 def test_column_sorted_indices_descending(library: str) -> None:
-    df = integer_dataframe_6(library).persist()
+    df = integer_dataframe_6(library)
+    ns = df.__dataframe_namespace__()
     sorted_indices = df.col("b").sorted_indices(ascending=False)
     result = df.take(sorted_indices)
-    result_pd = interchange_to_pandas(result)
-    expected = pd.DataFrame(
-        {
-            "a": [1, 1, 1, 2, 2],
-            "b": [4, 4, 3, 2, 1],
-        },
-    )
-    pd.testing.assert_frame_equal(result_pd, expected)
+    expected = {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}
+    compare_dataframe_with_reference(result, expected, dtype=ns.Int64)
diff --git a/tests/column/comparisons_test.py b/tests/column/comparisons_test.py
@@ -2,83 +2,86 @@
 
 from typing import Any
 
-import pandas as pd
 import pytest
 
+from tests.utils import compare_column_with_reference
 from tests.utils import integer_dataframe_1
 from tests.utils import integer_dataframe_7
-from tests.utils import interchange_to_pandas
 
 
 @pytest.mark.parametrize(
-    ("comparison", "expected_data"),
+    ("comparison", "expected_data", "expected_dtype"),
     [
-        ("__eq__", [True, True, False]),
-        ("__ne__", [False, False, True]),
-        ("__ge__", [True, True, False]),
-        ("__gt__", [False, False, False]),
-        ("__le__", [True, True, True]),
-        ("__lt__", [False, False, True]),
-        ("__add__", [2, 4, 7]),
-        ("__sub__", [0, 0, -1]),
-        ("__mul__", [1, 4, 12]),
-        ("__truediv__", [1, 1, 0.75]),
-        ("__floordiv__", [1, 1, 0]),
-        ("__pow__", [1, 4, 81]),
-        ("__mod__", [0, 0, 3]),
+        ("__eq__", [True, True, False], "Bool"),
+        ("__ne__", [False, False, True], "Bool"),
+        ("__ge__", [True, True, False], "Bool"),
+        ("__gt__", [False, False, False], "Bool"),
+        ("__le__", [True, True, True], "Bool"),
+        ("__lt__", [False, False, True], "Bool"),
+        ("__add__", [2, 4, 7], "Int64"),
+        ("__sub__", [0, 0, -1], "Int64"),
+        ("__mul__", [1, 4, 12], "Int64"),
+        ("__truediv__", [1, 1, 0.75], "Float64"),
+        ("__floordiv__", [1, 1, 0], "Int64"),
+        ("__pow__", [1, 4, 81], "Int64"),
+        ("__mod__", [0, 0, 3], "Int64"),
     ],
 )
 def test_column_comparisons(
     library: str,
     comparison: str,
     expected_data: list[object],
+    expected_dtype: str,
 ) -> None:
     ser: Any
-    df = integer_dataframe_7(library).persist()
+    df = integer_dataframe_7(library)
+    ns = df.__dataframe_namespace__()
     ser = df.col("a")
     other = df.col("b")
     result = df.assign(getattr(ser, comparison)(other).rename("result"))
-    result_pd = interchange_to_pandas(result)["result"]
-    expected = pd.Series(expected_data, name="result")
-    if library in ("polars", "polars-lazy") and comparison == "__pow__":
+    expected_ns_dtype = getattr(ns, expected_dtype)
+    if comparison == "__pow__" and library in ("polars", "polars-lazy"):
         # TODO
-        result_pd = result_pd.astype("int64")
-    pd.testing.assert_series_equal(result_pd, expected)
+        result = result.cast({"result": ns.Int64()})
+        expected_ns_dtype = ns.Int64
+    compare_column_with_reference(result.col("result"), expected_data, expected_ns_dtype)
 
 
 @pytest.mark.parametrize(
-    ("comparison", "expected_data"),
+    ("comparison", "expected_data", "expected_dtype"),
     [
-        ("__eq__", [False, False, True]),
-        ("__ne__", [True, True, False]),
-        ("__ge__", [False, False, True]),
-        ("__gt__", [False, False, False]),
-        ("__le__", [True, True, True]),
-        ("__lt__", [True, True, False]),
-        ("__add__", [4, 5, 6]),
-        ("__sub__", [-2, -1, 0]),
-        ("__mul__", [3, 6, 9]),
-        ("__truediv__", [1 / 3, 2 / 3, 1]),
-        ("__floordiv__", [0, 0, 1]),
-        ("__pow__", [1, 8, 27]),
-        ("__mod__", [1, 2, 0]),
+        ("__eq__", [False, False, True], "Bool"),
+        ("__ne__", [True, True, False], "Bool"),
+        ("__ge__", [False, False, True], "Bool"),
+        ("__gt__", [False, False, False], "Bool"),
+        ("__le__", [True, True, True], "Bool"),
+        ("__lt__", [True, True, False], "Bool"),
+        ("__add__", [4, 5, 6], "Int64"),
+        ("__sub__", [-2, -1, 0], "Int64"),
+        ("__mul__", [3, 6, 9], "Int64"),
+        ("__truediv__", [1 / 3, 2 / 3, 1], "Float64"),
+        ("__floordiv__", [0, 0, 1], "Int64"),
+        ("__pow__", [1, 8, 27], "Int64"),
+        ("__mod__", [1, 2, 0], "Int64"),
     ],
 )
 def test_column_comparisons_scalar(
     library: str,
     comparison: str,
     expected_data: list[object],
+    expected_dtype: str,
 ) -> None:
     ser: Any
-    df = integer_dataframe_1(library).persist()
+    df = integer_dataframe_1(library)
+    ns = df.__dataframe_namespace__()
     ser = df.col("a")
     other = 3
     result = df.assign(getattr(ser, comparison)(other).rename("result"))
-    result_pd = interchange_to_pandas(result)["result"]
-    expected = pd.Series(expected_data, name="result")
+    expected_ns_dtype = getattr(ns, expected_dtype)
     if comparison == "__pow__" and library in ("polars", "polars-lazy"):
-        result_pd = result_pd.astype("int64")
-    pd.testing.assert_series_equal(result_pd, expected)
+        result = result.cast({"result": ns.Int64()})
+        expected_ns_dtype = ns.Int64
+    compare_column_with_reference(result.col("result"), expected_data, expected_ns_dtype)
 
 
 @pytest.mark.parametrize(
@@ -96,10 +99,9 @@ def test_right_column_comparisons(
 ) -> None:
     # 1,2,3
     ser: Any
-    df = integer_dataframe_7(library).persist()
+    df = integer_dataframe_7(library)
+    ns = df.__dataframe_namespace__()
     ser = df.col("a")
     other = 2
     result = df.assign(getattr(ser, comparison)(other).rename("result"))
-    result_pd = interchange_to_pandas(result)["result"]
-    expected = pd.Series(expected_data, name="result")
-    pd.testing.assert_series_equal(result_pd, expected)
+    compare_column_with_reference(result.col("result"), expected_data, dtype=ns.Int64)
diff --git a/tests/column/cumulative_test.py b/tests/column/cumulative_test.py
@@ -2,9 +2,11 @@
 
 import pandas as pd
 import pytest
+from packaging.version import Version
+from packaging.version import parse
 
+from tests.utils import compare_column_with_reference
 from tests.utils import integer_dataframe_1
-from tests.utils import interchange_to_pandas
 
 
 @pytest.mark.parametrize(
@@ -21,17 +23,16 @@ def test_cumulative_functions_column(
     func: str,
     expected_data: list[float],
 ) -> None:
-    df = integer_dataframe_1(library).persist()
+    df = integer_dataframe_1(library)
+    ns = df.__dataframe_namespace__()
     ser = df.col("a")
     expected = pd.Series(expected_data, name="result")
     result = df.assign(getattr(ser, func)().rename("result"))
-    result_pd = interchange_to_pandas(result)["result"]
 
     if (
-        tuple(int(v) for v in pd.__version__.split(".")) < (2, 0, 0)
-        and library == "pandas-nullable"
+        parse(pd.__version__) < Version("2.0.0") and library == "pandas-nullable"
     ):  # pragma: no cover
         # Upstream bug
-        result_pd = result_pd.astype("int64")
+        result = result.cast({"result": ns.Int64()})
 
-    pd.testing.assert_series_equal(result_pd, expected)
+    compare_column_with_reference(result.col("result"), expected, dtype=ns.Int64)