Skip to content

Commit

Permalink
Merge pull request #66 from anmyachev/compare-columns
Browse files (browse the repository at this point in the history)
Add functions to compare Column objects with iterable references and to compare DataFrame objects with mapping references
  • Loading branch information
MarcoGorelli authored Jan 24, 2024
2 parents d8b7766 + a4c4aee commit 107969c
Show file tree
Hide file tree
Showing 31 changed files with 335 additions and 386 deletions.
6 changes: 5 additions & 1 deletion dataframe_api_compat/pandas_standard/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,16 @@ def map_pandas_dtype_to_standard_dtype(dtype: Any) -> DType:
return Namespace.Float32()
if dtype == "Float32":
return Namespace.Float32()
if dtype == "bool":
if dtype in ("bool", "boolean"):
# "boolean" also covers the nullable `pandas.core.arrays.boolean.BooleanDtype`
return Namespace.Bool()
if dtype == "object":
return Namespace.String()
if dtype == "string":
return Namespace.String()
if hasattr(dtype, "name"):
# Dtype objects such as `numpy.dtypes.DateTime64DType` are not strings;
# switch to their string name so the `startswith` check below works
dtype = dtype.name
if dtype.startswith("datetime64["):
match = re.search(r"datetime64\[(\w{1,2})", dtype)
assert match is not None
Expand Down
2 changes: 2 additions & 0 deletions dataframe_api_compat/pandas_standard/column_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
"UInt16": "uint16",
"UInt8": "uint8",
"boolean": "bool",
"Float64": "float64",
"Float32": "float32",
}


Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ ignore = [
[tool.ruff.isort]
force-single-line = true

[tool.black]
line-length = 90

[tool.pytest.ini_options]
filterwarnings = [
"error",
Expand Down
28 changes: 13 additions & 15 deletions tests/column/and_or_test.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,44 @@
from __future__ import annotations

import pandas as pd

from tests.utils import bool_dataframe_1
from tests.utils import interchange_to_pandas
from tests.utils import compare_column_with_reference


def test_column_and(library: str) -> None:
df = bool_dataframe_1(library, api_version="2023.09-beta")
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = df.col("b")
result = df.assign((ser & other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series([True, True, False], name="result")
pd.testing.assert_series_equal(result_pd, expected)
expected = [True, True, False]
compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)


def test_column_or(library: str) -> None:
df = bool_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = df.col("b")
result = df.assign((ser | other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series([True, True, True], name="result")
pd.testing.assert_series_equal(result_pd, expected)
expected = [True, True, True]
compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)


def test_column_and_with_scalar(library: str) -> None:
df = bool_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = True
result = df.assign((other & ser).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series([True, True, False], name="result")
pd.testing.assert_series_equal(result_pd, expected)
expected = [True, True, False]
compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)


def test_column_or_with_scalar(library: str) -> None:
df = bool_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = True
result = df.assign((other | ser).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series([True, True, True], name="result")
pd.testing.assert_series_equal(result_pd, expected)
expected = [True, True, True]
compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool)
16 changes: 6 additions & 10 deletions tests/column/cast_test.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
import pandas as pd

from tests.utils import compare_dataframe_with_reference
from tests.utils import integer_dataframe_1
from tests.utils import interchange_to_pandas


def test_cast_integers(library: str) -> None:
df = integer_dataframe_1(library)
pdx = df.__dataframe_namespace__()
result = df.assign(df.col("a").cast(pdx.Int32()))
expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).astype(
{"a": "int32", "b": "int64"},
)
result_pd = interchange_to_pandas(result)
pd.testing.assert_frame_equal(result_pd, expected)
ns = df.__dataframe_namespace__()
result = df.assign(df.col("a").cast(ns.Int32()))
expected = {"a": [1, 2, 3], "b": [4, 5, 6]}
expected_dtype = {"a": ns.Int32, "b": ns.Int64}
compare_dataframe_with_reference(result, expected, expected_dtype)
54 changes: 15 additions & 39 deletions tests/column/col_sorted_indices_test.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,42 @@
from __future__ import annotations

import pandas as pd

from tests.utils import compare_dataframe_with_reference
from tests.utils import integer_dataframe_6
from tests.utils import interchange_to_pandas


def test_expression_sorted_indices_ascending(library: str) -> None:
df = integer_dataframe_6(library)
df.__dataframe_namespace__()
ns = df.__dataframe_namespace__()
col = df.col
sorted_indices = col("b").sorted_indices()
result = df.take(sorted_indices)
result_pd = interchange_to_pandas(result)
expected = pd.DataFrame(
{
"a": [2, 2, 1, 1, 1],
"b": [1, 2, 3, 4, 4],
},
)
pd.testing.assert_frame_equal(result_pd, expected)
expected = {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}
compare_dataframe_with_reference(result, expected, dtype=ns.Int64)


def test_expression_sorted_indices_descending(library: str) -> None:
df = integer_dataframe_6(library)
df.__dataframe_namespace__()
ns = df.__dataframe_namespace__()
col = df.col
sorted_indices = col("b").sorted_indices(ascending=False)
result = df.take(sorted_indices)
result_pd = interchange_to_pandas(result)
expected = pd.DataFrame(
{
"a": [1, 1, 1, 2, 2],
"b": [4, 4, 3, 2, 1],
},
)
pd.testing.assert_frame_equal(result_pd, expected)
expected = {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}
compare_dataframe_with_reference(result, expected, dtype=ns.Int64)


def test_column_sorted_indices_ascending(library: str) -> None:
df = integer_dataframe_6(library).persist()
df = integer_dataframe_6(library)
ns = df.__dataframe_namespace__()
sorted_indices = df.col("b").sorted_indices()
result = df.take(sorted_indices)
result_pd = interchange_to_pandas(result)
expected = pd.DataFrame(
{
"a": [2, 2, 1, 1, 1],
"b": [1, 2, 3, 4, 4],
},
)
pd.testing.assert_frame_equal(result_pd, expected)
expected = {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}
compare_dataframe_with_reference(result, expected, dtype=ns.Int64)


def test_column_sorted_indices_descending(library: str) -> None:
df = integer_dataframe_6(library).persist()
df = integer_dataframe_6(library)
ns = df.__dataframe_namespace__()
sorted_indices = df.col("b").sorted_indices(ascending=False)
result = df.take(sorted_indices)
result_pd = interchange_to_pandas(result)
expected = pd.DataFrame(
{
"a": [1, 1, 1, 2, 2],
"b": [4, 4, 3, 2, 1],
},
)
pd.testing.assert_frame_equal(result_pd, expected)
expected = {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}
compare_dataframe_with_reference(result, expected, dtype=ns.Int64)
92 changes: 47 additions & 45 deletions tests/column/comparisons_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,83 +2,86 @@

from typing import Any

import pandas as pd
import pytest

from tests.utils import compare_column_with_reference
from tests.utils import integer_dataframe_1
from tests.utils import integer_dataframe_7
from tests.utils import interchange_to_pandas


@pytest.mark.parametrize(
("comparison", "expected_data"),
("comparison", "expected_data", "expected_dtype"),
[
("__eq__", [True, True, False]),
("__ne__", [False, False, True]),
("__ge__", [True, True, False]),
("__gt__", [False, False, False]),
("__le__", [True, True, True]),
("__lt__", [False, False, True]),
("__add__", [2, 4, 7]),
("__sub__", [0, 0, -1]),
("__mul__", [1, 4, 12]),
("__truediv__", [1, 1, 0.75]),
("__floordiv__", [1, 1, 0]),
("__pow__", [1, 4, 81]),
("__mod__", [0, 0, 3]),
("__eq__", [True, True, False], "Bool"),
("__ne__", [False, False, True], "Bool"),
("__ge__", [True, True, False], "Bool"),
("__gt__", [False, False, False], "Bool"),
("__le__", [True, True, True], "Bool"),
("__lt__", [False, False, True], "Bool"),
("__add__", [2, 4, 7], "Int64"),
("__sub__", [0, 0, -1], "Int64"),
("__mul__", [1, 4, 12], "Int64"),
("__truediv__", [1, 1, 0.75], "Float64"),
("__floordiv__", [1, 1, 0], "Int64"),
("__pow__", [1, 4, 81], "Int64"),
("__mod__", [0, 0, 3], "Int64"),
],
)
def test_column_comparisons(
library: str,
comparison: str,
expected_data: list[object],
expected_dtype: str,
) -> None:
ser: Any
df = integer_dataframe_7(library).persist()
df = integer_dataframe_7(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = df.col("b")
result = df.assign(getattr(ser, comparison)(other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series(expected_data, name="result")
if library in ("polars", "polars-lazy") and comparison == "__pow__":
expected_ns_dtype = getattr(ns, expected_dtype)
if comparison == "__pow__" and library in ("polars", "polars-lazy"):
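# TODO: polars `__pow__` apparently does not yield Int64 here, so the result
# is cast before comparing; drop the cast once that is resolved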
result_pd = result_pd.astype("int64")
pd.testing.assert_series_equal(result_pd, expected)
result = result.cast({"result": ns.Int64()})
expected_ns_dtype = ns.Int64
compare_column_with_reference(result.col("result"), expected_data, expected_ns_dtype)


@pytest.mark.parametrize(
("comparison", "expected_data"),
("comparison", "expected_data", "expected_dtype"),
[
("__eq__", [False, False, True]),
("__ne__", [True, True, False]),
("__ge__", [False, False, True]),
("__gt__", [False, False, False]),
("__le__", [True, True, True]),
("__lt__", [True, True, False]),
("__add__", [4, 5, 6]),
("__sub__", [-2, -1, 0]),
("__mul__", [3, 6, 9]),
("__truediv__", [1 / 3, 2 / 3, 1]),
("__floordiv__", [0, 0, 1]),
("__pow__", [1, 8, 27]),
("__mod__", [1, 2, 0]),
("__eq__", [False, False, True], "Bool"),
("__ne__", [True, True, False], "Bool"),
("__ge__", [False, False, True], "Bool"),
("__gt__", [False, False, False], "Bool"),
("__le__", [True, True, True], "Bool"),
("__lt__", [True, True, False], "Bool"),
("__add__", [4, 5, 6], "Int64"),
("__sub__", [-2, -1, 0], "Int64"),
("__mul__", [3, 6, 9], "Int64"),
("__truediv__", [1 / 3, 2 / 3, 1], "Float64"),
("__floordiv__", [0, 0, 1], "Int64"),
("__pow__", [1, 8, 27], "Int64"),
("__mod__", [1, 2, 0], "Int64"),
],
)
def test_column_comparisons_scalar(
library: str,
comparison: str,
expected_data: list[object],
expected_dtype: str,
) -> None:
ser: Any
df = integer_dataframe_1(library).persist()
df = integer_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = 3
result = df.assign(getattr(ser, comparison)(other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series(expected_data, name="result")
expected_ns_dtype = getattr(ns, expected_dtype)
if comparison == "__pow__" and library in ("polars", "polars-lazy"):
result_pd = result_pd.astype("int64")
pd.testing.assert_series_equal(result_pd, expected)
result = result.cast({"result": ns.Int64()})
expected_ns_dtype = ns.Int64
compare_column_with_reference(result.col("result"), expected_data, expected_ns_dtype)


@pytest.mark.parametrize(
Expand All @@ -96,10 +99,9 @@ def test_right_column_comparisons(
) -> None:
# 1,2,3
ser: Any
df = integer_dataframe_7(library).persist()
df = integer_dataframe_7(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
other = 2
result = df.assign(getattr(ser, comparison)(other).rename("result"))
result_pd = interchange_to_pandas(result)["result"]
expected = pd.Series(expected_data, name="result")
pd.testing.assert_series_equal(result_pd, expected)
compare_column_with_reference(result.col("result"), expected_data, dtype=ns.Int64)
15 changes: 8 additions & 7 deletions tests/column/cumulative_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

import pandas as pd
import pytest
from packaging.version import Version
from packaging.version import parse

from tests.utils import compare_column_with_reference
from tests.utils import integer_dataframe_1
from tests.utils import interchange_to_pandas


@pytest.mark.parametrize(
Expand All @@ -21,17 +23,16 @@ def test_cumulative_functions_column(
func: str,
expected_data: list[float],
) -> None:
df = integer_dataframe_1(library).persist()
df = integer_dataframe_1(library)
ns = df.__dataframe_namespace__()
ser = df.col("a")
expected = pd.Series(expected_data, name="result")
result = df.assign(getattr(ser, func)().rename("result"))
result_pd = interchange_to_pandas(result)["result"]

if (
tuple(int(v) for v in pd.__version__.split(".")) < (2, 0, 0)
and library == "pandas-nullable"
parse(pd.__version__) < Version("2.0.0") and library == "pandas-nullable"
): # pragma: no cover
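# Upstream bug (pandas < 2.0 with the "pandas-nullable" library):
# force the result to int64 as a workaround before comparing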
result_pd = result_pd.astype("int64")
result = result.cast({"result": ns.Int64()})

pd.testing.assert_series_equal(result_pd, expected)
compare_column_with_reference(result.col("result"), expected, dtype=ns.Int64)
Loading

0 comments on commit 107969c

Please sign in to comment.