From f115ad155067727882b683ca6fa7c231621dc965 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver <thomas@kluyver.me.uk>
Date: Wed, 30 Oct 2019 14:28:51 +0000
Subject: [PATCH] Hypothesis tests for roundtrip to & from pandas (#3285)

* Move hypothesis deadline configuration to conftest.py

* Add simple roundtrip test for xarray-pandas-xarray

* Test roundtrip pd.Series->DataArray->Series

* Test roundtrip DataFrame->DataArray->DataFrame

* Test roundtrip Dataset->Dataframe->Dataset

* Relax to allow 0 entries in each dataset var

* Relax to allow empty string names

* Add print_blob to config

* Extra half-roundtrip from pandas series to xarray

* Extra half roundtrip from pandas dataframe to Xarray

* Redesign strategy for generating datasets with 1D variables

Following suggestions from @Zac-HD

* Make pep8 happy

* Autoformat test file

* Skip hypothesis tests if hypothesis not available

* Don't require hypothesis for conftest file

* Mark failing test as xfail
---
 properties/conftest.py              |  8 +++
 properties/test_encode_decode.py    |  7 +--
 properties/test_pandas_roundtrip.py | 97 +++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 6 deletions(-)
 create mode 100644 properties/conftest.py
 create mode 100644 properties/test_pandas_roundtrip.py

diff --git a/properties/conftest.py b/properties/conftest.py
new file mode 100644
index 00000000000..0a66d92ebc6
--- /dev/null
+++ b/properties/conftest.py
@@ -0,0 +1,8 @@
+try:
+    from hypothesis import settings
+except ImportError:
+    pass
+else:
+    # No per-example deadline - arrays are a bigger search space than usual
+    settings.register_profile("ci", deadline=None, print_blob=True)
+    settings.load_profile("ci")
diff --git a/properties/test_encode_decode.py b/properties/test_encode_decode.py
index 011e7a922d1..221083e16a1 100644
--- a/properties/test_encode_decode.py
+++ b/properties/test_encode_decode.py
@@ -10,15 +10,10 @@
 
 import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
-from hypothesis import given, settings
+from hypothesis import given
 
 import xarray as xr
 
-# Run for a while - arrays are a bigger search space than usual
-settings.register_profile("ci", deadline=None)
-settings.load_profile("ci")
-
-
 an_array = npst.arrays(
     dtype=st.one_of(
         npst.unsigned_integer_dtypes(), npst.integer_dtypes(), npst.floating_dtypes()
diff --git a/properties/test_pandas_roundtrip.py b/properties/test_pandas_roundtrip.py
new file mode 100644
index 00000000000..a8005d319d6
--- /dev/null
+++ b/properties/test_pandas_roundtrip.py
@@ -0,0 +1,97 @@
+"""
+Property-based tests for roundtripping between xarray and pandas objects.
+"""
+import pytest
+
+pytest.importorskip("hypothesis")
+
+from functools import partial
+import hypothesis.extra.numpy as npst
+import hypothesis.extra.pandas as pdst
+import hypothesis.strategies as st
+from hypothesis import given
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+numeric_dtypes = st.one_of(
+    npst.unsigned_integer_dtypes(), npst.integer_dtypes(), npst.floating_dtypes()
+)
+
+numeric_series = numeric_dtypes.flatmap(lambda dt: pdst.series(dtype=dt))
+
+an_array = npst.arrays(
+    dtype=numeric_dtypes,
+    shape=npst.array_shapes(max_dims=2),  # can only convert 1D/2D to pandas
+)
+
+
+@st.composite
+def datasets_1d_vars(draw):
+    """Generate datasets with only 1D variables
+
+    Suitable for converting to pandas dataframes.
+    """
+    # Generate an index for the dataset
+    idx = draw(pdst.indexes(dtype="u8", min_size=0, max_size=100))
+
+    # Generate 1-3 variables, each 1D and the same length as the index
+    vars_strategy = st.dictionaries(
+        keys=st.text(),
+        values=npst.arrays(dtype=numeric_dtypes, shape=len(idx)).map(
+            partial(xr.Variable, ("rows",))
+        ),
+        min_size=1,
+        max_size=3,
+    )
+    return xr.Dataset(draw(vars_strategy), coords={"rows": idx})
+
+
+@given(st.data(), an_array)
+def test_roundtrip_dataarray(data, arr):
+    names = data.draw(
+        st.lists(st.text(), min_size=arr.ndim, max_size=arr.ndim, unique=True).map(
+            tuple
+        )
+    )
+    coords = {name: np.arange(n) for (name, n) in zip(names, arr.shape)}
+    original = xr.DataArray(arr, dims=names, coords=coords)
+    roundtripped = xr.DataArray(original.to_pandas())
+    xr.testing.assert_identical(original, roundtripped)
+
+
+@given(datasets_1d_vars())
+def test_roundtrip_dataset(dataset):
+    df = dataset.to_dataframe()
+    assert isinstance(df, pd.DataFrame)
+    roundtripped = xr.Dataset(df)
+    xr.testing.assert_identical(dataset, roundtripped)
+
+
+@given(numeric_series, st.text())
+def test_roundtrip_pandas_series(ser, ix_name):
+    # Need to name the index, otherwise xarray calls it 'dim_0'.
+    ser.index.name = ix_name
+    arr = xr.DataArray(ser)
+    roundtripped = arr.to_pandas()
+    pd.testing.assert_series_equal(ser, roundtripped)
+    xr.testing.assert_identical(arr, roundtripped.to_xarray())
+
+
+# Dataframes with columns of all the same dtype - for roundtrip to DataArray
+numeric_homogeneous_dataframe = numeric_dtypes.flatmap(
+    lambda dt: pdst.data_frames(columns=pdst.columns(["a", "b", "c"], dtype=dt))
+)
+
+
+@pytest.mark.xfail
+@given(numeric_homogeneous_dataframe)
+def test_roundtrip_pandas_dataframe(df):
+    # Need to name the indexes, otherwise xarray names them 'dim_0', 'dim_1'.
+    df.index.name = "rows"
+    df.columns.name = "cols"
+    arr = xr.DataArray(df)
+    roundtripped = arr.to_pandas()
+    pd.testing.assert_frame_equal(df, roundtripped)
+    xr.testing.assert_identical(arr, roundtripped.to_xarray())