From f115ad155067727882b683ca6fa7c231621dc965 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver
Date: Wed, 30 Oct 2019 14:28:51 +0000
Subject: [PATCH] Hypothesis tests for roundtrip to & from pandas (#3285)

* Move hypothesis deadline configuration to conftest.py
* Add simple roundtrip test for xarray-pandas-xarray
* Test roundtrip pd.Series->DataArray->Series
* Test roundtrip DataFrame->DataArray->DataFrame
* Test roundtrip Dataset->Dataframe->Dataset
* Relax to allow 0 entries in each dataset var
* Relax to allow empty string names
* Add print_blob to config
* Extra half-roundtrip from pandas series to xarray
* Extra half roundtrip from pandas dataframe to Xarray
* Redesign strategy for generating datasets with 1D variables
  Following suggestions from @Zac-HD
* Make pep8 happy
* Autoformat test file
* Skip hypothesis tests if hypothesis not available
* Don't require hypothesis for conftest file
* Mark failing test as xfail
---
 properties/conftest.py              |  8 +++
 properties/test_encode_decode.py    |  7 +--
 properties/test_pandas_roundtrip.py | 97 +++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 6 deletions(-)
 create mode 100644 properties/conftest.py
 create mode 100644 properties/test_pandas_roundtrip.py

diff --git a/properties/conftest.py b/properties/conftest.py
new file mode 100644
index 00000000000..0a66d92ebc6
--- /dev/null
+++ b/properties/conftest.py
@@ -0,0 +1,8 @@
+try:
+    from hypothesis import settings
+except ImportError:
+    pass
+else:
+    # Run for a while - arrays are a bigger search space than usual
+    settings.register_profile("ci", deadline=None, print_blob=True)
+    settings.load_profile("ci")
diff --git a/properties/test_encode_decode.py b/properties/test_encode_decode.py
index 011e7a922d1..221083e16a1 100644
--- a/properties/test_encode_decode.py
+++ b/properties/test_encode_decode.py
@@ -10,15 +10,10 @@
 
 import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
-from hypothesis import given, settings
+from hypothesis import given
 
 import xarray as xr
 
-# Run for a while - arrays are a bigger search space than usual
-settings.register_profile("ci", deadline=None)
-settings.load_profile("ci")
-
-
 an_array = npst.arrays(
     dtype=st.one_of(
         npst.unsigned_integer_dtypes(), npst.integer_dtypes(), npst.floating_dtypes()
diff --git a/properties/test_pandas_roundtrip.py b/properties/test_pandas_roundtrip.py
new file mode 100644
index 00000000000..a8005d319d6
--- /dev/null
+++ b/properties/test_pandas_roundtrip.py
@@ -0,0 +1,97 @@
+"""
+Property-based tests for roundtripping between xarray and pandas objects.
+"""
+import pytest
+
+pytest.importorskip("hypothesis")
+
+from functools import partial
+import hypothesis.extra.numpy as npst
+import hypothesis.extra.pandas as pdst
+import hypothesis.strategies as st
+from hypothesis import given
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+numeric_dtypes = st.one_of(
+    npst.unsigned_integer_dtypes(), npst.integer_dtypes(), npst.floating_dtypes()
+)
+
+numeric_series = numeric_dtypes.flatmap(lambda dt: pdst.series(dtype=dt))
+
+an_array = npst.arrays(
+    dtype=numeric_dtypes,
+    shape=npst.array_shapes(max_dims=2),  # can only convert 1D/2D to pandas
+)
+
+
+@st.composite
+def datasets_1d_vars(draw):
+    """Generate datasets with only 1D variables
+
+    Suitable for converting to pandas dataframes.
+    """
+    # Generate an index for the dataset
+    idx = draw(pdst.indexes(dtype="u8", min_size=0, max_size=100))
+
+    # Generate 1-3 variables, 1D with the same length as the index
+    vars_strategy = st.dictionaries(
+        keys=st.text(),
+        values=npst.arrays(dtype=numeric_dtypes, shape=len(idx)).map(
+            partial(xr.Variable, ("rows",))
+        ),
+        min_size=1,
+        max_size=3,
+    )
+    return xr.Dataset(draw(vars_strategy), coords={"rows": idx})
+
+
+@given(st.data(), an_array)
+def test_roundtrip_dataarray(data, arr):
+    names = data.draw(
+        st.lists(st.text(), min_size=arr.ndim, max_size=arr.ndim, unique=True).map(
+            tuple
+        )
+    )
+    coords = {name: np.arange(n) for (name, n) in zip(names, arr.shape)}
+    original = xr.DataArray(arr, dims=names, coords=coords)
+    roundtripped = xr.DataArray(original.to_pandas())
+    xr.testing.assert_identical(original, roundtripped)
+
+
+@given(datasets_1d_vars())
+def test_roundtrip_dataset(dataset):
+    df = dataset.to_dataframe()
+    assert isinstance(df, pd.DataFrame)
+    roundtripped = xr.Dataset(df)
+    xr.testing.assert_identical(dataset, roundtripped)
+
+
+@given(numeric_series, st.text())
+def test_roundtrip_pandas_series(ser, ix_name):
+    # Need to name the index, otherwise Xarray calls it 'dim_0'.
+    ser.index.name = ix_name
+    arr = xr.DataArray(ser)
+    roundtripped = arr.to_pandas()
+    pd.testing.assert_series_equal(ser, roundtripped)
+    xr.testing.assert_identical(arr, roundtripped.to_xarray())
+
+
+# Dataframes with columns of all the same dtype - for roundtrip to DataArray
+numeric_homogeneous_dataframe = numeric_dtypes.flatmap(
+    lambda dt: pdst.data_frames(columns=pdst.columns(["a", "b", "c"], dtype=dt))
+)
+
+
+@pytest.mark.xfail
+@given(numeric_homogeneous_dataframe)
+def test_roundtrip_pandas_dataframe(df):
+    # Need to name the indexes, otherwise Xarray names them 'dim_0', 'dim_1'.
+    df.index.name = "rows"
+    df.columns.name = "cols"
+    arr = xr.DataArray(df)
+    roundtripped = arr.to_pandas()
+    pd.testing.assert_frame_equal(df, roundtripped)
+    xr.testing.assert_identical(arr, roundtripped.to_xarray())