Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support explicitly setting a dimension order with to_dataframe() #4333

Merged
merged 6 commits into from
Aug 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ New Features
of :py:class:`DataArray` and :py:class:`Dataset` objects and
document the new method in :doc:`internals`. (:pull:`4248`).
By `Justus Magin <https://github.com/keewis>`_.
- :py:meth:`~xarray.DataArray.to_dataframe` and :py:meth:`~xarray.Dataset.to_dataframe`
now accept a ``dim_order`` parameter that specifies the order of the dimensions
in the resulting dataframe (:issue:`4331`, :pull:`4333`).
By `Thomas Zilio <https://github.com/thomas-z>`_.


Bug fixes
Expand Down
34 changes: 31 additions & 3 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -2365,13 +2365,36 @@ def to_pandas(self) -> Union["DataArray", pd.Series, pd.DataFrame]:
indexes = [self.get_index(dim) for dim in self.dims]
return constructor(self.values, *indexes)

def to_dataframe(self, name: Hashable = None) -> pd.DataFrame:
def to_dataframe(
self, name: Hashable = None, dim_order: List[Hashable] = None
) -> pd.DataFrame:
"""Convert this array and its coordinates into a tidy pandas.DataFrame.

The DataFrame is indexed by the Cartesian product of index coordinates
(in the form of a :py:class:`pandas.MultiIndex`).

Other coordinates are included as columns in the DataFrame.

Parameters
----------
name
Name to give to this array (required if unnamed).
dim_order
Hierarchical dimension order for the resulting dataframe.
Array content is transposed to this order and then written out as flat
vectors in contiguous order, so the last dimension in this list
will be contiguous in the resulting DataFrame. This has a major
influence on which operations are efficient on the resulting
dataframe.

If provided, must include all dimensions of this DataArray. By default,
dimensions are sorted according to the DataArray dimensions order.

Returns
-------
result
DataArray as a pandas DataFrame.

"""
if name is None:
name = self.name
Expand All @@ -2381,15 +2404,20 @@ def to_dataframe(self, name: Hashable = None) -> pd.DataFrame:
"DataFrame: use the ``name`` parameter"
)

dims = dict(zip(self.dims, self.shape))
# By using a unique name, we can convert a DataArray into a DataFrame
# even if it shares a name with one of its coordinates.
# I would normally use unique_name = object() but that results in a
# dataframe with columns in the wrong order, for reasons I have not
# been able to debug (possibly a pandas bug?).
unique_name = "__unique_name_identifier_z98xfz98xugfg73ho__"
ds = self._to_dataset_whole(name=unique_name)
df = ds._to_dataframe(dims)

if dim_order is None:
dcherian marked this conversation as resolved.
Show resolved Hide resolved
ordered_dims = dict(zip(self.dims, self.shape))
else:
ordered_dims = ds._normalize_dim_order(dim_order=dim_order)

df = ds._to_dataframe(ordered_dims)
df.columns = [name if c == unique_name else c for c in df.columns]
return df

Expand Down
78 changes: 62 additions & 16 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4524,23 +4524,75 @@ def to_array(self, dim="variable", name=None):
data, coords, dims, attrs=self.attrs, name=name, indexes=indexes
)

def _to_dataframe(self, ordered_dims):
def _normalize_dim_order(
    self, dim_order: List[Hashable] = None
) -> Dict[Hashable, int]:
    """
    Validate a requested dimension order and map each dimension to its size.

    Parameters
    ----------
    dim_order
        Requested ordering of the dimension names. If None, the order in
        which ``self.dims`` iterates is used.

    Returns
    -------
    result
        Mapping from dimension name to dimension size, with its keys in the
        requested order.

    Raises
    ------
    ValueError
        If ``dim_order`` is given and its set of names differs from the set
        of dimensions in ``self.dims``.

    """
    known_dims = self.dims

    if dim_order is not None:
        # Only set equality is checked: every dimension must be named and
        # no unknown name may appear.
        if set(dim_order) != set(known_dims):
            raise ValueError(
                "dim_order {} does not match the set of dimensions of this "
                "Dataset: {}".format(dim_order, list(known_dims))
            )
        names = dim_order
    else:
        names = list(known_dims)

    # Insertion order of the resulting dict carries the dimension order.
    sizes: Dict[Hashable, int] = {}
    for dim in names:
        sizes[dim] = known_dims[dim]
    return sizes

def _to_dataframe(self, ordered_dims: Mapping[Hashable, int]):
columns = [k for k in self.variables if k not in self.dims]
data = [
self._variables[k].set_dims(ordered_dims).values.reshape(-1)
for k in columns
]
index = self.coords.to_index(ordered_dims)
index = self.coords.to_index([*ordered_dims])
return pd.DataFrame(dict(zip(columns, data)), index=index)

def to_dataframe(self):
def to_dataframe(self, dim_order: List[Hashable] = None) -> pd.DataFrame:
"""Convert this dataset into a pandas.DataFrame.

Non-index variables in this dataset form the columns of the
DataFrame. The DataFrame is be indexed by the Cartesian product of
DataFrame. The DataFrame is indexed by the Cartesian product of
this dataset's indices.

Parameters
----------
dim_order
Hierarchical dimension order for the resulting dataframe. All
arrays are transposed to this order and then written out as flat
vectors in contiguous order, so the last dimension in this list
will be contiguous in the resulting DataFrame. This has a major
influence on which operations are efficient on the resulting
dataframe.

If provided, must include all dimensions of this dataset. By
default, dimensions are sorted alphabetically.

Returns
-------
result
Dataset as a pandas DataFrame.

"""
return self._to_dataframe(self.dims)

ordered_dims = self._normalize_dim_order(dim_order=dim_order)

return self._to_dataframe(ordered_dims=ordered_dims)

def _set_sparse_data_from_dataframe(
self, idx: pd.Index, arrays: List[Tuple[Hashable, np.ndarray]], dims: tuple
Expand Down Expand Up @@ -4694,11 +4746,11 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
influence on which operations are efficient on the resulting dask
dataframe.

If provided, must include all dimensions on this dataset. By
If provided, must include all dimensions of this dataset. By
default, dimensions are sorted alphabetically.
set_index : bool, optional
If set_index=True, the dask DataFrame is indexed by this dataset's
coordinate. Since dask DataFrames to not support multi-indexes,
coordinate. Since dask DataFrames do not support multi-indexes,
set_index only works if the dataset only contains one dimension.

Returns
Expand All @@ -4709,15 +4761,7 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
import dask.array as da
import dask.dataframe as dd

if dim_order is None:
dim_order = list(self.dims)
elif set(dim_order) != set(self.dims):
raise ValueError(
"dim_order {} does not match the set of dimensions on this "
"Dataset: {}".format(dim_order, list(self.dims))
)

ordered_dims = {k: self.dims[k] for k in dim_order}
ordered_dims = self._normalize_dim_order(dim_order=dim_order)

columns = list(ordered_dims)
columns.extend(k for k in self.coords if k not in self.dims)
Expand All @@ -4744,6 +4788,8 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
df = dd.concat(series_list, axis=1)

if set_index:
dim_order = [*ordered_dims]

if len(dim_order) == 1:
(dim,) = dim_order
df = df.set_index(dim)
Expand Down
12 changes: 9 additions & 3 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3463,15 +3463,18 @@ def test_to_pandas(self):

def test_to_dataframe(self):
# regression test for #260
arr = DataArray(
np.random.randn(3, 4), [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo"
)
arr_np = np.random.randn(3, 4)

arr = DataArray(arr_np, [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo")
expected = arr.to_series()
actual = arr.to_dataframe()["foo"]
assert_array_equal(expected.values, actual.values)
assert_array_equal(expected.name, actual.name)
assert_array_equal(expected.index.values, actual.index.values)

actual = arr.to_dataframe(dim_order=["A", "B"])["foo"]
assert_array_equal(arr_np.transpose().reshape(-1), actual.values)

# regression test for coords with different dimensions
arr.coords["C"] = ("B", [-1, -2, -3])
expected = arr.to_series().to_frame()
Expand All @@ -3482,6 +3485,9 @@ def test_to_dataframe(self):
assert_array_equal(expected.columns.values, actual.columns.values)
assert_array_equal(expected.index.values, actual.index.values)

with pytest.raises(ValueError, match="does not match the set of dimensions"):
arr.to_dataframe(dim_order=["B", "A", "C"])

arr.name = None # unnamed
with raises_regex(ValueError, "unnamed"):
arr.to_dataframe()
Expand Down
27 changes: 27 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3939,6 +3939,33 @@ def test_to_and_from_dataframe(self):
# check roundtrip
assert_identical(ds.assign_coords(x=[0, 1]), Dataset.from_dataframe(actual))

# Check multiindex reordering
new_order = ["x", "y"]
actual = ds.to_dataframe(dim_order=new_order)
assert expected.equals(actual)

new_order = ["y", "x"]
exp_index = pd.MultiIndex.from_arrays(
[["a", "a", "b", "b", "c", "c"], [0, 1, 0, 1, 0, 1]], names=["y", "x"]
)
expected = pd.DataFrame(
w.transpose().reshape(-1), columns=["w"], index=exp_index
)
actual = ds.to_dataframe(dim_order=new_order)
assert expected.equals(actual)

invalid_order = ["x"]
with pytest.raises(
ValueError, match="does not match the set of dimensions of this"
):
ds.to_dataframe(dim_order=invalid_order)

invalid_order = ["x", "z"]
with pytest.raises(
ValueError, match="does not match the set of dimensions of this"
):
ds.to_dataframe(dim_order=invalid_order)

# check pathological cases
df = pd.DataFrame([1])
actual = Dataset.from_dataframe(df)
Expand Down