Skip to content

Commit

Permalink
Cast PandasIndex to pd.(Multi)Index (pydata#5385)
Browse files Browse the repository at this point in the history
* fastpath cast Xarray's PandasIndex to pd.Index

Also make sure that a multi-index with one unique level is not cast to
a simple pd.Index

* update tests

* [skip-ci] update what's new
  • Loading branch information
benbovy authored and TomNicholas committed May 28, 2021
1 parent 2a3965c commit 95ebf95
Show file tree
Hide file tree
Showing 6 changed files with 239 additions and 0 deletions.
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ Bug fixes
:py:class:`CFTimeIndex` and upcoming pandas version 1.3.0 (:issue:`5356`,
:pull:`5359`).
By `Spencer Clark <https://github.com/spencerkclark>`_.
- Fix 1-level multi-index incorrectly converted to single index (:issue:`5384`,
:pull:`5385`).
By `Benoit Bovy <https://github.com/benbovy>`_.


Documentation
Expand Down
76 changes: 76 additions & 0 deletions xarray/core/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1724,3 +1724,79 @@ def _calc_idxminmax(
res.attrs = indx.attrs

return res


def hist(*datarrays, bins=None, dim=None, weights=None, density=False):
    """
    Histogram applied along specified dimensions.

    If the supplied arguments are chunked dask arrays it will use
    `dask.array.blockwise` internally to parallelize over all chunks.

    .. note::
        This function is currently a placeholder and always raises
        ``NotImplementedError``. The documentation below describes the
        intended behaviour.

    Parameters
    ----------
    *datarrays : xarray.DataArray objects
        Input data. The number of input arguments determines the dimensionality
        of the histogram. For example, two arguments produce a 2D histogram.
    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
        If a list, there should be one entry for each item in ``datarrays``.
        The bin specification:

          * If int, the number of bins for all arguments in ``datarrays``.
          * If array_like, the bin edges for all arguments in ``datarrays``.
          * If a list of ints, the number of bins for every argument in
            ``datarrays``.
          * If a list of arrays, the bin edges for each argument in
            ``datarrays`` (required format for Dask inputs).
          * A combination [int, array] or [array, int], where int
            is the number of bins and array is the bin edges.
          * If a list of DataArrays, the bins for each argument in
            ``datarrays``. The DataArrays can be multidimensional, but must
            not have any dimensions shared with the `dim` argument.

        When bin edges are specified, all but the last (righthand-most) bin
        include the left edge and exclude the right edge. The last bin
        includes both edges.

        A ``TypeError`` will be raised if ``datarrays`` contains dask arrays
        and ``bins`` are not specified explicitly as a list of arrays.
    dim : tuple of strings, optional
        Dimensions over which the histogram is computed. The default is to
        compute the histogram of the flattened array, i.e. over all dimensions.
    weights : array_like, optional
        An array of weights, of the same shape as the input data. Each value
        in the input data only contributes its associated weight towards the
        bin count (instead of 1). If `density` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1. NaNs in the weights input will fill the entire bin with
        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
        before running ``hist()``.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unit
        width are chosen; it is not a probability *mass* function.

    Returns
    -------
    hist : xarray.DataArray
        A single dataarray which contains the values of the histogram. See
        `density` and `weights` for a description of the possible semantics.
        The returned dataarray will have one additional coordinate for each
        dataarray supplied, named as `var_bins`, which contains the positions
        of the centres of each bin.

    Raises
    ------
    NotImplementedError
        Always, until the implementation is added.

    See Also
    --------
    DataArray.hist
    Dataset.hist
    numpy.histogramdd
    dask.array.blockwise
    """

    # TODO xhistogram code goes here
    raise NotImplementedError("hist is not implemented yet")
74 changes: 74 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4599,6 +4599,80 @@ def drop_duplicates(
indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)}
return self.isel(indexes)

def hist(self, bins=None, dim=None, weights=None, density=False):
    """
    Histogram of this array applied along specified dimensions.

    If the underlying data is a chunked dask array it will use
    `dask.array.blockwise` internally to parallelize over all chunks.

    This is a thin wrapper that forwards this array, as the single input,
    to ``computation.hist``.

    Parameters
    ----------
    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
        The bin specification:

          * If int, the number of bins.
          * If array_like, the bin edges.
          * If a list, there should be one entry for each input array
            (ints giving the number of bins, arrays giving the bin edges;
            a list of arrays is the required format for Dask inputs).
          * If a list of DataArrays, the bins for each input array.
            The DataArrays can be multidimensional, but must not have any
            dimensions shared with the `dim` argument.

        When bin edges are specified, all but the last (righthand-most) bin
        include the left edge and exclude the right edge. The last bin
        includes both edges.

        A ``TypeError`` will be raised if the input is a dask array and
        ``bins`` are not specified explicitly as a list of arrays.
    dim : tuple of strings, optional
        Dimensions over which the histogram is computed. The default is to
        compute the histogram of the flattened array, i.e. over all dimensions.
    weights : array_like, optional
        An array of weights, of the same shape as this array. Each value
        in this array only contributes its associated weight towards the
        bin count (instead of 1). If `density` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1. NaNs in the weights input will fill the entire bin with
        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
        before running ``hist()``.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unit
        width are chosen; it is not a probability *mass* function.

    Returns
    -------
    hist : xarray.DataArray
        A single dataarray which contains the values of the histogram. See
        `density` and `weights` for a description of the possible semantics.
        The returned dataarray will have one additional coordinate for each
        dataarray supplied, named as `var_bins`, which contains the positions
        of the centres of each bin.

    See Also
    --------
    xarray.hist
    Dataset.hist
    numpy.histogramdd
    dask.array.blockwise
    """

    # ``computation.hist`` collects its inputs through ``*datarrays``, so the
    # array has to be passed positionally; ``datarrays=self`` would raise
    # ``TypeError: hist() got an unexpected keyword argument 'datarrays'``.
    return computation.hist(
        self, dim=dim, bins=bins, weights=weights, density=density
    )

# this needs to be at the end, or mypy will confuse with `str`
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
str = utils.UncachedAccessor(StringAccessor)
81 changes: 81 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7627,3 +7627,84 @@ def _wrapper(Y, *coords_, **kwargs):
result.attrs = self.attrs.copy()

return result

def hist(self, vars=None, bins=None, dim=None, weights=None, density=False):
    """
    Histogram of selected variables applied along specified dimensions.

    If the selected variables are chunked dask arrays it will use
    `dask.array.blockwise` internally to parallelize over all chunks.

    Each name in ``vars`` is looked up on this dataset and the resulting
    arrays are forwarded, in order, as the positional inputs of
    ``computation.hist``.

    Parameters
    ----------
    vars : list of str
        Variables on the Dataset to use as input data. The number of variables
        determines the dimensionality of the histogram. For example, two
        variables produce a 2D histogram. A single name given as a plain
        string is treated as a one-element list.
    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
        If a list, there should be one entry for each item in ``vars``.
        The bin specification:

          * If int, the number of bins for all variables in ``vars``.
          * If array_like, the bin edges for all variables in ``vars``.
          * If a list of ints, the number of bins for every variable in
            ``vars``.
          * If a list of arrays, the bin edges for each variable in ``vars``
            (required format for Dask inputs).
          * A combination [int, array] or [array, int], where int
            is the number of bins and array is the bin edges.
          * If a list of DataArrays, the bins for each variable in ``vars``.
            The DataArrays can be multidimensional, but must not have any
            dimensions shared with the `dim` argument.

        When bin edges are specified, all but the last (righthand-most) bin
        include the left edge and exclude the right edge. The last bin
        includes both edges.

        A ``TypeError`` will be raised if the inputs contain dask arrays and
        ``bins`` are not specified explicitly as a list of arrays.
    dim : tuple of strings, optional
        Dimensions over which the histogram is computed. The default is to
        compute the histogram of the flattened array, i.e. over all dimensions.
    weights : array_like, optional
        An array of weights, of the same shape as the input data. Each value
        in the input data only contributes its associated weight towards the
        bin count (instead of 1). If `density` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1. NaNs in the weights input will fill the entire bin with
        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
        before running ``hist()``.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unit
        width are chosen; it is not a probability *mass* function.

    Returns
    -------
    hist : xarray.DataArray
        A single dataarray which contains the values of the histogram. See
        `density` and `weights` for a description of the possible semantics.
        The returned dataarray will have one additional coordinate for each
        dataarray supplied, named as `var_bins`, which contains the positions
        of the centres of each bin.

    Raises
    ------
    TypeError
        If ``vars`` is not given.

    See Also
    --------
    xarray.hist
    DataArray.hist
    numpy.histogramdd
    dask.array.blockwise
    """

    # NOTE: ``vars`` shadows the builtin of the same name; it is kept because
    # it is part of the public signature.
    if vars is None:
        # Iterating ``None`` below would raise an opaque
        # "'NoneType' object is not iterable"; fail with the same exception
        # type but an actionable message.
        # NOTE(review): decide whether ``None`` should instead select every
        # data variable.
        raise TypeError(
            "vars must be a list of variable names to histogram, got None"
        )
    if isinstance(vars, str):
        # A bare string would otherwise be iterated character by character.
        vars = [vars]

    from .computation import hist

    # ``hist`` collects its inputs through ``*datarrays``, so the selected
    # arrays must be unpacked positionally; passing ``datarrays=[...]`` would
    # raise ``TypeError: hist() got an unexpected keyword argument``.
    return hist(
        *[self[var] for var in vars],
        dim=dim,
        bins=bins,
        weights=weights,
        density=density,
    )
2 changes: 2 additions & 0 deletions xarray/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ def safe_cast_to_index(array: Any) -> pd.Index:
index = array
elif hasattr(array, "to_index"):
index = array.to_index()
elif hasattr(array, "to_pandas_index"):
index = array.to_pandas_index()
else:
kwargs = {}
if hasattr(array, "dtype") and array.dtype.kind == "O":
Expand Down
3 changes: 3 additions & 0 deletions xarray/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from xarray.coding.cftimeindex import CFTimeIndex
from xarray.core import duck_array_ops, utils
from xarray.core.indexes import PandasIndex
from xarray.core.utils import either_dict_or_kwargs

from . import assert_array_equal, requires_cftime, requires_dask
Expand All @@ -28,11 +29,13 @@ def test_safe_cast_to_index():
dates = pd.date_range("2000-01-01", periods=10)
x = np.arange(5)
td = x * np.timedelta64(1, "D")
midx = pd.MultiIndex.from_tuples([(0,)], names=["a"])
for expected, array in [
(dates, dates.values),
(pd.Index(x, dtype=object), x.astype(object)),
(pd.Index(td), td),
(pd.Index(td, dtype=object), td.astype(object)),
(midx, PandasIndex(midx)),
]:
actual = utils.safe_cast_to_index(array)
assert_array_equal(expected, actual)
Expand Down

0 comments on commit 95ebf95

Please sign in to comment.