pydata · TomNicholas · May 28, 2021 · May 28, 2021
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -44,6 +44,9 @@ Bug fixes
   :py:class:`CFTimeIndex` and upcoming pandas version 1.3.0 (:issue:`5356`,
   :pull:`5359`).
   By `Spencer Clark <https://github.com/spencerkclark>`_.
+- Fix 1-level multi-index incorrectly converted to single index (:issue:`5384`,
+  :pull:`5385`).
+  By `Benoit Bovy <https://github.com/benbovy>`_.
 
 
 Documentation

diff --git a/xarray/core/computation.py b/xarray/core/computation.py
@@ -1724,3 +1724,79 @@ def _calc_idxminmax(
     res.attrs = indx.attrs
 
     return res
+
+
+def hist(*datarrays, bins=None, dim=None, weights=None, density=False):
+    """
+    Histogram applied along specified dimensions.
+
+    If the supplied arguments are chunked dask arrays it will use
+    `dask.array.blockwise` internally to parallelize over all chunks.
+
+    Parameters
+    ----------
+    datarrays : xarray.DataArray objects
+        Input data. The number of input arguments determines the dimensionality of
+        the histogram. For example, two arguments produce a 2D histogram.
+    dim : tuple of strings, optional
+        Dimensions over which the histogram is computed. The default is to
+        compute the histogram of the flattened array, i.e. over all dimensions.
+    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
+        If a list, there should be one entry for each item in ``datarrays``.
+        The bin specification:
+
+          * If int, the number of bins for all arguments in ``datarrays``.
+          * If array_like, the bin edges for all arguments in ``datarrays``.
+          * If a list of ints, the number of bins for every argument in ``datarrays``.
+          * If a list of arrays, the bin edges for each argument in ``datarrays``
+            (required format for Dask inputs).
+          * A combination [int, array] or [array, int], where int
+            is the number of bins and array is the bin edges.
+          * If a list of DataArrays, the bins for each argument in ``datarrays``.
+            The DataArrays can be multidimensional, but must not have any
+            dimensions shared with the `dim` argument.
+
+        When bin edges are specified, all but the last (righthand-most) bin include
+        the left edge and exclude the right edge. The last bin includes both edges.
+
+        A ``TypeError`` will be raised if ``datarrays`` contains dask arrays and
+        ``bins`` are not specified explicitly as a list of arrays.
+    weights : array_like, optional
+        An array of weights, of the same shape as the input data. Each value
+        in the data only contributes its associated weight towards the bin count
+        (instead of 1). If `density` is True, the weights are
+        normalized, so that the integral of the density over the range
+        remains 1. NaNs in the weights input will fill the entire bin with
+        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
+        before running ``hist()``.
+    density : bool, optional
+        If ``False``, the result will contain the number of samples in
+        each bin. If ``True``, the result is the value of the
+        probability *density* function at the bin, normalized such that
+        the *integral* over the range is 1. Note that the sum of the
+        histogram values will not be equal to 1 unless bins of unit
+        width are chosen; it is not a probability *mass* function.
+
+    Returns
+    -------
+    hist : xarray.DataArray
+        A single dataarray which contains the values of the histogram. See
+        `density` and `weights` for a description of the possible semantics.
+
+        The returned dataarray will have one additional coordinate for each
+        dataarray supplied, named as `var_bins`, which contains the positions
+        of the centres of each bin.
+
+    Examples
+    --------
+
+    See Also
+    --------
+    DataArray.hist
+    Dataset.hist
+    numpy.histogramdd
+    dask.array.blockwise
+    """
+
+    # TODO: port the xhistogram implementation here; until then every call raises.
+    raise NotImplementedError
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -4599,6 +4599,80 @@ def drop_duplicates(
         indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)}
         return self.isel(indexes)
 
+    def hist(self, bins=None, dim=None, weights=None, density=False):
+        """
+        Histogram applied along specified dimensions.
+
+        If the underlying data is a chunked dask array it will use
+        `dask.array.blockwise` internally to parallelize over all chunks.
+        This array is passed as the only input to ``computation.hist``.
+
+        Parameters
+        ----------
+        dim : tuple of strings, optional
+            Dimensions over which the histogram is computed. The default is to
+            compute the histogram of the flattened array, i.e. over all dimensions.
+        bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
+            If a list, there should be one entry for each input array (one here).
+            The bin specification:
+
+              * If int, the number of bins for all input arrays.
+              * If array_like, the bin edges for all input arrays.
+              * If a list of ints, the number of bins for every input array.
+              * If a list of arrays, the bin edges for each input array
+                (required format for Dask inputs).
+              * A combination [int, array] or [array, int], where int
+                is the number of bins and array is the bin edges.
+              * If a list of DataArrays, the bins for each input array.
+                The DataArrays can be multidimensional, but must not have any
+                dimensions shared with the `dim` argument.
+
+            When bin edges are specified, all but the last (righthand-most) bin include
+            the left edge and exclude the right edge. The last bin includes both edges.
+
+            A ``TypeError`` will be raised if the input contains dask arrays and
+            ``bins`` are not specified explicitly as a list of arrays.
+        weights : array_like, optional
+            An array of weights, of the same shape as this array. Each value in
+            this array only contributes its associated weight towards the bin count
+            (instead of 1). If `density` is True, the weights are
+            normalized, so that the integral of the density over the range
+            remains 1. NaNs in the weights input will fill the entire bin with
+            NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
+            before running ``hist()``.
+        density : bool, optional
+            If ``False``, the result will contain the number of samples in
+            each bin. If ``True``, the result is the value of the
+            probability *density* function at the bin, normalized such that
+            the *integral* over the range is 1. Note that the sum of the
+            histogram values will not be equal to 1 unless bins of unit
+            width are chosen; it is not a probability *mass* function.
+
+        Returns
+        -------
+        hist : xarray.DataArray
+            A single dataarray which contains the values of the histogram. See
+            `density` and `weights` for a description of the possible semantics.
+
+            The returned dataarray will have one additional coordinate for each
+            dataarray supplied, named as `var_bins`, which contains the positions
+            of the centres of each bin.
+
+        Examples
+        --------
+
+        See Also
+        --------
+        xarray.hist
+        Dataset.hist
+        numpy.histogramdd
+        dask.array.blockwise
+        """
+        # computation.hist takes its inputs as *datarrays: pass self positionally.
+        return computation.hist(
+            self, dim=dim, bins=bins, weights=weights, density=density
+        )
+
     # this needs to be at the end, or mypy will confuse with `str`
     # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
     str = utils.UncachedAccessor(StringAccessor)
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -7627,3 +7627,84 @@ def _wrapper(Y, *coords_, **kwargs):
         result.attrs = self.attrs.copy()
 
         return result
+
+    def hist(self, vars=None, bins=None, dim=None, weights=None, density=False):
+        """
+        Histogram applied along specified dimensions.
+
+        Chunked dask inputs are processed in parallel via `dask.array.blockwise`.
+
+        Parameters
+        ----------
+        vars : list of str
+            Names of the variables to use as input data; their number sets the
+            dimensionality of the histogram. Required: ``None`` raises ``TypeError``.
+        dim : tuple of strings, optional
+            Dimensions over which the histogram is computed. The default is to
+            compute the histogram of the flattened array, i.e. over all dimensions.
+        bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
+            If a list, there should be one entry for each item in ``vars``.
+            The bin specification:
+
+              * If int, the number of bins for all variables in ``vars``.
+              * If array_like, the bin edges for all variables in ``vars``.
+              * If a list of ints, the number of bins for every variable in ``vars``.
+              * If a list of arrays, the bin edges for each variable in ``vars``
+                (required format for Dask inputs).
+              * A combination [int, array] or [array, int], where int
+                is the number of bins and array is the bin edges.
+              * If a list of DataArrays, the bins for each variable in ``vars``.
+                The DataArrays can be multidimensional, but must not have any
+                dimensions shared with the `dim` argument.
+
+            When bin edges are specified, all but the last (righthand-most) bin include
+            the left edge and exclude the right edge. The last bin includes both edges.
+
+            A ``TypeError`` will be raised if the selected variables are dask arrays
+            and ``bins`` are not specified explicitly as a list of arrays.
+        weights : array_like, optional
+            An array of weights, of the same shape as the input data. Each value
+            in the data only contributes its associated weight towards the bin count
+            (instead of 1). If `density` is True, the weights are
+            normalized, so that the integral of the density over the range
+            remains 1. NaNs in the weights input will fill the entire bin with
+            NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
+            before running ``hist()``.
+        density : bool, optional
+            If ``False``, the result will contain the number of samples in
+            each bin. If ``True``, the result is the value of the
+            probability *density* function at the bin, normalized such that
+            the *integral* over the range is 1. Note that the sum of the
+            histogram values will not be equal to 1 unless bins of unit
+            width are chosen; it is not a probability *mass* function.
+
+        Returns
+        -------
+        hist : xarray.DataArray
+            A single dataarray which contains the values of the histogram. See
+            `density` and `weights` for a description of the possible semantics.
+
+            The returned dataarray will have one additional coordinate for each
+            dataarray supplied, named as `var_bins`, which contains the positions
+            of the centres of each bin.
+
+        Examples
+        --------
+
+        See Also
+        --------
+        xarray.hist
+        DataArray.hist
+        numpy.histogramdd
+        dask.array.blockwise
+        """
+        # computation.hist takes its inputs as *datarrays, so they are unpacked below.
+        from .computation import hist
+
+        return hist(
+            *(self[var] for var in vars),
+            dim=dim,
+            bins=bins,
+            weights=weights,
+            density=density,
+        )
diff --git a/xarray/core/utils.py b/xarray/core/utils.py
@@ -107,6 +107,8 @@ def safe_cast_to_index(array: Any) -> pd.Index:
         index = array
     elif hasattr(array, "to_index"):
         index = array.to_index()
+    elif hasattr(array, "to_pandas_index"):
+        index = array.to_pandas_index()
     else:
         kwargs = {}
         if hasattr(array, "dtype") and array.dtype.kind == "O":

diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py
@@ -9,9 +9,8 @@
 import functools
 
 import numpy as np
-import pandas as pd
 
-from .facetgrid import _easy_facetgrid
+from .facetgrid import FacetGrid, _easy_facetgrid
 from .utils import (
     _add_colorbar,
     _assert_valid_xy,
@@ -379,6 +378,9 @@ def step(darray, *args, where="pre", drawstyle=None, ds=None, **kwargs):
 
 def hist(
     darray,
+    row=None,
+    col=None,
+    col_wrap=None,
     figsize=None,
     size=None,
     aspect=None,
@@ -391,19 +393,25 @@ def hist(
     yticks=None,
     xlim=None,
     ylim=None,
+    bins=None,
+    weights=None,
+    density=False,
     **kwargs,
 ):
     """
     Histogram of DataArray.
 
     Wraps :py:func:`matplotlib:matplotlib.pyplot.hist`.
 
-    Plots *N*-dimensional arrays by first flattening the array.
+    Plots *N*-dimensional arrays by first flattening the array and calculating
+    the histogram via :py:func:`DataArray.hist`.
 
     Parameters
     ----------
     darray : DataArray
-        Can have any number of dimensions.
+        Can have any number of dimensions, but will be reduced to 1 dimension
+        by the histogram calculation before plotting (faceted plots will retain
+        more dimensions).
     figsize : tuple, optional
         A tuple (width, height) of the figure in inches.
         Mutually exclusive with ``size`` and ``ax``.
@@ -416,16 +424,50 @@ def hist(
     ax : matplotlib axes object, optional
         Axes on which to plot. By default, use the current axes.
         Mutually exclusive with ``size`` and ``figsize``.
+    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
+        See :py:func:`DataArray.hist`.
+    weights : array_like, optional
+        See :py:func:`DataArray.hist`.
+    density : bool, optional
+        See :py:func:`DataArray.hist`.
     **kwargs : optional
         Additional keyword arguments to :py:func:`matplotlib:matplotlib.pyplot.hist`.
 
     """
+
+    # compute the dims to count over
+    reduce_dims = set(darray.dims) - set([row, col])
+
+    # Handle facetgrids first
+    if row or col:
+        allargs = locals().copy()
+        allargs.update(allargs.pop("kwargs"))
+        allargs.pop("darray")
+
+        g = FacetGrid(
+            data=darray,
+            col=col,
+            row=row,
+            col_wrap=col_wrap,
+            sharex=False,
+            sharey=False,
+            figsize=figsize,
+            aspect=aspect,
+            size=size,
+            subplot_kws=kwargs,
+        )
+
+        return g.map(hist, **kwargs)
+
     ax = get_axis(figsize, size, aspect, ax)
 
-    no_nan = np.ravel(darray.values)
-    no_nan = no_nan[pd.notnull(no_nan)]
+    h = darray.hist(bins=bins, dim=reduce_dims, weights=weights, density=density)
+    counts = h.values
+    bins = h.coords[f"{darray.name}_bins"].values
 
-    primitive = ax.hist(no_nan, **kwargs)
+    # Use the weights kwarg to avoid recomputing histogram in matplotlib
+    # (see matplotlib.pyplot.hist docstring)
+    primitive = ax.hist(bins[:-1], weights=counts, **kwargs)
 
     ax.set_title("Histogram")
     ax.set_xlabel(label_from_attrs(darray))

diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py
@@ -7,6 +7,7 @@
 
 from xarray.coding.cftimeindex import CFTimeIndex
 from xarray.core import duck_array_ops, utils
+from xarray.core.indexes import PandasIndex
 from xarray.core.utils import either_dict_or_kwargs
 
 from . import assert_array_equal, requires_cftime, requires_dask
@@ -28,11 +29,13 @@ def test_safe_cast_to_index():
     dates = pd.date_range("2000-01-01", periods=10)
     x = np.arange(5)
     td = x * np.timedelta64(1, "D")
+    midx = pd.MultiIndex.from_tuples([(0,)], names=["a"])
     for expected, array in [
         (dates, dates.values),
         (pd.Index(x, dtype=object), x.astype(object)),
         (pd.Index(td), td),
         (pd.Index(td, dtype=object), td.astype(object)),
+        (midx, PandasIndex(midx)),
     ]:
         actual = utils.safe_cast_to_index(array)
         assert_array_equal(expected, actual)