Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multi dimensional histogram (see #5400 instead) #5398

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ Bug fixes
:py:class:`CFTimeIndex` and upcoming pandas version 1.3.0 (:issue:`5356`,
:pull:`5359`).
By `Spencer Clark <https://github.com/spencerkclark>`_.
- Fix 1-level multi-index incorrectly converted to single index (:issue:`5384`,
:pull:`5385`).
By `Benoit Bovy <https://github.com/benbovy>`_.


Documentation
Expand Down
76 changes: 76 additions & 0 deletions xarray/core/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1724,3 +1724,79 @@ def _calc_idxminmax(
res.attrs = indx.attrs

return res


def hist(*datarrays, bins=None, dim=None, weights=None, density=False):
    """
    Histogram applied along specified dimensions.

    .. note::
        This function is currently a placeholder: it always raises
        ``NotImplementedError``. The documentation below describes the
        intended behaviour once the computation is implemented.

    If the supplied arguments are chunked dask arrays it will use
    `dask.array.blockwise` internally to parallelize over all chunks.

    Parameters
    ----------
    *datarrays : xarray.DataArray objects
        Input data. The number of input arguments determines the dimensionality
        of the histogram. For example, two arguments produce a 2D histogram.
    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
        If a list, there should be one entry for each item in ``datarrays``.
        The bin specification:

        * If int, the number of bins for all arguments in ``datarrays``.
        * If array_like, the bin edges for all arguments in ``datarrays``.
        * If a list of ints, the number of bins for every argument in
          ``datarrays``.
        * If a list of arrays, the bin edges for each argument in
          ``datarrays`` (required format for Dask inputs).
        * A combination [int, array] or [array, int], where int
          is the number of bins and array is the bin edges.
        * If a list of DataArrays, the bins for each argument in
          ``datarrays``. The DataArrays can be multidimensional, but must not
          have any dimensions shared with the ``dim`` argument.

        When bin edges are specified, all but the last (righthand-most) bin
        include the left edge and exclude the right edge. The last bin
        includes both edges.

        A ``TypeError`` will be raised if ``datarrays`` contains dask arrays
        and ``bins`` are not specified explicitly as a list of arrays.
    dim : tuple of strings, optional
        Dimensions over which the histogram is computed. The default is to
        compute the histogram of the flattened array, i.e. over all dimensions.
    weights : array_like, optional
        An array of weights, of the same shape as the input data. Each value
        in the input data only contributes its associated weight towards the
        bin count (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1. NaNs in the weights input will fill the entire bin with
        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
        before running ``hist()``.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unit
        width are chosen; it is not a probability *mass* function.

    Returns
    -------
    hist : xarray.DataArray
        A single dataarray which contains the values of the histogram. See
        ``density`` and ``weights`` for a description of the possible
        semantics.

        The returned dataarray will have one additional coordinate for each
        dataarray supplied, named ``<var>_bins`` (where ``<var>`` is the name
        of the corresponding dataarray), which contains the positions of the
        centres of each bin.

    Raises
    ------
    NotImplementedError
        Always, until the histogram computation is implemented.

    See Also
    --------
    DataArray.hist
    Dataset.hist
    numpy.histogramdd
    dask.array.blockwise
    """

    # TODO xhistogram code goes here
    raise NotImplementedError("hist() is not implemented yet")
74 changes: 74 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4599,6 +4599,80 @@ def drop_duplicates(
indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)}
return self.isel(indexes)

def hist(self, bins=None, dim=None, weights=None, density=False):
    """
    Histogram of this array, applied along specified dimensions.

    Equivalent to calling ``xarray.hist`` with this array as its single
    input; all other arguments are forwarded unchanged.

    If the array is a chunked dask array, ``xarray.hist`` is documented to
    use `dask.array.blockwise` internally to parallelize over all chunks.

    Parameters
    ----------
    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
        The bin specification, passed through to ``xarray.hist``:

        * If int, the number of bins.
        * If array_like, the bin edges.
        * If a list, there should be one entry for each input array
          (here: a single entry), each being an int (number of bins), an
          array of bin edges (required format for Dask inputs), or a
          DataArray of bins. DataArrays can be multidimensional, but must
          not have any dimensions shared with the ``dim`` argument.

        When bin edges are specified, all but the last (righthand-most) bin
        include the left edge and exclude the right edge. The last bin
        includes both edges.

        A ``TypeError`` will be raised if the array is a dask array and
        ``bins`` are not specified explicitly as a list of arrays.
    dim : tuple of strings, optional
        Dimensions over which the histogram is computed. The default is to
        compute the histogram of the flattened array, i.e. over all dimensions.
    weights : array_like, optional
        An array of weights, of the same shape as this array. Each value in
        this array only contributes its associated weight towards the bin
        count (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1. NaNs in the weights input will fill the entire bin with
        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
        before running ``hist()``.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unit
        width are chosen; it is not a probability *mass* function.

    Returns
    -------
    hist : xarray.DataArray
        A single dataarray which contains the values of the histogram. See
        ``density`` and ``weights`` for a description of the possible
        semantics.

        The returned dataarray will have an additional coordinate named
        ``<var>_bins`` (where ``<var>`` is the name of this array), which
        contains the positions of the centres of each bin.

    See Also
    --------
    xarray.hist
    Dataset.hist
    numpy.histogramdd
    dask.array.blockwise
    """

    # ``computation.hist`` collects its inputs through ``*datarrays``, which
    # can only be filled positionally; passing ``datarrays=self`` as a
    # keyword raises ``TypeError``.
    return computation.hist(
        self, bins=bins, dim=dim, weights=weights, density=density
    )

# this needs to be at the end, or mypy will confuse with `str`
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
str = utils.UncachedAccessor(StringAccessor)
81 changes: 81 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7627,3 +7627,84 @@ def _wrapper(Y, *coords_, **kwargs):
result.attrs = self.attrs.copy()

return result

def hist(self, vars=None, bins=None, dim=None, weights=None, density=False):
    """
    Histogram of one or more variables, applied along specified dimensions.

    Selects the named variables from this dataset and forwards them, in
    order, as the inputs of ``xarray.hist``.

    If the selected variables are chunked dask arrays, ``xarray.hist`` is
    documented to use `dask.array.blockwise` internally to parallelize over
    all chunks.

    Parameters
    ----------
    vars : str or list of str
        Variables on the Dataset to use as input data. The number of variables
        determines the dimensionality of the histogram. For example, two
        variables produce a 2D histogram. A single string is treated as a
        one-element list. Although the default is ``None``, a value must be
        supplied.
    bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
        If a list, there should be one entry for each item in ``vars``.
        The bin specification:

        * If int, the number of bins for all variables in ``vars``.
        * If array_like, the bin edges for all variables in ``vars``.
        * If a list of ints, the number of bins for every variable in
          ``vars``.
        * If a list of arrays, the bin edges for each variable in ``vars``
          (required format for Dask inputs).
        * A combination [int, array] or [array, int], where int
          is the number of bins and array is the bin edges.
        * If a list of DataArrays, the bins for each variable in ``vars``.
          The DataArrays can be multidimensional, but must not have any
          dimensions shared with the ``dim`` argument.

        When bin edges are specified, all but the last (righthand-most) bin
        include the left edge and exclude the right edge. The last bin
        includes both edges.

        A ``TypeError`` will be raised if the selected variables contain dask
        arrays and ``bins`` are not specified explicitly as a list of arrays.
    dim : tuple of strings, optional
        Dimensions over which the histogram is computed. The default is to
        compute the histogram of the flattened array, i.e. over all dimensions.
    weights : array_like, optional
        An array of weights, of the same shape as the input variables. Each
        value in the input only contributes its associated weight towards
        the bin count (instead of 1). If ``density`` is True, the weights are
        normalized, so that the integral of the density over the range
        remains 1. NaNs in the weights input will fill the entire bin with
        NaNs. If there are NaNs in the weights input call ``.fillna(0.)``
        before running ``hist()``.
    density : bool, optional
        If ``False``, the result will contain the number of samples in
        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unit
        width are chosen; it is not a probability *mass* function.

    Returns
    -------
    hist : xarray.DataArray
        A single dataarray which contains the values of the histogram. See
        ``density`` and ``weights`` for a description of the possible
        semantics.

        The returned dataarray will have one additional coordinate for each
        variable supplied, named ``<var>_bins`` (where ``<var>`` is the name
        of the corresponding variable), which contains the positions of the
        centres of each bin.

    Raises
    ------
    TypeError
        If ``vars`` is not given.

    See Also
    --------
    xarray.hist
    DataArray.hist
    numpy.histogramdd
    dask.array.blockwise
    """

    from .computation import hist

    if vars is None:
        # Fail with an explicit message instead of the opaque
        # "'NoneType' object is not iterable" from the loop below.
        raise TypeError(
            "Dataset.hist() requires `vars`: the name(s) of the variables "
            "to compute the histogram of"
        )
    if isinstance(vars, str):
        # A bare string would otherwise be iterated character by character.
        vars = [vars]

    # ``hist`` collects its inputs through ``*datarrays``, which can only be
    # filled positionally; passing ``datarrays=[...]`` as a keyword raises
    # ``TypeError``.
    return hist(
        *(self[var] for var in vars),
        bins=bins,
        dim=dim,
        weights=weights,
        density=density,
    )
2 changes: 2 additions & 0 deletions xarray/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ def safe_cast_to_index(array: Any) -> pd.Index:
index = array
elif hasattr(array, "to_index"):
index = array.to_index()
elif hasattr(array, "to_pandas_index"):
index = array.to_pandas_index()
else:
kwargs = {}
if hasattr(array, "dtype") and array.dtype.kind == "O":
Expand Down
56 changes: 49 additions & 7 deletions xarray/plot/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@
import functools

import numpy as np
import pandas as pd

from .facetgrid import _easy_facetgrid
from .facetgrid import FacetGrid, _easy_facetgrid
from .utils import (
_add_colorbar,
_assert_valid_xy,
Expand Down Expand Up @@ -379,6 +378,9 @@ def step(darray, *args, where="pre", drawstyle=None, ds=None, **kwargs):

def hist(
darray,
row=None,
col=None,
col_wrap=None,
figsize=None,
size=None,
aspect=None,
Expand All @@ -391,19 +393,25 @@ def hist(
yticks=None,
xlim=None,
ylim=None,
bins=None,
weights=None,
density=False,
**kwargs,
):
"""
Histogram of DataArray.

Wraps :py:func:`matplotlib:matplotlib.pyplot.hist`.

Plots *N*-dimensional arrays by first flattening the array.
Plots *N*-dimensional arrays by first flattening the array and calculating
the histogram via :py:func:`DataArray.hist`.

Parameters
----------
darray : DataArray
Can have any number of dimensions.
Can have any number of dimensions, but will be reduced to 1 dimension
by the histogram calculation before plotting (faceted plots will retain
more dimensions).
figsize : tuple, optional
A tuple (width, height) of the figure in inches.
Mutually exclusive with ``size`` and ``ax``.
Expand All @@ -416,16 +424,50 @@ def hist(
ax : matplotlib axes object, optional
Axes on which to plot. By default, use the current axes.
Mutually exclusive with ``size`` and ``figsize``.
bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional
See :py:meth:`DataArray.hist`.
weights : array_like, optional
See :py:meth:`DataArray.hist`.
density : bool, optional
See :py:meth:`DataArray.hist`.
**kwargs : optional
Additional keyword arguments to :py:func:`matplotlib:matplotlib.pyplot.hist`.

"""

# compute the dims to count over
reduce_dims = set(darray.dims) - set([row, col])

# Handle facetgrids first
if row or col:
allargs = locals().copy()
allargs.update(allargs.pop("kwargs"))
allargs.pop("darray")

g = FacetGrid(
data=darray,
col=col,
row=row,
col_wrap=col_wrap,
sharex=False,
sharey=False,
figsize=figsize,
aspect=aspect,
size=size,
subplot_kws=kwargs,
)

return g.map(hist, **kwargs)

ax = get_axis(figsize, size, aspect, ax)

no_nan = np.ravel(darray.values)
no_nan = no_nan[pd.notnull(no_nan)]
h = darray.hist(bins=bins, dim=reduce_dims, weights=weights, density=density)
counts = h.values
bins = h.coords[f"{darray.name}_bins"].values

primitive = ax.hist(no_nan, **kwargs)
# Use the weights kwarg to avoid recomputing histogram in matplotlib
# (see matplotlib.pyplot.hist docstring)
primitive = ax.hist(bins[:-1], weights=counts, **kwargs)

ax.set_title("Histogram")
ax.set_xlabel(label_from_attrs(darray))
Expand Down
3 changes: 3 additions & 0 deletions xarray/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from xarray.coding.cftimeindex import CFTimeIndex
from xarray.core import duck_array_ops, utils
from xarray.core.indexes import PandasIndex
from xarray.core.utils import either_dict_or_kwargs

from . import assert_array_equal, requires_cftime, requires_dask
Expand All @@ -28,11 +29,13 @@ def test_safe_cast_to_index():
dates = pd.date_range("2000-01-01", periods=10)
x = np.arange(5)
td = x * np.timedelta64(1, "D")
midx = pd.MultiIndex.from_tuples([(0,)], names=["a"])
for expected, array in [
(dates, dates.values),
(pd.Index(x, dtype=object), x.astype(object)),
(pd.Index(td), td),
(pd.Index(td, dtype=object), td.astype(object)),
(midx, PandasIndex(midx)),
]:
actual = utils.safe_cast_to_index(array)
assert_array_equal(expected, actual)
Expand Down