diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b957ed4bab2..fb96ed6293c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -44,6 +44,9 @@ Bug fixes :py:class:`CFTimeIndex` and upcoming pandas version 1.3.0 (:issue:`5356`, :pull:`5359`). By `Spencer Clark <https://github.com/spencerkclark>`_. +- Fix 1-level multi-index incorrectly converted to single index (:issue:`5384`, + :pull:`5385`). + By `Benoit Bovy <https://github.com/benbovy>`_. Documentation diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 07b7bca27c5..fa75ba12d41 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1724,3 +1724,79 @@ def _calc_idxminmax( res.attrs = indx.attrs return res + + +def hist(*datarrays, bins=None, dim=None, weights=None, density=False): + """ + Histogram applied along specified dimensions. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + Parameters + ---------- + datarrays : xarray.DataArray objects + Input data. The number of input arguments determines the dimensionality of + the histogram. For example, two arguments produce a 2D histogram. + dim : tuple of strings, optional + Dimensions over which the histogram is computed. The default is to + compute the histogram of the flattened array, i.e. over all dimensions. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + If a list, there should be one entry for each item in ``args``. + The bin specification: + + * If int, the number of bins for all arguments in ``args``. + * If array_like, the bin edges for all arguments in ``args``. + * If a list of ints, the number of bins for every argument in ``args``. + * If a list of arrays, the bin edges for each argument in ``args`` + (required format for Dask inputs). + * A combination [int, array] or [array, int], where int + is the number of bins and array is the bin edges. 
+ * If a list of DataArrays, the bins for each argument in ``args``. + The DataArrays can be multidimensional, but must not have any + dimensions shared with the `dim` argument. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly as a list of arrays. + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. NaNs in the weights input will fill the entire bin with + NaNs. If there are NaNs in the weights input, call ``.fillna(0.)`` + before running ``hist()``. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + + Returns + ------- + hist : xarray.DataArray + A single dataarray which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + dataarray supplied, named as `var_bins`, which contains the positions + of the centres of each bin. 
+ + Examples + -------- + + See Also + -------- + DataArray.hist + Dataset.hist + numpy.histogramdd + dask.array.blockwise + """ + + # TODO xhistogram code goes here + raise NotImplementedError diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 2ab60b894e1..a4b690293ef 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4599,6 +4599,80 @@ def drop_duplicates( indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)} return self.isel(indexes) + def hist(self, bins=None, dim=None, weights=None, density=False): + """ + Histogram applied along specified dimensions. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + datarrays : xarray.DataArray objects + Input data. The number of input arguments determines the dimensionality of + the histogram. For example, two arguments produce a 2D histogram. + dim : tuple of strings, optional + Dimensions over which the histogram is computed. The default is to + compute the histogram of the flattened array, i.e. over all dimensions. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + If a list, there should be one entry for each item in ``args``. + The bin specification: + + * If int, the number of bins for all arguments in ``args``. + * If array_like, the bin edges for all arguments in ``args``. + * If a list of ints, the number of bins for every argument in ``args``. + * If a list of arrays, the bin edges for each argument in ``args`` + (required format for Dask inputs). + * A combination [int, array] or [array, int], where int + is the number of bins and array is the bin edges. + * If a list of DataArrays, the bins for each argument in ``args``. + The DataArrays can be multidimensional, but must not have any + dimensions shared with the `dim` argument. 
+ + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly as a list of arrays. + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. NaNs in the weights input will fill the entire bin with + NaNs. If there are NaNs in the weights input, call ``.fillna(0.)`` + before running ``hist()``. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + + Returns + ------- + hist : xarray.DataArray + A single dataarray which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + dataarray supplied, named as `var_bins`, which contains the positions + of the centres of each bin. 
+ + Examples + -------- + + See Also + -------- + xarray.hist + Dataset.hist + numpy.histogramdd + dask.array.blockwise + """ + + return computation.hist( + datarrays=self, dim=dim, bins=bins, weights=weights, density=density + ) + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = utils.UncachedAccessor(StringAccessor) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1697a7c67aa..0d57b465a5d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7627,3 +7627,84 @@ def _wrapper(Y, *coords_, **kwargs): result.attrs = self.attrs.copy() return result + + def hist(self, vars=None, bins=None, dim=None, weights=None, density=False): + """ + Histogram applied along specified dimensions. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + vars : list of str + Variables on the Dataset to use as input data. The number of variables + determines the dimensionality of the histogram. For example, two + arguments produce a 2D histogram. + dim : tuple of strings, optional + Dimensions over which the histogram is computed. The default is to + compute the histogram of the flattened array, i.e. over all dimensions. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + If a list, there should be one entry for each item in ``args``. + The bin specification: + + * If int, the number of bins for all arguments in ``args``. + * If array_like, the bin edges for all arguments in ``args``. + * If a list of ints, the number of bins for every argument in ``args``. + * If a list of arrays, the bin edges for each argument in ``args`` + (required format for Dask inputs). + * A combination [int, array] or [array, int], where int + is the number of bins and array is the bin edges. 
+ * If a list of DataArrays, the bins for each argument in ``args``. + The DataArrays can be multidimensional, but must not have any + dimensions shared with the `dim` argument. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly as a list of arrays. + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. NaNs in the weights input will fill the entire bin with + NaNs. If there are NaNs in the weights input, call ``.fillna(0.)`` + before running ``hist()``. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + + Returns + ------- + hist : xarray.DataArray + A single dataarray which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + dataarray supplied, named as `var_bins`, which contains the positions + of the centres of each bin. 
+ + Examples + -------- + + See Also + -------- + xarray.hist + DataArray.hist + numpy.histogramdd + dask.array.blockwise + """ + + from .computation import hist + + return hist( + datarrays=[self[var] for var in vars], + dim=dim, + bins=bins, + weights=weights, + density=density, + ) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index aa51366f588..3a9f418063d 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -107,6 +107,8 @@ def safe_cast_to_index(array: Any) -> pd.Index: index = array elif hasattr(array, "to_index"): index = array.to_index() + elif hasattr(array, "to_pandas_index"): + index = array.to_pandas_index() else: kwargs = {} if hasattr(array, "dtype") and array.dtype.kind == "O": diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index 8b5bcbedb7d..483fa723058 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -7,6 +7,7 @@ from xarray.coding.cftimeindex import CFTimeIndex from xarray.core import duck_array_ops, utils +from xarray.core.indexes import PandasIndex from xarray.core.utils import either_dict_or_kwargs from . import assert_array_equal, requires_cftime, requires_dask @@ -28,11 +29,13 @@ def test_safe_cast_to_index(): dates = pd.date_range("2000-01-01", periods=10) x = np.arange(5) td = x * np.timedelta64(1, "D") + midx = pd.MultiIndex.from_tuples([(0,)], names=["a"]) for expected, array in [ (dates, dates.values), (pd.Index(x, dtype=object), x.astype(object)), (pd.Index(td), td), (pd.Index(td, dtype=object), td.astype(object)), + (midx, PandasIndex(midx)), ]: actual = utils.safe_cast_to_index(array) assert_array_equal(expected, actual)