forked from pydata/xarray

Commit

Merge remote-tracking branch 'upstream/master' into fix/upstream-dev-tests

* upstream/master:
  drop_vars; deprecate drop for variables (pydata#3475)
  uamiv test using only raw uamiv variables (pydata#3485)
  Optimize dask array equality checks. (pydata#3453)
dcherian committed Nov 7, 2019
2 parents 8703fe2 + 0e8debf commit 2db94eb
Showing 19 changed files with 516 additions and 260 deletions.
4 changes: 2 additions & 2 deletions doc/data-structures.rst
@@ -393,14 +393,14 @@ methods (like pandas) for transforming datasets into new objects.

 For removing variables, you can select and drop an explicit list of
 variables by indexing with a list of names or using the
-:py:meth:`~xarray.Dataset.drop` methods to return a new ``Dataset``. These
+:py:meth:`~xarray.Dataset.drop_vars` methods to return a new ``Dataset``. These
 operations keep around coordinates:

 .. ipython:: python

     ds[['temperature']]
     ds[['temperature', 'temperature_double']]
-    ds.drop('temperature')
+    ds.drop_vars('temperature')

 To remove a dimension, you can use :py:meth:`~xarray.Dataset.drop_dims` method.
 Any variables using that dimension are dropped:
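As a quick check of the renamed API, here is a minimal sketch (the toy ``temperature``/``pressure`` Dataset is illustrative, not the one from the docs):

    import xarray as xr

    ds = xr.Dataset(
        {"temperature": ("x", [10.0, 12.5]), "pressure": ("x", [1.0, 1.1])},
        coords={"x": [0, 1]},
    )

    ds.drop_vars("temperature")  # new Dataset without 'temperature'; coords are kept
    ds.drop_dims("x")            # drops dimension 'x' and every variable that uses it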
6 changes: 3 additions & 3 deletions doc/indexing.rst
@@ -232,14 +232,14 @@ Using indexing to *assign* values to a subset of dataset (e.g.,
 Dropping labels and dimensions
 ------------------------------

-The :py:meth:`~xarray.Dataset.drop` method returns a new object with the listed
+The :py:meth:`~xarray.Dataset.drop_sel` method returns a new object with the listed
 index labels along a dimension dropped:

 .. ipython:: python

-    ds.drop(space=['IN', 'IL'])
+    ds.drop_sel(space=['IN', 'IL'])

-``drop`` is both a ``Dataset`` and ``DataArray`` method.
+``drop_sel`` is both a ``Dataset`` and ``DataArray`` method.

 Use :py:meth:`~xarray.Dataset.drop_dims` to drop a full dimension from a Dataset.
 Any variables with these dimensions are also dropped:
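For the label-based variant, a minimal sketch (the ``space`` labels mirror the docs' example; the data values are made up):

    import xarray as xr

    ds = xr.Dataset(
        {"precip": ("space", [4.2, 3.1, 5.0])},
        coords={"space": ["IN", "IL", "IA"]},
    )

    ds.drop_sel(space=["IN", "IL"])  # drop entries by index label along 'space'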
10 changes: 10 additions & 0 deletions doc/whats-new.rst
@@ -38,6 +38,12 @@ Breaking changes

 New Features
 ~~~~~~~~~~~~
+- :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` have been added for dropping labels.
+  :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` have been added for
+  dropping variables (including coordinates). The existing ``drop`` methods remain as a
+  backward-compatible option for dropping either labels or variables, but using the more
+  specific methods is encouraged. (:pull:`3475`)
+  By `Maximilian Roos <https://github.com/max-sixty>`_
 - :py:meth:`Dataset.transpose` and :py:meth:`DataArray.transpose` now support an ellipsis (`...`)
   to represent all 'other' dimensions. For example, to move one dimension to the front,
   use `.transpose('x', ...)`. (:pull:`3421`)
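The ellipsis feature in the entry above can be sketched as follows (array shape chosen arbitrarily):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.zeros((2, 3, 4)), dims=("x", "y", "z"))

    da.transpose("z", ...)  # move 'z' to the front; '...' stands for all other dims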
@@ -70,6 +76,9 @@ Bug fixes
   but cloudpickle isn't (:issue:`3401`) by `Rhys Doyle <https://github.com/rdoyle45>`_
 - Fix grouping over variables with NaNs. (:issue:`2383`, :pull:`3406`).
   By `Deepak Cherian <https://github.com/dcherian>`_.
+- Use dask names to compare dask objects prior to comparing values after computation.
+  (:issue:`3068`, :issue:`3311`, :issue:`3454`, :pull:`3453`).
+  By `Deepak Cherian <https://github.com/dcherian>`_.
 - Sync with cftime by removing `dayofwk=-1` for cftime>=1.0.4.
   By `Anderson Banihirwe <https://github.com/andersy005>`_.
 - Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and
@@ -3749,6 +3758,7 @@ Enhancements
   explicitly listed variables or index labels:

   .. ipython:: python
+      :okwarning:

       # drop variables
       ds = xray.Dataset({'x': 0, 'y': 1})
58 changes: 38 additions & 20 deletions xarray/core/concat.py
@@ -2,6 +2,7 @@

 from . import dtypes, utils
 from .alignment import align
+from .duck_array_ops import lazy_array_equiv
 from .merge import _VALID_COMPAT, unique_variable
 from .variable import IndexVariable, Variable, as_variable
 from .variable import concat as concat_vars
@@ -189,26 +190,43 @@ def process_subset_opt(opt, subset):
             # all nonindexes that are not the same in each dataset
             for k in getattr(datasets[0], subset):
                 if k not in concat_over:
-                    # Compare the variable of all datasets vs. the one
-                    # of the first dataset. Perform the minimum amount of
-                    # loads in order to avoid multiple loads from disk
-                    # while keeping the RAM footprint low.
-                    v_lhs = datasets[0].variables[k].load()
-                    # We'll need to know later on if variables are equal.
-                    computed = []
-                    for ds_rhs in datasets[1:]:
-                        v_rhs = ds_rhs.variables[k].compute()
-                        computed.append(v_rhs)
-                        if not getattr(v_lhs, compat)(v_rhs):
-                            concat_over.add(k)
-                            equals[k] = False
-                            # computed variables are not to be re-computed
-                            # again in the future
-                            for ds, v in zip(datasets[1:], computed):
-                                ds.variables[k].data = v.data
+                    equals[k] = None
+                    variables = [ds.variables[k] for ds in datasets]
+                    # first check without comparing values i.e. no computes
+                    for var in variables[1:]:
+                        equals[k] = getattr(variables[0], compat)(
+                            var, equiv=lazy_array_equiv
+                        )
+                        if equals[k] is not True:
+                            # exit early if we know these are not equal or that
+                            # equality cannot be determined i.e. one or all of
+                            # the variables wraps a numpy array
+                            break
+                    else:
+                        equals[k] = True
+
+                    if equals[k] is False:
+                        concat_over.add(k)
+
+                    elif equals[k] is None:
+                        # Compare the variable of all datasets vs. the one
+                        # of the first dataset. Perform the minimum amount of
+                        # loads in order to avoid multiple loads from disk
+                        # while keeping the RAM footprint low.
+                        v_lhs = datasets[0].variables[k].load()
+                        # We'll need to know later on if variables are equal.
+                        computed = []
+                        for ds_rhs in datasets[1:]:
+                            v_rhs = ds_rhs.variables[k].compute()
+                            computed.append(v_rhs)
+                            if not getattr(v_lhs, compat)(v_rhs):
+                                concat_over.add(k)
+                                equals[k] = False
+                                # computed variables are not to be re-computed
+                                # again in the future
+                                for ds, v in zip(datasets[1:], computed):
+                                    ds.variables[k].data = v.data
+                                break
+                        else:
+                            equals[k] = True

         elif opt == "all":
             concat_over.update(
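The key idea behind ``lazy_array_equiv`` is that two dask arrays backed by the same graph (same ``name``) must hold equal values, so ``concat`` can often decide equality without computing anything, and only falls back to the eager comparison above when the lazy check returns ``None``. A rough standalone sketch of that tri-state check (not xarray's exact implementation):

    import dask.array as da

    def lazy_equiv_sketch(a, b):
        # True/False when decidable without computing; None when values must be compared.
        if a is b:
            return True
        if a.shape != b.shape:
            return False
        if isinstance(a, da.Array) and isinstance(b, da.Array) and a.name == b.name:
            return True  # identical task graphs => identical values, no compute needed
        return None  # undecidable lazily, e.g. when a plain numpy array is involved

    x = da.ones((4, 4), chunks=2)
    print(lazy_equiv_sketch(x, x))      # True, decided from graph names alone
    print(lazy_equiv_sketch(x, x + 0))  # None -> caller must compute and compare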
@@ -370,7 +388,7 @@ def ensure_common_dims(vars):
     result = result.set_coords(coord_names)
     result.encoding = result_encoding

-    result = result.drop(unlabeled_dims, errors="ignore")
+    result = result.drop_vars(unlabeled_dims, errors="ignore")

     if coord is not None:
         # add concat dimension last to ensure that its in the final Dataset
78 changes: 54 additions & 24 deletions xarray/core/dataarray.py
@@ -16,7 +16,6 @@
     TypeVar,
     Union,
     cast,
-    overload,
 )

 import numpy as np
@@ -54,7 +53,7 @@
 from .indexes import Indexes, default_indexes
 from .merge import PANDAS_TYPES
 from .options import OPTIONS
-from .utils import Default, ReprObject, _default, _check_inplace, either_dict_or_kwargs
+from .utils import Default, ReprObject, _check_inplace, _default, either_dict_or_kwargs
 from .variable import (
     IndexVariable,
     Variable,
@@ -250,7 +249,7 @@ class DataArray(AbstractArray, DataWithCoords):
         Dictionary for holding arbitrary metadata.
     """

-    _accessors: Optional[Dict[str, Any]]
+    _accessors: Optional[Dict[str, Any]]  # noqa
     _coords: Dict[Any, Variable]
     _indexes: Optional[Dict[Hashable, pd.Index]]
     _name: Optional[Hashable]
@@ -1891,41 +1890,72 @@ def transpose(self, *dims: Hashable, transpose_coords: bool = None) -> "DataArray":
     def T(self) -> "DataArray":
         return self.transpose()

-    # Drop coords
-    @overload
-    def drop(
-        self, labels: Union[Hashable, Iterable[Hashable]], *, errors: str = "raise"
+    def drop_vars(
+        self, names: Union[Hashable, Iterable[Hashable]], *, errors: str = "raise"
     ) -> "DataArray":
-        ...
+        """Drop variables from this DataArray.
+
+        Parameters
+        ----------
+        names : hashable or iterable of hashables
+            Name(s) of variables to drop.
+        errors: {'raise', 'ignore'}, optional
+            If 'raise' (default), raises a ValueError error if any of the variable
+            passed are not in the dataset. If 'ignore', any given names that are in the
+            DataArray are dropped and no error is raised.
+
+        Returns
+        -------
+        dropped : Dataset
+        """
+        ds = self._to_temp_dataset().drop_vars(names, errors=errors)
+        return self._from_temp_dataset(ds)

-    # Drop index labels along dimension
-    @overload  # noqa: F811
     def drop(
-        self, labels: Any, dim: Hashable, *, errors: str = "raise"  # array-like
+        self,
+        labels: Mapping = None,
+        dim: Hashable = None,
+        *,
+        errors: str = "raise",
+        **labels_kwargs,
     ) -> "DataArray":
-        ...
-
-    def drop(self, labels, dim=None, *, errors="raise"):  # noqa: F811
-        """Drop coordinates or index labels from this DataArray.
+        """Backward compatible method based on `drop_vars` and `drop_sel`
+
+        Using either `drop_vars` or `drop_sel` is encouraged
+        """
         ds = self._to_temp_dataset().drop(labels, dim, errors=errors)
         return self._from_temp_dataset(ds)

+    def drop_sel(
+        self,
+        labels: Mapping[Hashable, Any] = None,
+        *,
+        errors: str = "raise",
+        **labels_kwargs,
+    ) -> "DataArray":
+        """Drop index labels from this DataArray.
+
         Parameters
         ----------
-        labels : hashable or sequence of hashables
-            Name(s) of coordinates or index labels to drop.
-            If dim is not None, labels can be any array-like.
-        dim : hashable, optional
-            Dimension along which to drop index labels. By default (if
-            ``dim is None``), drops coordinates rather than index labels.
+        labels : Mapping[Hashable, Any]
+            Index labels to drop
         errors: {'raise', 'ignore'}, optional
             If 'raise' (default), raises a ValueError error if
-            any of the coordinates or index labels passed are not
-            in the array. If 'ignore', any given labels that are in the
-            array are dropped and no error is raised.
+            any of the index labels passed are not
+            in the dataset. If 'ignore', any given labels that are in the
+            dataset are dropped and no error is raised.
+        **labels_kwargs : {dim: label, ...}, optional
+            The keyword arguments form of ``dim`` and ``labels``

         Returns
         -------
         dropped : DataArray
         """
-        ds = self._to_temp_dataset().drop(labels, dim, errors=errors)
+        if labels_kwargs or isinstance(labels, dict):
+            labels = either_dict_or_kwargs(labels, labels_kwargs, "drop")
+
+        ds = self._to_temp_dataset().drop_sel(labels, errors=errors)
         return self._from_temp_dataset(ds)

     def dropna(
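Putting the new split to work on a DataArray (a hedged sketch; the coordinates and values are illustrative):

    import xarray as xr

    da = xr.DataArray(
        [[1, 2, 3], [4, 5, 6]],
        dims=("x", "y"),
        coords={"x": [10, 20], "y": ["a", "b", "c"]},
    )

    da.drop_vars("x")          # drop the 'x' coordinate variable
    da.drop_sel(y=["a", "c"])  # drop entries by index label along 'y'
    da.drop("x")               # backward-compatible, delegates to the methods above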
