Avoid accessing slow .data in unstack (#5906)

pydata · Oct 29, 2021 · b2ed62e · b2ed62e
1 parent 3bfa8c0
commit b2ed62e
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 27 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -90,6 +90,7 @@ Bug fixes
   By `Mike Taves <https://github.com/mwtoews>`_.
 - ``open_mfdataset()`` now accepts a single ``pathlib.Path`` object (:issue: `5881`).
   By `Panos Mavrogiorgos <https://github.com/pmav99>`_.
+- Improved performance of :py:meth:`Dataset.unstack` (:pull:`5906`). By `Tom Augspurger <https://github.com/TomAugspurger>`_.
 
 Documentation
 ~~~~~~~~~~~~~

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -4153,34 +4153,34 @@ def unstack(
                 )
 
         result = self.copy(deep=False)
-        for dim in dims:
 
-            if (
-                # Dask arrays don't support assignment by index, which the fast unstack
-                # function requires.
-                # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
-                any(is_duck_dask_array(v.data) for v in self.variables.values())
-                # Sparse doesn't currently support (though we could special-case
-                # it)
-                # https://github.com/pydata/sparse/issues/422
-                or any(
-                    isinstance(v.data, sparse_array_type)
-                    for v in self.variables.values()
-                )
-                or sparse
-                # Until https://github.com/pydata/xarray/pull/4751 is resolved,
-                # we check explicitly whether it's a numpy array. Once that is
-                # resolved, explicitly exclude pint arrays.
-                # # pint doesn't implement `np.full_like` in a way that's
-                # # currently compatible.
-                # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
-                # # or any(
-                # #     isinstance(v.data, pint_array_type) for v in self.variables.values()
-                # # )
-                or any(
-                    not isinstance(v.data, np.ndarray) for v in self.variables.values()
-                )
-            ):
+        # we want to avoid allocating an object-dtype ndarray for a MultiIndex,
+        # so we can't just access self.variables[v].data for every variable.
+        # We only check the non-index variables.
+        # https://github.com/pydata/xarray/issues/5902
+        nonindexes = [
+            self.variables[k] for k in set(self.variables) - set(self.xindexes)
+        ]
+        # Notes for each of these cases:
+        # 1. Dask arrays don't support assignment by index, which the fast unstack
+        #    function requires.
+        #    https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
+        # 2. Sparse doesn't currently support (though we could special-case it)
+        #    https://github.com/pydata/sparse/issues/422
+        # 3. pint requires checking if it's a NumPy array until
+        #    https://github.com/pydata/xarray/pull/4751 is resolved,
+        #    Once that is resolved, explicitly exclude pint arrays.
+        #    pint doesn't implement `np.full_like` in a way that's
+        #    currently compatible.
+        needs_full_reindex = sparse or any(
+            is_duck_dask_array(v.data)
+            or isinstance(v.data, sparse_array_type)
+            or not isinstance(v.data, np.ndarray)
+            for v in nonindexes
+        )
+
+        for dim in dims:
+            if needs_full_reindex:
                 result = result._unstack_full_reindex(dim, fill_value, sparse)
             else:
                 result = result._unstack_once(dim, fill_value)