From e68c168a3278b2e8cd32c80853d8f064b259be26 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Mon, 15 May 2023 22:08:08 +0200 Subject: [PATCH 1/4] Improve to_dask_dataframe performance --- xarray/core/dataset.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2336883d0b7..442a359f6f7 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6403,7 +6403,14 @@ def to_dask_dataframe( columns.extend(k for k in self.coords if k not in self.dims) columns.extend(self.data_vars) + has_many_dims = len(ordered_dims) > 1 + if has_many_dims: + ds_chunks = self.chunks + else: + ds_chunks = {} + series_list = [] + df_meta = pd.DataFrame() for name in columns: try: var = self.variables[name] @@ -6422,8 +6429,13 @@ def to_dask_dataframe( if not is_duck_dask_array(var._data): var = var.chunk() - dask_array = var.set_dims(ordered_dims).chunk(self.chunks).data - series = dd.from_array(dask_array.reshape(-1), columns=[name]) + if has_many_dims: + # Broadcast then flatten the array: + var_new_dims = var.set_dims(ordered_dims).chunk(ds_chunks) + dask_array = var_new_dims._data.reshape(-1) + else: + dask_array = var._data + series = dd.from_dask_array(dask_array, columns=name, meta=df_meta) series_list.append(series) df = dd.concat(series_list, axis=1) From fa781de1d8949d9a86dcd14e2721f57f424f8034 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 20 May 2023 21:43:07 +0200 Subject: [PATCH 2/4] Add ASV test --- asv_bench/benchmarks/pandas.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py index 2a296ecc4d0..f1e725997d6 100644 --- a/asv_bench/benchmarks/pandas.py +++ b/asv_bench/benchmarks/pandas.py @@ -29,6 +29,7 @@ def time_from_series(self, dtype, subset): class ToDataFrame: def setup(self, *args, **kwargs): xp = kwargs.get("xp", np) + nvars = kwargs.get("nvars", 1) random_kws = kwargs.get("random_kws", {}) method = kwargs.get("method", "to_dataframe") @@ -36,11 +37,12 @@ def setup(self, *args, **kwargs): dim2 = 10_000 ds = xr.Dataset( { - "x": xr.DataArray( + f"x_{i}": xr.DataArray( data=xp.random.random((dim1, dim2), **random_kws), dims=["dim1", "dim2"], coords={"dim1": np.arange(0, dim1), "dim2": np.arange(0, dim2)}, ) + for i in range(nvars) } ) self.to_frame = getattr(ds, method) @@ -58,4 +60,6 @@ def setup(self, *args, **kwargs): import dask.array as da - super().setup(xp=da, random_kws=dict(chunks=5000), method="to_dask_dataframe") + super().setup( + xp=da, random_kws=dict(chunks=5000), method="to_dask_dataframe", nvars=500 + ) From 62ad6b4377fbf1da33a4296d6e13c6c5dfb7524f Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sun, 21 May 2023 09:46:12 +0200 Subject: [PATCH 3/4] Update pandas.py --- asv_bench/benchmarks/pandas.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py index f1e725997d6..9bda5970a4c 100644 --- a/asv_bench/benchmarks/pandas.py +++ b/asv_bench/benchmarks/pandas.py @@ -35,15 +35,14 @@ def setup(self, *args, **kwargs): dim1 = 10_000 dim2 = 10_000 + + var = xr.Variable( + dims=("dim1", "dim2"), data=xp.random.random((dim1, dim2), **random_kws) + ) + data_vars = {f"long_name_{v}": (("dim1", "dim2"), var) for v in range(nvars)} + ds = xr.Dataset( - { - f"x_{i}": xr.DataArray( - data=xp.random.random((dim1, dim2), **random_kws), - dims=["dim1", "dim2"], - coords={"dim1": np.arange(0, dim1), "dim2": np.arange(0, dim2)}, - ) - for i in range(nvars) - } + data_vars, coords={"dim1": np.arange(0, dim1), "dim2": np.arange(0, dim2)} ) self.to_frame = getattr(ds, method) From 8162a9ae8b0b9b295b85ac97f1991b998335874b Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sun, 21 May 2023 10:37:44 +0200 Subject: [PATCH 4/4] Update dataset.py --- xarray/core/dataset.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8acc59aaed1..433c724cc21 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6465,11 +6465,7 @@ def to_dask_dataframe( columns.extend(k for k in self.coords if k not in self.dims) columns.extend(self.data_vars) - has_many_dims = len(ordered_dims) > 1 - if has_many_dims: - ds_chunks = self.chunks - else: - ds_chunks = {} + ds_chunks = self.chunks series_list = [] df_meta = pd.DataFrame() @@ -6491,12 +6487,10 @@ def to_dask_dataframe( if not is_duck_dask_array(var._data): var = var.chunk() - if has_many_dims: - # Broadcast then flatten the array: - var_new_dims = var.set_dims(ordered_dims).chunk(ds_chunks) - dask_array = var_new_dims._data.reshape(-1) - else: - dask_array = var._data + # Broadcast then flatten the array: + var_new_dims = var.set_dims(ordered_dims).chunk(ds_chunks) + dask_array = var_new_dims._data.reshape(-1) + series = dd.from_dask_array(dask_array, columns=name, meta=df_meta) series_list.append(series)