Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix unstack #67

Merged
merged 11 commits into from
Sep 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ New features and enhancements
* New functions ``diagnostics.properties_and_measures``, ``diagnostics.measures_heatmap`` and ``diagnostics.measures_improvement``. (:issue:`5`, :pull:`54`)
* Add argument `resample_methods` to `xs.extract.resample`. (:issue:`57`, :pull:`57`)
* Added a ReadTheDocs configuration to expose public documentation. (:issue:`65`, :pull:`66`).
* ``xs.utils.stack_drop_nans``/ ``xs.utils.unstack_fill_nan`` will now format the `to_file`/`coords` string to add the domain and the shape. (:issue:`59`, :pull:`67`)


Breaking changes
^^^^^^^^^^^^^^^^
Expand Down
1 change: 1 addition & 0 deletions xscen/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def compute_deltas(
return deltas


@parse_config
def spatial_mean(
ds: xr.Dataset,
method: str,
Expand Down
2 changes: 2 additions & 0 deletions xscen/diagnostics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from xclim.sdba import measures

from .catalog import DataCatalog
from .config import parse_config
from .indicators import load_xclim_module
from .io import save_to_zarr
from .utils import change_units, maybe_unstack, unstack_fill_nan
Expand Down Expand Up @@ -113,6 +114,7 @@ def _invert_unphysical_temperatures(
# TODO: just measures?


@parse_config
def properties_and_measures(
ds: xr.Dataset,
properties: Union[
Expand Down
4 changes: 3 additions & 1 deletion xscen/ensembles.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
import xarray as xr
from xclim import ensembles

from .catalog import generate_id # ProjectCatalog
from .catalog import generate_id
from .config import parse_config

logger = logging.getLogger(__name__)

__all__ = ["ensemble_stats"]


@parse_config
def ensemble_stats(
datasets: Any,
create_kwargs: dict = None,
Expand Down
2 changes: 2 additions & 0 deletions xscen/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def clean_incomplete(path: Union[str, os.PathLike], complete: Sequence[str]) ->
sh.rmtree(fold)


@parse_config
def save_to_netcdf(
ds: xr.Dataset,
filename: str,
Expand Down Expand Up @@ -346,6 +347,7 @@ def coerce_attrs(attrs):
ds.to_netcdf(filename, **netcdf_kwargs)


@parse_config
def save_to_zarr(
ds: xr.Dataset,
filename: str,
Expand Down
63 changes: 62 additions & 1 deletion xscen/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,17 @@ def stack_drop_nans(
The name of the new stacked dim.
to_file : str, optional
A netCDF filename where to write the stacked coords for use in `unstack_fill_nan`.
If given a string with {shape} and {domain}, the formatting will fill them with
the original shape of the dataset and the global attribute 'cat:domain'.
If None (default), nothing is written to disk.
It is recommended to fill this argument in the config. It will be parsed automatically.
E.g.:

utils:
stack_drop_nans:
to_file: /some_path/coords/coords_{domain}_{shape}.nc
unstack_fill_nan:
coords: /some_path/coords/coords_{domain}_{shape}.nc

Returns
-------
Expand All @@ -106,13 +116,39 @@ def stack_drop_nans(
--------
unstack_fill_nan : The inverse operation.
"""

original_shape = "x".join(map(str, mask.shape))

mask_1d = mask.stack({new_dim: mask.dims})
out = ds.stack({new_dim: mask.dims}).where(mask_1d, drop=True).reset_index(new_dim)
out = (
ds.stack({new_dim: mask.dims})
.where(mask_1d, drop=True)
.reset_index(new_dim, drop=True)
)
for dim in mask.dims:
out[dim].attrs.update(ds[dim].attrs)

if to_file is not None:
# set default path to store the information necessary to unstack
# the name includes the domain and the original shape to uniquely identify the dataset
domain = ds.attrs.get("cat:domain", "unknown")
to_file = to_file.format(domain=domain, shape=original_shape)
if not Path(to_file).parent.exists():
os.mkdir(Path(to_file).parent)
mask.coords.to_dataset().to_netcdf(to_file)

# carry information about original shape to be able to unstack properly
for dim in mask.dims:
out[dim].attrs["original_shape"] = original_shape

# this is needed to fix a bug in xarray '2022.6.0'
out[dim] = xr.DataArray(
out[dim].values,
dims=out[dim].dims,
coords=out[dim].coords,
attrs=out[dim].attrs,
)

return out


Expand All @@ -133,6 +169,18 @@ def unstack_fill_nan(
dimensions, those original dimensions must be listed here.
If a dict : a mapping from the name to the array of the coords to unstack
If a str : a filename to a dataset containing only those coords (as coords).
If given a string with {shape} and {domain}, the formatting will fill them with
the original shape of the dataset (which should have been stored in the
attributes of the stacked dimensions by `stack_drop_nans`) and the global attribute 'cat:domain'.
It is recommended to fill this argument in the config. It will be parsed automatically.
E.g.:

utils:
stack_drop_nans:
to_file: /some_path/coords/coords_{domain}_{shape}.nc
unstack_fill_nan:
coords: /some_path/coords/coords_{domain}_{shape}.nc

If None (default), all coords that have `dim` as their single dimension are used as the
new dimensions/coords in the unstacked output.
Coordinates will be loaded within this function.
Expand All @@ -143,6 +191,9 @@ def unstack_fill_nan(
Same as `ds`, but `dim` has been unstacked to coordinates in `coords`.
Missing elements are filled according to the defaults of `fill_value` of :py:meth:`xarray.Dataset.unstack`.
"""
if coords is None:
logger.info("Dataset unstacked using no coords argument.")

if isinstance(coords, (list, tuple)):
dims, crds = zip(*[(name, ds[name].load().values) for name in coords])
else:
Expand All @@ -162,6 +213,14 @@ def unstack_fill_nan(

if not isinstance(coords, (list, tuple)) and coords is not None:
if isinstance(coords, (str, os.PathLike)):
# find the original shape in the attrs of one of the dimensions
original_shape = "unknown"
for dim in ds.dims:
if "original_shape" in dim.attrs:
original_shape = ds[dim].attrs["original_shape"]
domain = ds.attrs.get("cat:domain", "unknown")
coords = coords.format(domain=domain, shape=original_shape)
logger.info(f"Dataset unstacked using {coords}.")
coords = xr.open_dataset(coords)
out = out.reindex(**coords.coords)

Expand Down Expand Up @@ -193,6 +252,7 @@ def get_cat_attrs(ds: Union[xr.Dataset, dict]):
return {k[4:]: v for k, v in attrs.items() if k.startswith("cat:")}


@parse_config
def maybe_unstack(
ds: xr.Dataset,
coords: str = None,
Expand Down Expand Up @@ -362,6 +422,7 @@ def change_units(ds: xr.Dataset, variables_and_units: dict) -> xr.Dataset:
return ds


@parse_config
def clean_up(
ds: xr.Dataset,
variables_and_units: Optional[dict] = None,
Expand Down