Skip to content

Commit

Permalink
Implement more Variable Coders (#7719)
Browse files Browse the repository at this point in the history
* implement coders, adapt tests

* Apply suggestions from code review

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

* add whats-new.rst entry

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix whats-new.rst entry

* add PR link to whats-new.rst entry

* return early if no missing values defined

* fix check

---------

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Apr 7, 2023
1 parent f8127fc commit 551de70
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 139 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ Internal Changes
- Remove internal support for reading GRIB files through the ``cfgrib`` backend. ``cfgrib`` now uses the external
backend interface, so no existing code should break.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Implement CF coding functions in ``VariableCoders`` (:pull:`7719`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

- Added a config.yml file with messages for the welcome bot when a Github user creates their first ever issue or pull request or has their first PR merged. (:issue:`7685`, :pull:`7685`)
By `Nishtha P <https://github.com/nishthap981>`_.
Expand Down
179 changes: 171 additions & 8 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,71 @@ def __repr__(self) -> str:
)


class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Decode arrays on the fly from non-native to native endianness.

    This is useful for decoding arrays from netCDF3 files (which are all
    big endian) into native endianness, so they can be used with Cython
    functions, such as those found in bottleneck and pandas.

    The wrapped array is only converted when it is indexed; the reported
    ``dtype`` is rebuilt from the wrapped dtype's ``kind`` and ``itemsize``,
    which drops the explicit byte-order marker.

    >>> x = np.arange(5, dtype=">i2")

    >>> x.dtype
    dtype('>i2')

    >>> NativeEndiannessArray(x).dtype
    dtype('int16')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> NativeEndiannessArray(x)[indexer].dtype
    dtype('int16')
    """

    __slots__ = ("array",)

    def __init__(self, array) -> None:
        # Store the input behind the indexing adapter returned by
        # ``indexing.as_indexable``.
        wrapped = indexing.as_indexable(array)
        self.array = wrapped

    @property
    def dtype(self) -> np.dtype:
        # Build e.g. "i2" from kind "i" and itemsize 2; without a byte-order
        # prefix numpy interprets the string as a native-endian dtype.
        source = self.array.dtype
        return np.dtype(f"{source.kind}{source.itemsize}")

    def __getitem__(self, key) -> np.ndarray:
        # Index first, then convert only the selected values.
        selected = self.array[key]
        return np.asarray(selected, dtype=self.dtype)


class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Decode arrays on the fly from integer to boolean datatype.

    This is useful for decoding boolean arrays from integer typed netCDF
    variables. The conversion to ``bool`` happens only when the wrapper is
    indexed.

    >>> x = np.array([1, 0, 1, 1, 0], dtype="i1")

    >>> x.dtype
    dtype('int8')

    >>> BoolTypeArray(x).dtype
    dtype('bool')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> BoolTypeArray(x)[indexer].dtype
    dtype('bool')
    """

    __slots__ = ("array",)

    def __init__(self, array) -> None:
        # Store the input behind the indexing adapter returned by
        # ``indexing.as_indexable``.
        wrapped = indexing.as_indexable(array)
        self.array = wrapped

    @property
    def dtype(self) -> np.dtype:
        # Always boolean, regardless of the wrapped array's dtype.
        return np.dtype("bool")

    def __getitem__(self, key) -> np.ndarray:
        # Index first, then cast only the selected values to bool.
        selected = self.array[key]
        return np.asarray(selected, dtype=self.dtype)


def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike):
"""Lazily apply an element-wise function to an array.
Parameters
Expand Down Expand Up @@ -159,27 +224,29 @@ def encode(self, variable: Variable, name: T_Name = None):
fv = encoding.get("_FillValue")
mv = encoding.get("missing_value")

if (
fv is not None
and mv is not None
and not duck_array_ops.allclose_or_equiv(fv, mv)
):
fv_exists = fv is not None
mv_exists = mv is not None

if not fv_exists and not mv_exists:
return variable

if fv_exists and mv_exists and not duck_array_ops.allclose_or_equiv(fv, mv):
raise ValueError(
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

if fv is not None:
if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
data = duck_array_ops.fillna(data, fill_value)

if mv is not None:
if mv_exists:
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and fv is None:
if not pd.isnull(fill_value) and not fv_exists:
data = duck_array_ops.fillna(data, fill_value)

return Variable(dims, data, attrs, encoding, fastpath=True)
Expand Down Expand Up @@ -349,3 +416,99 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable


class DefaultFillvalueCoder(VariableCoder):
    """Encode a default ``_FillValue`` (NaN) for floating point variables."""

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Add a NaN ``_FillValue`` attribute when none is defined.

        Parameters
        ----------
        variable : Variable
            Variable to encode.
        name : optional
            Variable name; not used by this coder.

        Returns
        -------
        Variable
            A new variable carrying ``_FillValue`` in its attributes if the
            dtype is floating point and neither ``attrs`` nor ``encoding``
            already define ``_FillValue``; otherwise ``variable`` itself.
        """
        dims, data, attrs, encoding = unpack_for_encoding(variable)
        already_defined = "_FillValue" in attrs or "_FillValue" in encoding
        if already_defined or not np.issubdtype(variable.dtype, np.floating):
            return variable

        # make NaN the fill value for float types, using the variable's own
        # scalar type
        attrs["_FillValue"] = variable.dtype.type(np.nan)
        return Variable(dims, data, attrs, encoding, fastpath=True)

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Decoding is not implemented for this coder."""
        raise NotImplementedError()


class BooleanCoder(VariableCoder):
    """Code boolean values as ``i1`` integers tagged with ``dtype="bool"``."""

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Store boolean data as ``i1`` and record ``dtype="bool"`` in attrs.

        The variable is returned unchanged unless its dtype is ``bool`` and
        neither its ``encoding`` nor its ``attrs`` already contain a
        ``"dtype"`` entry.
        """
        needs_encoding = (
            variable.dtype == bool
            and "dtype" not in variable.encoding
            and "dtype" not in variable.attrs
        )
        if not needs_encoding:
            return variable

        dims, data, attrs, encoding = unpack_for_encoding(variable)
        # The attribute is what ``decode`` looks for to restore booleans.
        attrs["dtype"] = "bool"
        data = duck_array_ops.astype(data, dtype="i1", copy=True)
        return Variable(dims, data, attrs, encoding, fastpath=True)

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Wrap data in ``BoolTypeArray`` when attrs carry ``dtype="bool"``.

        The ``"dtype"`` attribute is removed from the decoded variable's
        attributes. Variables without that marker are returned unchanged.
        """
        marked_as_bool = variable.attrs.get("dtype", False) == "bool"
        if not marked_as_bool:
            return variable

        dims, data, attrs, encoding = unpack_for_decoding(variable)
        del attrs["dtype"]
        data = BoolTypeArray(data)
        return Variable(dims, data, attrs, encoding, fastpath=True)


class EndianCoder(VariableCoder):
    """Decode Endianness to native."""

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Encoding is not implemented for this coder.

        The signature mirrors the other coders' ``encode(variable, name=...)``
        so that a generic ``coder.encode(var, name=name)`` call reaches the
        ``NotImplementedError`` below instead of failing with a ``TypeError``
        about unexpected arguments.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Wrap non-native-endian data in ``NativeEndiannessArray``.

        Parameters
        ----------
        variable : Variable
            Variable to decode.
        name : optional
            Variable name; not used by this coder.

        Returns
        -------
        Variable
            A new variable whose data is wrapped in ``NativeEndiannessArray``
            when ``data.dtype.isnative`` is false; otherwise ``variable``
            itself.
        """
        dims, data, attrs, encoding = unpack_for_decoding(variable)
        if not data.dtype.isnative:
            data = NativeEndiannessArray(data)
            return Variable(dims, data, attrs, encoding, fastpath=True)
        else:
            return variable


class NonStringCoder(VariableCoder):
    """Encode NonString variables if dtypes differ."""

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Cast the data to the dtype requested in ``encoding["dtype"]``.

        Parameters
        ----------
        variable : Variable
            Variable to encode.
        name : optional
            Variable name, used only in the warning message.

        Returns
        -------
        Variable
            ``variable`` itself when ``encoding`` has no ``"dtype"`` entry or
            that entry is ``"S1"`` or ``str``. Otherwise a new variable with
            ``"dtype"`` popped from its encoding and, if the requested dtype
            differs from the current one, the data cast to it (rounded first
            when the target is an integer dtype).

        Warns
        -----
        SerializationWarning
            When floating point data is cast to an integer dtype and neither
            ``_FillValue`` nor ``missing_value`` is present in the attrs.
        """
        if "dtype" in variable.encoding and variable.encoding["dtype"] not in (
            "S1",
            str,
        ):
            dims, data, attrs, encoding = unpack_for_encoding(variable)
            dtype = np.dtype(encoding.pop("dtype"))
            if dtype != variable.dtype:
                if np.issubdtype(dtype, np.integer):
                    if (
                        np.issubdtype(variable.dtype, np.floating)
                        and "_FillValue" not in variable.attrs
                        and "missing_value" not in variable.attrs
                    ):
                        warnings.warn(
                            f"saving variable {name} with floating "
                            "point data as an integer dtype without "
                            "any _FillValue to use for NaNs",
                            SerializationWarning,
                            stacklevel=10,
                        )
                    # Round before the integer cast so values are not simply
                    # truncated by ``astype``.
                    data = np.around(data)
                data = data.astype(dtype=dtype)
            return Variable(dims, data, attrs, encoding, fastpath=True)
        else:
            return variable

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Decoding is not implemented for this coder.

        The signature mirrors the other coders' ``decode(variable, name=...)``
        so that a generic ``coder.decode(var, name=name)`` call reaches the
        ``NotImplementedError`` below instead of failing with a ``TypeError``
        about unexpected arguments.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()
139 changes: 11 additions & 128 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from xarray.coding import strings, times, variables
from xarray.coding.variables import SerializationWarning, pop_to
from xarray.core import duck_array_ops, indexing
from xarray.core import indexing
from xarray.core.common import (
_contains_datetime_like_objects,
contains_cftime_datetimes,
Expand Down Expand Up @@ -48,123 +48,10 @@
T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore]


class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Decode arrays on the fly from non-native to native endianness.

    This is useful for decoding arrays from netCDF3 files (which are all
    big endian) into native endianness, so they can be used with Cython
    functions, such as those found in bottleneck and pandas.

    >>> x = np.arange(5, dtype=">i2")

    >>> x.dtype
    dtype('>i2')

    >>> NativeEndiannessArray(x).dtype
    dtype('int16')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> NativeEndiannessArray(x)[indexer].dtype
    dtype('int16')
    """

    __slots__ = ("array",)

    def __init__(self, array):
        indexable = indexing.as_indexable(array)
        self.array = indexable

    @property
    def dtype(self):
        # kind + itemsize (e.g. "i2") carries no byte-order marker, so numpy
        # reads it as a native-endian dtype.
        wrapped_dtype = self.array.dtype
        spec = wrapped_dtype.kind + str(wrapped_dtype.itemsize)
        return np.dtype(spec)

    def __getitem__(self, key):
        # Conversion is applied only to the values selected by ``key``.
        values = self.array[key]
        return np.asarray(values, dtype=self.dtype)


class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Decode arrays on the fly from integer to boolean datatype.

    This is useful for decoding boolean arrays from integer typed netCDF
    variables.

    >>> x = np.array([1, 0, 1, 1, 0], dtype="i1")

    >>> x.dtype
    dtype('int8')

    >>> BoolTypeArray(x).dtype
    dtype('bool')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> BoolTypeArray(x)[indexer].dtype
    dtype('bool')
    """

    __slots__ = ("array",)

    def __init__(self, array):
        indexable = indexing.as_indexable(array)
        self.array = indexable

    @property
    def dtype(self):
        # Reported dtype is always boolean.
        return np.dtype("bool")

    def __getitem__(self, key):
        # Cast only the values selected by ``key``.
        values = self.array[key]
        return np.asarray(values, dtype=self.dtype)


def _var_as_tuple(var: Variable) -> T_VarTuple:
    """Return ``(dims, data, attrs, encoding)`` for ``var``.

    ``attrs`` and ``encoding`` are returned as copies made with ``.copy()``,
    so modifying the returned mappings does not modify ``var``'s own.
    """
    dims, data = var.dims, var.data
    return dims, data, var.attrs.copy(), var.encoding.copy()


def maybe_encode_nonstring_dtype(var: Variable, name: T_Name = None) -> Variable:
    """Cast ``var`` to the dtype requested in ``var.encoding["dtype"]``.

    ``var`` is returned untouched when the encoding has no ``"dtype"`` entry
    or the entry is ``"S1"`` or ``str``. Otherwise a new variable is built
    with ``"dtype"`` popped from its encoding; if the requested dtype differs
    from the current one the data is cast, being rounded first when the
    target is an integer dtype. A ``SerializationWarning`` is issued when
    floating point data is cast to an integer dtype and the attrs hold
    neither ``_FillValue`` nor ``missing_value``.
    """
    if "dtype" not in var.encoding or var.encoding["dtype"] in ("S1", str):
        return var

    dims, data, attrs, encoding = _var_as_tuple(var)
    target = np.dtype(encoding.pop("dtype"))
    if target != var.dtype:
        if np.issubdtype(target, np.integer):
            float_without_fill = np.issubdtype(var.dtype, np.floating) and not any(
                key in var.attrs for key in ("_FillValue", "missing_value")
            )
            if float_without_fill:
                warnings.warn(
                    f"saving variable {name} with floating "
                    "point data as an integer dtype without "
                    "any _FillValue to use for NaNs",
                    SerializationWarning,
                    stacklevel=10,
                )
            # Round before casting so the integer cast does not truncate.
            data = np.around(data)
        data = data.astype(dtype=target)
    return Variable(dims, data, attrs, encoding, fastpath=True)


def maybe_default_fill_value(var: Variable) -> Variable:
    """Set a NaN ``_FillValue`` attribute on floating point variables.

    Nothing happens when ``_FillValue`` is already present in ``var.attrs``
    or ``var.encoding``, or when the dtype is not floating point. Note that
    ``var.attrs`` is modified in place and the same ``var`` object is
    returned.
    """
    if "_FillValue" in var.attrs or "_FillValue" in var.encoding:
        return var
    if np.issubdtype(var.dtype, np.floating):
        # make NaN the fill value for float types
        var.attrs["_FillValue"] = var.dtype.type(np.nan)
    return var


def maybe_encode_bools(var: Variable) -> Variable:
    """Store boolean data as ``i1`` and tag it with ``attrs["dtype"] = "bool"``.

    ``var`` is returned unchanged unless its dtype is ``bool`` and neither
    its ``encoding`` nor its ``attrs`` already contain a ``"dtype"`` entry.
    """
    is_untagged_bool = (
        var.dtype == bool
        and "dtype" not in var.encoding
        and "dtype" not in var.attrs
    )
    if not is_untagged_bool:
        return var

    dims, data, attrs, encoding = _var_as_tuple(var)
    attrs["dtype"] = "bool"
    data = duck_array_ops.astype(data, dtype="i1", copy=True)
    return Variable(dims, data, attrs, encoding, fastpath=True)


def _infer_dtype(array, name: T_Name = None) -> np.dtype:
"""Given an object array with no missing values, infer its dtype from its
first element
Expand Down Expand Up @@ -292,13 +179,13 @@ def encode_cf_variable(
variables.CFScaleOffsetCoder(),
variables.CFMaskCoder(),
variables.UnsignedIntegerCoder(),
variables.NonStringCoder(),
variables.DefaultFillvalueCoder(),
variables.BooleanCoder(),
]:
var = coder.encode(var, name=name)

# TODO(shoyer): convert all of these to use coders, too:
var = maybe_encode_nonstring_dtype(var, name=name)
var = maybe_default_fill_value(var)
var = maybe_encode_bools(var)
# TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends:
var = ensure_dtype_not_object(var, name=name)

for attr_name in CF_RELATED_DATA:
Expand Down Expand Up @@ -389,19 +276,15 @@ def decode_cf_variable(
if decode_times:
var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)

dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)
# TODO(shoyer): convert everything below to use coders
if decode_endianness and not var.dtype.isnative:
var = variables.EndianCoder().decode(var)
original_dtype = var.dtype

if decode_endianness and not data.dtype.isnative:
# do this last, so it's only done if we didn't already unmask/scale
data = NativeEndiannessArray(data)
original_dtype = data.dtype
var = variables.BooleanCoder().decode(var)

encoding.setdefault("dtype", original_dtype)
dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)

if "dtype" in attributes and attributes["dtype"] == "bool":
del attributes["dtype"]
data = BoolTypeArray(data)
encoding.setdefault("dtype", original_dtype)

if not is_duck_dask_array(data):
data = indexing.LazilyIndexedArray(data)
Expand Down
Loading

0 comments on commit 551de70

Please sign in to comment.