Skip to content

Commit

Permalink
Read small integers as float32, not float64 (#1840)
Browse files Browse the repository at this point in the history
* Read small integers as float32, not float64

AKA the "I just wasted 4.6 TB of memory" patch.

* Tests now include a clean flake8 run
  • Loading branch information
Zac-HD authored and Joe Hamman committed Jan 23, 2018
1 parent b55143d commit 65e5f05
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 8 deletions.
1 change: 0 additions & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
- [ ] Closes #xxxx (remove if there is no corresponding issue, which should only be the case for minor changes)
- [ ] Tests added (for all bug fixes or enhancements)
- [ ] Tests passed (for all non-documentation changes)
- [ ] Passes ``git diff upstream/master **/*py | flake8 --diff`` (remove if you did not edit any Python files)
- [ ] Fully documented, including `whats-new.rst` for all changes and `api.rst` for new API (remove if this change should not be visible to users, e.g., if it is an internal clean-up, or if this is part of a larger project that will be documented later)
9 changes: 6 additions & 3 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ Enhancements
- :py:func:`~plot.line()` learned to draw multiple lines if provided with a
2D variable.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Reduce memory usage when decoding a variable with a scale_factor, by
converting 8-bit and 16-bit integers to float32 instead of float64
(:pull:`1840`), and keeping float16 and float32 as float32 (:issue:`1842`).
Correspondingly, encoded variables may also be saved with a smaller dtype.
By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.

.. _Zarr: http://zarr.readthedocs.io/

Expand All @@ -76,14 +81,12 @@ Bug fixes
- Fixed encoding of multi-dimensional coordinates in
:py:meth:`~Dataset.to_netcdf` (:issue:`1763`).
By `Mike Neish <https://github.com/neishm>`_.

- Fixed chunking with non-file-based rasterio datasets (:issue:`1816`) and
refactored rasterio test suite.
By `Ryan Abernathey <https://github.com/rabernat>`_.
- Bug fix in open_dataset(engine='pydap') (:issue:`1775`).
By `Keisuke Fujii <https://github.com/fujiisoup>`_.

- Bug fix in vectorized assignment (:issue:`1743`, `1744`).
- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
Now item assignment to :py:meth:`~DataArray.__setitem__` checks
- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
Now item assignment to :py:meth:`DataArray.__setitem__` checks
Expand Down
24 changes: 22 additions & 2 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,25 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
return data


def _choose_float_dtype(dtype, has_offset):
"""Return a float dtype that can losslessly represent `dtype` values."""
# Keep float32 as-is. Upcast half-precision to single-precision,
# because float16 is "intended for storage but not computation"
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
return np.float32
# float32 can exactly represent all integers up to 24 bits
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if not has_offset:
return np.float32
# For all other types and circumstances, we just use float64.
# (safe because eg. complex numbers are not supported in NetCDF)
return np.float64


class CFScaleOffsetCoder(VariableCoder):
"""Scale and offset variables according to CF conventions.
Expand All @@ -216,7 +235,8 @@ def encode(self, variable, name=None):
dims, data, attrs, encoding = unpack_for_encoding(variable)

if 'scale_factor' in encoding or 'add_offset' in encoding:
data = data.astype(dtype=np.float64, copy=True)
dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding)
data = data.astype(dtype=dtype, copy=True)
if 'add_offset' in encoding:
data -= pop_to(encoding, attrs, 'add_offset', name=name)
if 'scale_factor' in encoding:
Expand All @@ -230,7 +250,7 @@ def decode(self, variable, name=None):
if 'scale_factor' in attrs or 'add_offset' in attrs:
scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name)
add_offset = pop_to(attrs, encoding, 'add_offset', name=name)
dtype = np.float64
dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs)
transform = partial(_scale_offset_decoding,
scale_factor=scale_factor,
add_offset=add_offset,
Expand Down
4 changes: 2 additions & 2 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def open_example_dataset(name, *args, **kwargs):


def create_masked_and_scaled_data():
x = np.array([np.nan, np.nan, 10, 10.1, 10.2])
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
encoding = {'_FillValue': -1, 'add_offset': 10,
'scale_factor': np.float32(0.1), 'dtype': 'i2'}
return Dataset({'x': ('t', x, {}, encoding)})
Expand All @@ -80,7 +80,7 @@ def create_encoded_masked_and_scaled_data():
def create_unsigned_masked_scaled_data():
encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1',
'add_offset': 10, 'scale_factor': np.float32(0.1)}
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan])
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
return Dataset({'x': ('t', x, {}, encoding)})


Expand Down
14 changes: 14 additions & 0 deletions xarray/tests/test_coding.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np

import pytest

import xarray as xr
from xarray.core.pycompat import suppress
from xarray.coding import variables
Expand Down Expand Up @@ -36,3 +38,15 @@ def test_coder_roundtrip():
coder = variables.CFMaskCoder()
roundtripped = coder.decode(coder.encode(original))
assert_identical(original, roundtripped)


@pytest.mark.parametrize('dtype', ['u1', 'u2', 'i1', 'i2', 'f2', 'f4'])
def test_scaling_converts_to_float32(dtype):
    # Small integer and float dtypes with a scale factor should round-trip
    # through CFScaleOffsetCoder as float32 rather than float64.
    coder = variables.CFScaleOffsetCoder()
    original = xr.Variable(('x',), np.arange(10, dtype=dtype),
                           encoding=dict(scale_factor=10))
    encoded = coder.encode(original)
    assert encoded.dtype == np.float32
    roundtripped = coder.decode(encoded)
    assert_identical(original, roundtripped)
    assert roundtripped.dtype == np.float32

0 comments on commit 65e5f05

Please sign in to comment.