From ce975095024957367e95f4d994f7ce884e57352c Mon Sep 17 00:00:00 2001
From: Oliver Strickson <ostrickson@turing.ac.uk>
Date: Tue, 9 Nov 2021 18:45:09 +0000
Subject: [PATCH 1/5] Extend ImageSource to optionally read EXIF data from
 images

---
 intake_xarray/image.py            | 178 ++++++++++++++++++++++++------
 intake_xarray/tests/test_image.py |  32 ++++++
 2 files changed, 178 insertions(+), 32 deletions(-)

diff --git a/intake_xarray/image.py b/intake_xarray/image.py
index d925dfe..35b4f56 100644
--- a/intake_xarray/image.py
+++ b/intake_xarray/image.py
@@ -50,6 +50,11 @@ def _coerce_shape(array, shape):
     return new_array
 
 
+def _add_leading_dimension(x):
+    """Add a new dimension to an array-like"""
+    return x[None, ...]
+
+
 def _dask_imread(files, imread=None, preprocess=None, coerce_shape=None):
     """ Read a stack of images into a dask array """
     from dask.array import Array
@@ -63,9 +68,6 @@ def _imread(open_file):
         with open_file as f:
             return imread(f)
 
-    def add_leading_dimension(x):
-        return x[None, ...]
-
     filenames = [f.path for f in files]
 
     name = 'imread-%s' % tokenize(filenames)
@@ -85,23 +87,23 @@ def add_leading_dimension(x):
 
     if coerce_shape is not None:
         if preprocess:
-            values = [(add_leading_dimension,
+            values = [(_add_leading_dimension,
                        (preprocess,
                         (reshape,
                          (_imread, f))))
                       for f in files]
         else:
-            values = [(add_leading_dimension,
+            values = [(_add_leading_dimension,
                        (reshape,
                         (_imread, f)))
                       for f in files]
     elif preprocess:
-        values = [(add_leading_dimension,
+        values = [(_add_leading_dimension,
                    (preprocess,
                     (_imread, f)))
                   for f in files]
     else:
-        values = [(add_leading_dimension,
+        values = [(_add_leading_dimension,
                    (_imread, f))
                   for f in files]
     dsk = dict(zip(keys, values))
@@ -111,7 +113,54 @@ def add_leading_dimension(x):
     return Array(dsk, name, chunks, sample.dtype)
 
 
-def reader(file, chunks, imread=None, preprocess=None, coerce_shape=None):
+def _dask_exifread(files, exif_tags):
+    """Construct a dask Array to read each tag in `exif_tags` (list of
+    str) from the EXIF data of the images in `files`
+    """
+    from numpy import array
+    from dask.array import Array
+    from dask.base import tokenize
+    from exifread import process_file as read_exif
+
+    def _read_exif(open_file):
+        # Using the context manager (as below) occasionally results
+        # in 'I/O operation on closed file' and similar errors
+        # with open_file as f:
+        #     return read_exif(f)
+        #
+        f = open_file.open()
+        return read_exif(f)
+
+    if not isinstance(exif_tags, list):
+        sample = _read_exif(files[0])
+        exif_tags = sample.keys()
+
+    ntags = len(exif_tags)
+
+    def extract_tags(d):
+        return array([d.get(tag) for tag in exif_tags])
+
+    filenames = [f.path for f in files]
+    name = 'exifread-%s' % tokenize(filenames)
+
+    keys = [(name, i, 0) for i in range(len(files))]
+    values = [(_add_leading_dimension,
+               (extract_tags,
+                (_read_exif, f)))
+              for f in files]
+
+    dsk = dict(zip(keys, values))
+
+    chunks = ((1,) * len(files), (ntags,))
+
+    exif_data = Array(dsk, name, chunks, object)
+
+    return {'EXIF ' + tag: exif_data[:,i] for i, tag in enumerate(exif_tags)}
+
+
+def reader(
+    file, chunks, imread=None, preprocess=None, coerce_shape=None, exif_tags=None
+):
     """Read a file object and output an dask xarray object
 
     NOTE: inspired by dask.array.image.imread but altering the input to accept
@@ -135,14 +184,24 @@ def reader(file, chunks, imread=None, preprocess=None, coerce_shape=None):
     coerce_shape : tuple len 2 (optional)
         Optionally coerce the shape of the height and width of the image
         by setting `coerce_shape` to desired shape.
+    exif_tags : boolean or list of str (optional)
+        Controls whether exif tags are extracted from the images. If a
+        list, the elements are treated as the particular tags to
+        extract from each image. For any other truthy value, all tags
+        found in the image's EXIF data are used. When tags are
+        extracted, an xarray Dataset is returned, with each exif tag
+        in a data variable named 'EXIF <tag>' (of type
+        `Optional[exifread.classes.IfdTag]`), and the image data in a
+        data variable 'raster'.
 
     Returns
     -------
-    Dask xarray.DataArray of the image. Treated as one chunk unless
-    chunks kwarg is specified.
+    Dask xarray.DataArray or xarray.Dataset of the image, and
+    (optionally) the value of any requested EXIF tags. Treated as one
+    chunk unless chunks kwarg is specified.
     """
     import numpy as np
-    from xarray import DataArray
+    from xarray import DataArray, Dataset
 
     if not imread:
         from skimage.io import imread
@@ -164,10 +223,22 @@ def reader(file, chunks, imread=None, preprocess=None, coerce_shape=None):
         coords['channel'] = np.arange(nchannel)
         dims += ('channel',)
 
-    return DataArray(array, coords=coords, dims=dims).chunk(chunks=chunks)
+    if exif_tags:
+        exif_dict = _dask_exifread([file], exif_tags)
+        exif_dict_ds = {tag: ((), arr[0]) for tag, arr in exif_dict.items()}
+
+        return Dataset(
+            {
+                'raster': (dims, array),
+                **exif_dict_ds,
+            },
+            coords=coords,
+        ).chunk(chunks=chunks)
+    else:
+        return DataArray(array, coords=coords, dims=dims).chunk(chunks=chunks)
 
 
-def multireader(files, chunks, concat_dim, **kwargs):
+def multireader(files, chunks, concat_dim, exif_tags, **kwargs):
     """Read a stack of images into a dask xarray object
 
     NOTE: copied from dask.array.image.imread but altering the input to accept
@@ -196,15 +267,25 @@ def multireader(files, chunks, concat_dim, **kwargs):
     coerce_shape : iterable of len 2 (optional)
         Optionally coerce the shape of the height and width of the image
         by setting `coerce_shape` to desired shape.
+    exif_tags : boolean or list of str (optional)
+        Controls whether exif tags are extracted from the images. If a
+        list, the elements are treated as the particular tags to
+        extract from each image. For any other truthy value, all tags
+        found in the first image (used as a sample) are used. When
+        tags are extracted, an xarray Dataset is returned, with each
+        exif tag in a data variable named 'EXIF <tag>' (of type
+        `Optional[exifread.classes.IfdTag]`), and the image data in a
+        data variable 'raster'.
 
     Returns
     -------
-    Dask xarray.DataArray of all images stacked along the first dimension.
-    All images will be treated as individual chunks unless
-    chunks kwarg is specified.
+    A Dask xarray.DataArray or xarray.Dataset, of all images stacked
+    along the first dimension, and (optionally) the value of any
+    requested EXIF tags.  All images will be treated as individual
+    chunks unless chunks kwarg is specified.
     """
     import numpy as np
-    from xarray import DataArray
+    from xarray import DataArray, Dataset
 
     dask_array = _dask_imread(files, **kwargs)
 
@@ -212,16 +293,32 @@ def multireader(files, chunks, concat_dim, **kwargs):
     coords = {'y': np.arange(ny),
               'x': np.arange(nx)}
     if isinstance(concat_dim, list):
-        dims = ('dim_0', 'y', 'x')
+        dims = ('dim_0',)
     else:
-        dims = (concat_dim, 'y', 'x')
+        dims = (concat_dim,)
+        coords = {concat_dim: np.arange(dask_array.shape[0]),
+                  **coords}
 
+    raster_dims = dims + ('y', 'x')
     if len(dask_array.shape) == 4:
         nchannel = dask_array.shape[3]
         coords['channel'] = np.arange(nchannel)
-        dims += ('channel',)
-
-    return DataArray(dask_array, coords=coords, dims=dims).chunk(chunks=chunks)
+        raster_dims += ('channel',)
+
+    if exif_tags:
+        exif_dict = _dask_exifread(files, exif_tags)
+        exif_dict_ds = {tag: (dims, arr) for tag, arr in exif_dict.items()}
+        return Dataset(
+            {
+                'raster': (raster_dims, dask_array),
+                **exif_dict_ds,
+            },
+            coords=coords,
+        ).chunk(chunks=chunks)
+    else:
+        return DataArray(
+            dask_array, coords=coords, dims=raster_dims
+        ).chunk(chunks=chunks)
 
 
 class ImageSource(DataSourceMixin, PatternMixin):
@@ -268,17 +365,26 @@ class ImageSource(DataSourceMixin, PatternMixin):
     coerce_shape : iterable of len 2 (optional)
         Optionally coerce the shape of the height and width of the image
         by setting `coerce_shape` to desired shape.
+    exif_tags : boolean or list of str (optional)
+        Controls whether exif tags are extracted from the images. If a
+        list, the elements are treated as the particular tags to
+        extract from each image. For any other truthy value, all tags
+        that could be extracted from a sample image are used.  When
+        tags are extracted, an xarray Dataset is returned, with each
+        exif tag in the corresponding data variable, and the image
+        data in the data variable 'raster'.
     """
     name = 'xarray_image'
 
     def __init__(self, urlpath, chunks=None, concat_dim='concat_dim',
                  metadata=None, path_as_pattern=True,
-                 storage_options=None, **kwargs):
+                 storage_options=None, exif_tags=None, **kwargs):
         self.path_as_pattern = path_as_pattern
         self.urlpath = urlpath
         self.chunks = chunks
         self.concat_dim = concat_dim
         self.storage_options = storage_options or {}
+        self.exif_tags = exif_tags
         self._kwargs = kwargs
         self._ds = None
         super(ImageSource, self).__init__(metadata=metadata)
@@ -297,7 +403,9 @@ def _open_files(self, files):
         import pandas as pd
         from xarray import DataArray
 
-        out = multireader(files, self.chunks, self.concat_dim, **self._kwargs)
+        out = multireader(
+            files, self.chunks, self.concat_dim, self.exif_tags, **self._kwargs
+        )
         if not self.pattern:
             return out
 
@@ -325,7 +433,7 @@ def _open_files(self, files):
                 k: DataArray(v, dims=self.concat_dim)
                 for k, v in field_values.items()
             }
-            return out.assign_coords(**coords).chunk(self.chunks)
+            return out.assign_coords(**coords).chunk(self.chunks).unify_chunks()
 
     def _open_dataset(self):
         """
@@ -338,7 +446,9 @@ def _open_dataset(self):
         if len(files) == 0:
             raise Exception("No files found at {}".format(self.urlpath))
         if len(files) == 1:
-            self._ds = reader(files[0], self.chunks, **self._kwargs)
+            self._ds = reader(
+                files[0], self.chunks, exif_tags=self.exif_tags, **self._kwargs
+            )
         else:
             self._ds = self._open_files(files)
 
@@ -353,8 +463,12 @@ def _get_schema(self):
         if self._ds is None:
             self._open_dataset()
 
-            # convert to dataset for serialization
-            ds2 = xr.Dataset({'raster': self._ds})
+            # coerce to dataset for serialization
+            if isinstance(self._ds, xr.Dataset):
+                ds2 = self._ds
+            else:
+                ds2 = xr.Dataset({'raster': self._ds})
+
             metadata = {
                 'dims': dict(ds2.dims),
                 'data_vars': {k: list(ds2[k].coords)
@@ -364,7 +478,7 @@ def _get_schema(self):
             }
             if getattr(self, 'on_server', False):
                 metadata['internal'] = serialize_zarr_ds(ds2)
-            for k, v in self._ds.attrs.items():
+            for k, v in ds2.raster.attrs.items():
                 try:
                     # ensure only sending serializable attrs from remote
                     msgpack.packb(v)
@@ -373,9 +487,9 @@ def _get_schema(self):
                     pass
             self._schema = Schema(
                 datashape=None,
-                dtype=str(self._ds.dtype),
-                shape=self._ds.shape,
-                npartitions=self._ds.data.npartitions,
+                dtype=str(ds2.raster.dtype),
+                shape=ds2.raster.shape,
+                npartitions=ds2.raster.data.npartitions,
                 extra_metadata=metadata)
 
         return self._schema
diff --git a/intake_xarray/tests/test_image.py b/intake_xarray/tests/test_image.py
index 4c7c784..4fe6338 100644
--- a/intake_xarray/tests/test_image.py
+++ b/intake_xarray/tests/test_image.py
@@ -137,6 +137,29 @@ def test_read_image():
     assert array.dtype == np.uint8
 
 
+def test_read_image_and_exif():
+    pytest.importorskip('skimage')
+    urlpath = os.path.join(here, 'data', 'images', 'beach57.tif')
+    source = ImageSource(urlpath=urlpath, exif_tags=True)
+    ds = source.read()
+    assert ds['raster'].shape == (256, 252, 3)
+    assert ds['raster'].dtype == np.uint8
+    assert ds['EXIF Image ImageWidth'].item().values == [252]
+    assert ds['EXIF Image ImageLength'].item().values == [256]
+
+
+def test_read_image_and_given_exif_tag():
+    pytest.importorskip('skimage')
+    urlpath = os.path.join(here, 'data', 'images', 'beach57.tif')
+    source = ImageSource(urlpath=urlpath, exif_tags=['Image ImageWidth'])
+    ds = source.read()
+    assert ds['raster'].shape == (256, 252, 3)
+    assert ds['raster'].dtype == np.uint8
+    assert ds['EXIF Image ImageWidth'].item().values == [252]
+    with pytest.raises(KeyError):
+        ds['EXIF Image ImageLength']
+
+
 def test_read_images_as_glob_without_coerce_raises_error():
     pytest.importorskip('skimage')
     urlpath = os.path.join(here, 'data', 'images', '*')
@@ -152,3 +175,12 @@ def test_read_images_as_glob_with_coerce():
     source = ImageSource(urlpath=urlpath, coerce_shape=(256, 256))
     array = source.read()
     assert array.shape == (3, 256, 256, 3)
+
+
+def test_read_images_and_exif_as_glob_with_coerce():
+    pytest.importorskip('skimage')
+    urlpath = os.path.join(here, 'data', 'images', '*')
+    source = ImageSource(urlpath=urlpath, coerce_shape=(256, 256), exif_tags=True)
+    ds = source.read()
+    assert ds['raster'].shape == (3, 256, 256, 3)
+    assert ds['EXIF Image ImageWidth'].shape == (3,)

From 5a2b86d3406d8ed79c2f14b46a2e990fcc088b43 Mon Sep 17 00:00:00 2001
From: Oliver Strickson <ostrickson@turing.ac.uk>
Date: Tue, 9 Nov 2021 18:48:27 +0000
Subject: [PATCH 2/5] Add exifread to test environments

---
 ci/environment-py37.yml     | 1 +
 ci/environment-py38.yml     | 1 +
 ci/environment-py39.yml     | 1 +
 ci/environment-upstream.yml | 1 +
 4 files changed, 4 insertions(+)

diff --git a/ci/environment-py37.yml b/ci/environment-py37.yml
index fbde860..ef6d901 100644
--- a/ci/environment-py37.yml
+++ b/ci/environment-py37.yml
@@ -5,6 +5,7 @@ dependencies:
   - python=3.7
   - aiohttp
   - boto3
+  - exifread
   - flask
   - h5netcdf
   - intake
diff --git a/ci/environment-py38.yml b/ci/environment-py38.yml
index f7d1ee6..63b7069 100644
--- a/ci/environment-py38.yml
+++ b/ci/environment-py38.yml
@@ -5,6 +5,7 @@ dependencies:
   - python=3.8
   - aiohttp
   - boto3
+  - exifread
   - flask
   - h5netcdf
   - intake
diff --git a/ci/environment-py39.yml b/ci/environment-py39.yml
index 834c429..bef6c1a 100644
--- a/ci/environment-py39.yml
+++ b/ci/environment-py39.yml
@@ -5,6 +5,7 @@ dependencies:
   - python=3.9
   - aiohttp
   - boto3
+  - exifread
   - flask
   - h5netcdf
   - intake
diff --git a/ci/environment-upstream.yml b/ci/environment-upstream.yml
index 63f0833..8aa0115 100644
--- a/ci/environment-upstream.yml
+++ b/ci/environment-upstream.yml
@@ -5,6 +5,7 @@ dependencies:
   - python
   - aiohttp
   - boto3
+  - exifread
   - flask
   - h5netcdf
   - netcdf4

From edd168297d4c50b15ba5ab3c1f4ab17699fc924c Mon Sep 17 00:00:00 2001
From: Oliver Strickson <ostrickson@turing.ac.uk>
Date: Tue, 9 Nov 2021 18:57:26 +0000
Subject: [PATCH 3/5] Update docstring

---
 intake_xarray/image.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/intake_xarray/image.py b/intake_xarray/image.py
index 35b4f56..36e4032 100644
--- a/intake_xarray/image.py
+++ b/intake_xarray/image.py
@@ -369,10 +369,12 @@ class ImageSource(DataSourceMixin, PatternMixin):
         Controls whether exif tags are extracted from the images. If a
         list, the elements are treated as the particular tags to
         extract from each image. For any other truthy value, all tags
-        that could be extracted from a sample image are used.  When
-        tags are extracted, an xarray Dataset is returned, with each
-        exif tag in the corresponding data variable, and the image
-        data in the data variable 'raster'.
+        found in the first image (used as a sample) are used. When
+        tags are extracted, an xarray Dataset is returned, with each
+        exif tag in a data variable named 'EXIF <tag>' (of type
+        `Optional[exifread.classes.IfdTag]`), and the image data in a
+        data variable 'raster'.
+
     """
     name = 'xarray_image'
 

From ca0ffd1c308faee4955a710c1661153cf51d7e4c Mon Sep 17 00:00:00 2001
From: Oliver Strickson <ostrickson@turing.ac.uk>
Date: Fri, 19 Nov 2021 17:11:16 +0000
Subject: [PATCH 4/5] Bump s3fs version in test environments

---
 ci/environment-py37.yml     | 2 +-
 ci/environment-py38.yml     | 2 +-
 ci/environment-py39.yml     | 2 +-
 ci/environment-upstream.yml | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/environment-py37.yml b/ci/environment-py37.yml
index ef6d901..8d34d7c 100644
--- a/ci/environment-py37.yml
+++ b/ci/environment-py37.yml
@@ -14,7 +14,7 @@ dependencies:
   - pydap
   - pytest
   - rasterio
-  - s3fs
+  - s3fs >= 2021.08.0
   - scikit-image
   - xarray >= 0.17
   - zarr
diff --git a/ci/environment-py38.yml b/ci/environment-py38.yml
index 63b7069..f5f4180 100644
--- a/ci/environment-py38.yml
+++ b/ci/environment-py38.yml
@@ -14,7 +14,7 @@ dependencies:
   - pydap
   - pytest
   - rasterio
-  - s3fs
+  - s3fs >= 2021.08.0
   - scikit-image
   - xarray >= 0.17
   - zarr
diff --git a/ci/environment-py39.yml b/ci/environment-py39.yml
index bef6c1a..6e9a6e0 100644
--- a/ci/environment-py39.yml
+++ b/ci/environment-py39.yml
@@ -14,7 +14,7 @@ dependencies:
   - pydap
   - pytest
   - rasterio
-  - s3fs
+  - s3fs >= 2021.08.0
   - scikit-image
   - xarray >= 0.17
   - zarr
diff --git a/ci/environment-upstream.yml b/ci/environment-upstream.yml
index 8aa0115..04d3b21 100644
--- a/ci/environment-upstream.yml
+++ b/ci/environment-upstream.yml
@@ -13,7 +13,7 @@ dependencies:
   - pydap
   - pytest
   - rasterio
-  - s3fs
+  - s3fs >= 2021.08.0
   - scikit-image
   - zarr
   - pip:

From f21855c81b18200a2532f2b3baf472f0e6229b65 Mon Sep 17 00:00:00 2001
From: Oliver Strickson <ostrickson@turing.ac.uk>
Date: Fri, 19 Nov 2021 22:08:51 +0000
Subject: [PATCH 5/5] Take copy of OpenFile instance passed to _read_exif

---
 intake_xarray/image.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/intake_xarray/image.py b/intake_xarray/image.py
index 36e4032..673fb4e 100644
--- a/intake_xarray/image.py
+++ b/intake_xarray/image.py
@@ -117,19 +117,18 @@ def _dask_exifread(files, exif_tags):
     """Construct a dask Array to read each tag in `exif_tags` (list of
     str) from the EXIF data of the images in `files`
     """
+    from copy import copy
     from numpy import array
     from dask.array import Array
     from dask.base import tokenize
     from exifread import process_file as read_exif
 
     def _read_exif(open_file):
-        # Using the context manager (as below) occasionally results
-        # in 'I/O operation on closed file' and similar errors
-        # with open_file as f:
-        #     return read_exif(f)
-        #
-        f = open_file.open()
-        return read_exif(f)
+        # Take a fresh copy of open_file, to work around occasional
+        # 'I/O operation on closed file' and similar errors when
+        # open_file is also opened elsewhere
+        with copy(open_file) as f:
+            return read_exif(f)
 
     if not isinstance(exif_tags, list):
         sample = _read_exif(files[0])