From ce975095024957367e95f4d994f7ce884e57352c Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Tue, 9 Nov 2021 18:45:09 +0000 Subject: [PATCH 1/5] Extend ImageSource to optionally read EXIF data from images --- intake_xarray/image.py | 178 ++++++++++++++++++++++++------ intake_xarray/tests/test_image.py | 32 ++++++ 2 files changed, 178 insertions(+), 32 deletions(-) diff --git a/intake_xarray/image.py b/intake_xarray/image.py index d925dfe..35b4f56 100644 --- a/intake_xarray/image.py +++ b/intake_xarray/image.py @@ -50,6 +50,11 @@ def _coerce_shape(array, shape): return new_array +def _add_leading_dimension(x): + """Add a new dimension to an array-like""" + return x[None, ...] + + def _dask_imread(files, imread=None, preprocess=None, coerce_shape=None): """ Read a stack of images into a dask array """ from dask.array import Array @@ -63,9 +68,6 @@ def _imread(open_file): with open_file as f: return imread(f) - def add_leading_dimension(x): - return x[None, ...] - filenames = [f.path for f in files] name = 'imread-%s' % tokenize(filenames) @@ -85,23 +87,23 @@ def add_leading_dimension(x): if coerce_shape is not None: if preprocess: - values = [(add_leading_dimension, + values = [(_add_leading_dimension, (preprocess, (reshape, (_imread, f)))) for f in files] else: - values = [(add_leading_dimension, + values = [(_add_leading_dimension, (reshape, (_imread, f))) for f in files] elif preprocess: - values = [(add_leading_dimension, + values = [(_add_leading_dimension, (preprocess, (_imread, f))) for f in files] else: - values = [(add_leading_dimension, + values = [(_add_leading_dimension, (_imread, f)) for f in files] dsk = dict(zip(keys, values)) @@ -111,7 +113,54 @@ def add_leading_dimension(x): return Array(dsk, name, chunks, sample.dtype) -def reader(file, chunks, imread=None, preprocess=None, coerce_shape=None): +def _dask_exifread(files, exif_tags): + """Construct a dask Array to read each tag in `exif_tags` (list of + str) from the EXIF data of the images in `files` + """ + from numpy import array + from dask.array import Array + from dask.base import tokenize + from exifread import process_file as read_exif + + def _read_exif(open_file): + # Using the context manager (as below) occasionally results + # in 'I/O operation on closed file' and similar errors + # with open_file as f: + # return read_exif(f) + # + f = open_file.open() + return read_exif(f) + + if not isinstance(exif_tags, list): + sample = _read_exif(files[0]) + exif_tags = sample.keys() + + ntags = len(exif_tags) + + def extract_tags(d): + return array([d.get(tag) for tag in exif_tags]) + + filenames = [f.path for f in files] + name = 'exifread-%s' % tokenize(filenames) + + keys = [(name, i, 0) for i in range(len(files))] + values = [(_add_leading_dimension, + (extract_tags, + (_read_exif, f))) + for f in files] + + dsk = dict(zip(keys, values)) + + chunks = ((1,) * len(files), (ntags,)) + + exif_data = Array(dsk, name, chunks, object) + + return {'EXIF ' + tag: exif_data[:,i] for i, tag in enumerate(exif_tags)} + + +def reader( + file, chunks, imread=None, preprocess=None, coerce_shape=None, exif_tags=None +): """Read a file object and output an dask xarray object NOTE: inspired by dask.array.image.imread but altering the input to accept @@ -135,14 +184,24 @@ def reader(file, chunks, imread=None, preprocess=None, coerce_shape=None): coerce_shape : tuple len 2 (optional) Optionally coerce the shape of the height and width of the image by setting `coerce_shape` to desired shape. + exif_tags : boolean or list of str (optional) + Controls whether exif tags are extracted from the images. If a + list, the elements are treated as the particular tags to + extract from each image. For any other truthy value, all tags + that were able to be extracted from a sample image are used. + When tags are extracted, an xarray Dataset is returned, with + each exif tag in a corresponding data variable of the Dataset, + (of type `Optional[exifread.classes.IfdTag]`), and the image + data in a data variable 'raster'. Returns ------- - Dask xarray.DataArray of the image. Treated as one chunk unless - chunks kwarg is specified. + Dask xarray.DataArray or xarray.Dataset of the image, and + (optionally) the value of any requested EXIF tags. Treated as one + chunk unless chunks kwarg is specified. """ import numpy as np - from xarray import DataArray + from xarray import DataArray, Dataset if not imread: from skimage.io import imread @@ -164,10 +223,22 @@ def reader(file, chunks, imread=None, preprocess=None, coerce_shape=None): coords['channel'] = np.arange(nchannel) dims += ('channel',) - return DataArray(array, coords=coords, dims=dims).chunk(chunks=chunks) + if exif_tags: + exif_dict = _dask_exifread([file], exif_tags) + exif_dict_ds = {tag: ((), arr[0]) for tag, arr in exif_dict.items()} + + return Dataset( + { + 'raster': (dims, array), + **exif_dict_ds, + }, + coords=coords, + ).chunk(chunks=chunks) + else: + return DataArray(array, coords=coords, dims=dims).chunk(chunks=chunks) -def multireader(files, chunks, concat_dim, **kwargs): +def multireader(files, chunks, concat_dim, exif_tags, **kwargs): """Read a stack of images into a dask xarray object NOTE: copied from dask.array.image.imread but altering the input to accept @@ -196,15 +267,25 @@ def multireader(files, chunks, concat_dim, **kwargs): coerce_shape : iterable of len 2 (optional) Optionally coerce the shape of the height and width of the image by setting `coerce_shape` to desired shape. + exif_tags : boolean or list of str (optional) + Controls whether exif tags are extracted from the images. If a + list, the elements are treated as the particular tags to + extract from each image. For any other truthy value, all tags + that were able to be extracted from a sample image are used. + When tags are extracted, an xarray Dataset is returned, with + each exif tag in a corresponding data variable of the Dataset, + (of type `Optional[exifread.classes.IfdTag]`), and the image + data in a data variable 'raster'. Returns ------- - Dask xarray.DataArray of all images stacked along the first dimension. - All images will be treated as individual chunks unless - chunks kwarg is specified. + A Dask xarray.DataArray or xarray.Dataset, of all images stacked + along the first dimension, and (optionally) the value of any + requested EXIF tags. All images will be treated as individual + chunks unless chunks kwarg is specified. """ import numpy as np - from xarray import DataArray + from xarray import DataArray, Dataset dask_array = _dask_imread(files, **kwargs) @@ -212,16 +293,32 @@ def multireader(files, chunks, concat_dim, **kwargs): coords = {'y': np.arange(ny), 'x': np.arange(nx)} if isinstance(concat_dim, list): - dims = ('dim_0', 'y', 'x') + dims = ('dim_0',) else: - dims = (concat_dim, 'y', 'x') + dims = (concat_dim,) + coords = {concat_dim: np.arange(dask_array.shape[0]), + **coords} + raster_dims = dims + ('y', 'x') if len(dask_array.shape) == 4: nchannel = dask_array.shape[3] coords['channel'] = np.arange(nchannel) - dims += ('channel',) - - return DataArray(dask_array, coords=coords, dims=dims).chunk(chunks=chunks) + raster_dims += ('channel',) + + if exif_tags: + exif_dict = _dask_exifread(files, exif_tags) + exif_dict_ds = {tag: (dims, arr) for tag, arr in exif_dict.items()} + return Dataset( + { + 'raster': (raster_dims, dask_array), + **exif_dict_ds, + }, + coords=coords, + ).chunk(chunks=chunks) + else: + return DataArray( + dask_array, coords=coords, dims=raster_dims + ).chunk(chunks=chunks) class ImageSource(DataSourceMixin, PatternMixin): @@ -268,17 +365,26 @@ class ImageSource(DataSourceMixin, PatternMixin): coerce_shape : iterable of len 2 (optional) Optionally coerce the shape of the height and width of the image by setting `coerce_shape` to desired shape. + exif_tags : boolean or list of str (optional) + Controls whether exif tags are extracted from the images. If a + list, the elements are treated as the particular tags to + extract from each image. For any other truthy value, all tags + that could be extracted from a sample image are used. When + tags are extracted, an xarray Dataset is returned, with each + exif tag in the corresponding data variable, and the image + data in the data variable 'raster'. """ name = 'xarray_image' def __init__(self, urlpath, chunks=None, concat_dim='concat_dim', metadata=None, path_as_pattern=True, - storage_options=None, **kwargs): + storage_options=None, exif_tags=None, **kwargs): self.path_as_pattern = path_as_pattern self.urlpath = urlpath self.chunks = chunks self.concat_dim = concat_dim self.storage_options = storage_options or {} + self.exif_tags = exif_tags self._kwargs = kwargs self._ds = None super(ImageSource, self).__init__(metadata=metadata) @@ -297,7 +403,9 @@ def _open_files(self, files): import pandas as pd from xarray import DataArray - out = multireader(files, self.chunks, self.concat_dim, **self._kwargs) + out = multireader( + files, self.chunks, self.concat_dim, self.exif_tags, **self._kwargs + ) if not self.pattern: return out @@ -325,7 +433,7 @@ def _open_files(self, files): k: DataArray(v, dims=self.concat_dim) for k, v in field_values.items() } - return out.assign_coords(**coords).chunk(self.chunks) + return out.assign_coords(**coords).chunk(self.chunks).unify_chunks() def _open_dataset(self): """ @@ -338,7 +446,9 @@ def _open_dataset(self): if len(files) == 0: raise Exception("No files found at {}".format(self.urlpath)) if len(files) == 1: - self._ds = reader(files[0], self.chunks, **self._kwargs) + self._ds = reader( + files[0], self.chunks, exif_tags=self.exif_tags, **self._kwargs + ) else: self._ds = self._open_files(files) @@ -353,8 +463,12 @@ def _get_schema(self): if self._ds is None: self._open_dataset() - # convert to dataset for serialization - ds2 = xr.Dataset({'raster': self._ds}) + # coerce to dataset for serialization + if isinstance(self._ds, xr.Dataset): + ds2 = self._ds + else: + ds2 = xr.Dataset({'raster': self._ds}) + metadata = { 'dims': dict(ds2.dims), 'data_vars': {k: list(ds2[k].coords) @@ -364,7 +478,7 @@ def _get_schema(self): } if getattr(self, 'on_server', False): metadata['internal'] = serialize_zarr_ds(ds2) - for k, v in self._ds.attrs.items(): + for k, v in ds2.raster.attrs.items(): try: # ensure only sending serializable attrs from remote msgpack.packb(v) @@ -373,9 +487,9 @@ def _get_schema(self): pass self._schema = Schema( datashape=None, - dtype=str(self._ds.dtype), - shape=self._ds.shape, - npartitions=self._ds.data.npartitions, + dtype=str(ds2.raster.dtype), + shape=ds2.raster.shape, + npartitions=ds2.raster.data.npartitions, extra_metadata=metadata) return self._schema diff --git a/intake_xarray/tests/test_image.py b/intake_xarray/tests/test_image.py index 4c7c784..4fe6338 100644 --- a/intake_xarray/tests/test_image.py +++ b/intake_xarray/tests/test_image.py @@ -137,6 +137,29 @@ def test_read_image(): assert array.dtype == np.uint8 +def test_read_image_and_exif(): + pytest.importorskip('skimage') + urlpath = os.path.join(here, 'data', 'images', 'beach57.tif') + source = ImageSource(urlpath=urlpath, exif_tags=True) + ds = source.read() + assert ds['raster'].shape == (256, 252, 3) + assert ds['raster'].dtype == np.uint8 + assert ds['EXIF Image ImageWidth'].item().values == [252] + assert ds['EXIF Image ImageLength'].item().values == [256] + + +def test_read_image_and_given_exif_tag(): + pytest.importorskip('skimage') + urlpath = os.path.join(here, 'data', 'images', 'beach57.tif') + source = ImageSource(urlpath=urlpath, exif_tags=['Image ImageWidth']) + ds = source.read() + assert ds['raster'].shape == (256, 252, 3) + assert ds['raster'].dtype == np.uint8 + assert ds['EXIF Image ImageWidth'].item().values == [252] + with pytest.raises(KeyError): + ds['EXIF Image ImageLength'] + + def test_read_images_as_glob_without_coerce_raises_error(): pytest.importorskip('skimage') urlpath = os.path.join(here, 'data', 'images', '*') @@ -152,3 +175,12 @@ def test_read_images_as_glob_with_coerce(): source = ImageSource(urlpath=urlpath, coerce_shape=(256, 256)) array = source.read() assert array.shape == (3, 256, 256, 3) + + +def test_read_images_and_exif_as_glob_with_coerce(): + pytest.importorskip('skimage') + urlpath = os.path.join(here, 'data', 'images', '*') + source = ImageSource(urlpath=urlpath, coerce_shape=(256, 256), exif_tags=True) + ds = source.read() + assert ds['raster'].shape == (3, 256, 256, 3) + assert ds['EXIF Image ImageWidth'].shape == (3,) From 5a2b86d3406d8ed79c2f14b46a2e990fcc088b43 Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Tue, 9 Nov 2021 18:48:27 +0000 Subject: [PATCH 2/5] Add exifread to test environments --- ci/environment-py37.yml | 1 + ci/environment-py38.yml | 1 + ci/environment-py39.yml | 1 + ci/environment-upstream.yml | 1 + 4 files changed, 4 insertions(+) diff --git a/ci/environment-py37.yml b/ci/environment-py37.yml index fbde860..ef6d901 100644 --- a/ci/environment-py37.yml +++ b/ci/environment-py37.yml @@ -5,6 +5,7 @@ dependencies: - python=3.7 - aiohttp - boto3 + - exifread - flask - h5netcdf - intake diff --git a/ci/environment-py38.yml b/ci/environment-py38.yml index f7d1ee6..63b7069 100644 --- a/ci/environment-py38.yml +++ b/ci/environment-py38.yml @@ -5,6 +5,7 @@ dependencies: - python=3.8 - aiohttp - boto3 + - exifread - flask - h5netcdf - intake diff --git a/ci/environment-py39.yml b/ci/environment-py39.yml index 834c429..bef6c1a 100644 --- a/ci/environment-py39.yml +++ b/ci/environment-py39.yml @@ -5,6 +5,7 @@ dependencies: - python=3.9 - aiohttp - boto3 + - exifread - flask - h5netcdf - intake diff --git a/ci/environment-upstream.yml b/ci/environment-upstream.yml index 63f0833..8aa0115 100644 --- a/ci/environment-upstream.yml +++ b/ci/environment-upstream.yml @@ -5,6 +5,7 @@ dependencies: - python - aiohttp - boto3 + - exifread - flask - h5netcdf - netcdf4 From edd168297d4c50b15ba5ab3c1f4ab17699fc924c Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Tue, 9 Nov 2021 18:57:26 +0000 Subject: [PATCH 3/5] Update docstring --- intake_xarray/image.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/intake_xarray/image.py b/intake_xarray/image.py index 35b4f56..36e4032 100644 --- a/intake_xarray/image.py +++ b/intake_xarray/image.py @@ -369,10 +369,12 @@ class ImageSource(DataSourceMixin, PatternMixin): Controls whether exif tags are extracted from the images. If a list, the elements are treated as the particular tags to extract from each image. For any other truthy value, all tags - that could be extracted from a sample image are used. When - tags are extracted, an xarray Dataset is returned, with each - exif tag in the corresponding data variable, and the image - data in the data variable 'raster'. + that were able to be extracted from a sample image are used. + When tags are extracted, an xarray Dataset is returned, with + each exif tag in a corresponding data variable of the Dataset, + (of type `Optional[exifread.classes.IfdTag]`), and the image + data in a data variable 'raster'. + """ name = 'xarray_image' From ca0ffd1c308faee4955a710c1661153cf51d7e4c Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Fri, 19 Nov 2021 17:11:16 +0000 Subject: [PATCH 4/5] Bump s3fs version in test environments --- ci/environment-py37.yml | 2 +- ci/environment-py38.yml | 2 +- ci/environment-py39.yml | 2 +- ci/environment-upstream.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/environment-py37.yml b/ci/environment-py37.yml index ef6d901..8d34d7c 100644 --- a/ci/environment-py37.yml +++ b/ci/environment-py37.yml @@ -14,7 +14,7 @@ dependencies: - pydap - pytest - rasterio - - s3fs + - s3fs >= 2021.08.0 - scikit-image - xarray >= 0.17 - zarr diff --git a/ci/environment-py38.yml b/ci/environment-py38.yml index 63b7069..f5f4180 100644 --- a/ci/environment-py38.yml +++ b/ci/environment-py38.yml @@ -14,7 +14,7 @@ dependencies: - pydap - pytest - rasterio - - s3fs + - s3fs >= 2021.08.0 - scikit-image - xarray >= 0.17 - zarr diff --git a/ci/environment-py39.yml b/ci/environment-py39.yml index bef6c1a..6e9a6e0 100644 --- a/ci/environment-py39.yml +++ b/ci/environment-py39.yml @@ -14,7 +14,7 @@ dependencies: - pydap - pytest - rasterio - - s3fs + - s3fs >= 2021.08.0 - scikit-image - xarray >= 0.17 - zarr diff --git a/ci/environment-upstream.yml b/ci/environment-upstream.yml index 8aa0115..04d3b21 100644 --- a/ci/environment-upstream.yml +++ b/ci/environment-upstream.yml @@ -13,7 +13,7 @@ dependencies: - pydap - pytest - rasterio - - s3fs + - s3fs >= 2021.08.0 - scikit-image - zarr - pip: From f21855c81b18200a2532f2b3baf472f0e6229b65 Mon Sep 17 00:00:00 2001 From: Oliver Strickson Date: Fri, 19 Nov 2021 22:08:51 +0000 Subject: [PATCH 5/5] Take copy of OpenFile instance passed to _read_exif --- intake_xarray/image.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/intake_xarray/image.py b/intake_xarray/image.py index 36e4032..673fb4e 100644 --- a/intake_xarray/image.py +++ b/intake_xarray/image.py @@ -117,19 +117,18 @@ def _dask_exifread(files, exif_tags): """Construct a dask Array to read each tag in `exif_tags` (list of str) from the EXIF data of the images in `files` """ + from copy import copy from numpy import array from dask.array import Array from dask.base import tokenize from exifread import process_file as read_exif def _read_exif(open_file): - # Using the context manager (as below) occasionally results - # in 'I/O operation on closed file' and similar errors - # with open_file as f: - # return read_exif(f) - # - f = open_file.open() - return read_exif(f) + # Take a fresh copy of open_file, to work around occasional + # 'I/O operation on closed file' and similar errors when + # open_file is also opened elsewhere + with copy(open_file) as f: + return read_exif(f) if not isinstance(exif_tags, list): sample = _read_exif(files[0])