Skip to content

Commit

Permalink
Merge pull request #17 from aaronspring/AS_cache
Browse files Browse the repository at this point in the history
caching
  • Loading branch information
Ian Rose authored Sep 11, 2020
2 parents 4332e03 + 6d685cc commit d5de6c5
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ script:
- |
flake8 .
if [ "$TRAVIS_OS_NAME" = "linux" ]; then
conda build -c defaults -c conda-forge ./conda
travis_wait 30 conda build -c defaults -c conda-forge ./conda
else
# Workaround for Travis-CI bug #2: https://github.com/travis-ci/travis-ci/issues/7773
conda build -c defaults -c conda-forge --no-test ./conda
Expand Down
2 changes: 2 additions & 0 deletions conda/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ test:
source_files:
- tests
requires:
- aiohttp
- pytest
- requests
commands:
- py.test --verbose

Expand Down
70 changes: 64 additions & 6 deletions intake_geopandas/geopandas.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# -*- coding: utf-8 -*-
from abc import ABC, abstractmethod

from intake.source.base import DataSource, Schema
import fsspec
import geopandas
from intake.source.base import DataSource, Schema

from . import __version__

Expand Down Expand Up @@ -50,8 +51,15 @@ def _close(self):


class GeoPandasFileSource(GeoPandasSource):
def __init__(self, urlpath, bbox=None,
geopandas_kwargs=None, metadata=None):
def __init__(
self,
urlpath,
use_fsspec=False,
storage_options=None,
bbox=None,
geopandas_kwargs=None,
metadata=None,
):
"""
Parameters
----------
Expand All @@ -60,25 +68,62 @@ def __init__(self, urlpath, bbox=None,
opened. Some examples:
- ``{{ CATALOG_DIR }}data/states.shp``
- ``http://some.domain.com/data/states.geo.json``
use_fsspec: bool
Whether to use fsspec to open `urlpath`. By default, `urlpath` is passed
directly to GeoPandas, which opens the file using `fiona`. However, for some
use cases it may be beneficial to read the file using `fsspec` before
passing the resulting bytes to GeoPandas (e.g., when using `fsspec` caching).
Note that fiona/GDAL and `fsspec` have mutually-incompatible URL chaining
syntaxes, so the URLs passed to each may be significantly different.
storage_options: dict
Storage options to pass to fsspec when opening. Only used when
`use_fsspec=True`.
bbox : tuple | GeoDataFrame or GeoSeries, default None
Filter features by given bounding box, GeoSeries, or GeoDataFrame.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
geopandas_kwargs : dict
Any further arguments to pass to geopandas's read_file function.
"""
self.urlpath = urlpath
self._use_fsspec = use_fsspec
self._storage_options = storage_options or {}
self._bbox = bbox
self._geopandas_kwargs = geopandas_kwargs or {}
self._dataframe = None
self.storage_options = storage_options or {}

super().__init__(metadata=metadata)

def _open_dataset(self):
"""
Open dataset using geopandas and use pattern fields to set new columns.
Open dataset using geopandas.
"""
self._dataframe = geopandas.read_file(
self.urlpath, bbox=self._bbox, **self._geopandas_kwargs)
if self._use_fsspec:
with fsspec.open_files(self.urlpath, **self._storage_options) as f:
f = self._resolve_single_file(f) if len(f) > 1 else f[0]
self._dataframe = geopandas.read_file(
f,
bbox=self._bbox,
**self._geopandas_kwargs,
)
else:
self._dataframe = geopandas.read_file(
self.urlpath,
bbox=self._bbox,
**self._geopandas_kwargs
)

def _resolve_single_file(self, filelist):
    """
    Pick the single file to hand to geopandas out of several OpenFiles.

    The base driver has no rule for choosing among ``filelist``, so it always
    refuses; ``ShapefileSource`` overrides this method with its own selection.

    Raises
    ------
    NotImplementedError
        Always.
    """
    message = "Opening multiple files is not supported by this driver"
    raise NotImplementedError(message)


class GeoJSONSource(GeoPandasFileSource):
Expand All @@ -88,6 +133,19 @@ class GeoJSONSource(GeoPandasFileSource):
class ShapefileSource(GeoPandasFileSource):
    # Driver name attribute for this source.
    name = "shapefile"

    def _resolve_single_file(self, filelist):
        """
        Return the first local path ending in ``.shp`` for this source's URL.

        The paths are obtained by calling ``fsspec.open_local`` on
        ``self.urlpath`` with ``self.storage_options``; ``filelist`` is only
        used in the error message.

        Raises
        ------
        ValueError
            If none of the local paths ends with ``.shp``.
        """
        local_paths = fsspec.open_local(self.urlpath, **self.storage_options)
        shp_path = next(
            (path for path in local_paths if path.endswith(".shp")),
            None,
        )
        if shp_path is not None:
            return shp_path
        raise ValueError(
            f"No shapefile found in {filelist}, if you are using fsspec caching"
            " consider using same_names=True"
        )


class GeoPandasSQLSource(GeoPandasSource):
def __init__(self, uri, sql_expr=None, table=None,
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
intake
geopandas
pytest
fsspec
29 changes: 29 additions & 0 deletions tests/data/shape.catalog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,32 @@ sources:
driver: shape
args:
urlpath: '{CATALOG_PATH}/stations/stations.shp'

MEOW:
description: MEOW
driver: intake_geopandas.geopandas.ShapefileSource
args:
urlpath: http://maps.tnc.org/files/shp/MEOW-TNC.zip


MEOW_simplecache:
description: caching of the zipfile
driver: intake_geopandas.geopandas.ShapefileSource
args:
urlpath: simplecache::http://maps.tnc.org/files/shp/MEOW-TNC.zip
use_fsspec: true
storage_options:
simplecache:
same_names: true
cache_storage: /tmp/intake_geopandas


ALA_many_shapefiles_in_one_zip:
  description: gadm36_ALA contains gadm36_ALA_0.shp and gadm36_ALA_1.shp, caches first
  driver: intake_geopandas.geopandas.ShapefileSource
  args:
    urlpath: simplecache::zip://gadm36_ALA_0*::https://biogeo.ucdavis.edu/data/gadm3.6/shp/gadm36_ALA_shp.zip
    # The chained simplecache:: URL and storage_options below are fsspec
    # features; the driver only uses them when use_fsspec is enabled.
    use_fsspec: true
    storage_options:
      simplecache:
        same_names: true
        cache_storage: /tmp/intake_geopandas
103 changes: 103 additions & 0 deletions tests/test_remote_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
import os
import shutil

import pytest

from intake_geopandas import GeoJSONSource, ShapefileSource

import geopandas

def _release_tuple(version):
    """Return the leading numeric components of a version string as a tuple.

    Only the first three dot-separated components are considered, and any
    non-digit suffix on a component (e.g. ``rc1`` or ``+12.gabc``) is dropped,
    so ``'0.10.0'`` becomes ``(0, 10, 0)`` and ``'0.8.1+3.gabc'`` becomes
    ``(0, 8, 1)``.
    """
    release = []
    for component in version.split('.')[:3]:
        digits = ''
        for char in component:
            if not char.isdigit():
                break
            digits += char
        release.append(int(digits) if digits else 0)
    return tuple(release)


# Compare versions numerically rather than by concatenating digits: the old
# check turned '0.10.0' into 10 and wrongly treated it as older than 0.8.1.
# The threshold itself (strictly newer than 0.8.1) is unchanged.
geopandas_version_allows_fsspec_caching = _release_tuple(geopandas.__version__) > (0, 8, 1)

def try_clean_cache(item):
    """Remove the on-disk fsspec cache directory configured on ``item``.

    Looks for a ``'filecache'`` or ``'simplecache'`` entry in
    ``item.storage_options`` and deletes the directory named by that entry's
    ``'cache_storage'`` value, provided the value is a string and the path
    exists. When both entries are present, the ``'simplecache'`` one is used.

    Parameters
    ----------
    item : object
        Any object exposing a ``storage_options`` mapping.

    Raises
    ------
    AssertionError
        If neither caching protocol is configured in ``storage_options``.
    """
    caching = None
    for protocol in ('filecache', 'simplecache'):
        if protocol in item.storage_options:
            caching = protocol
    # Check the selected protocol, not the loop variable: the loop variable is
    # never None after the loop, so the old assert could not fire and a missing
    # entry surfaced as an unbound-name error instead.
    assert caching is not None, 'caching not found'
    path = item.storage_options[caching]['cache_storage']
    if isinstance(path, str) and os.path.exists(path):
        shutil.rmtree(path)

@pytest.mark.skipif(
    not geopandas_version_allows_fsspec_caching,
    reason='requires geopandas release after 0.8.0',
)
@pytest.mark.parametrize(
    'url',
    [
        'http://maps.tnc.org/files/shp/MEOW-TNC.zip',
        'https://biogeo.ucdavis.edu/data/gadm3.6/shp/gadm36_ALA_shp.zip',
        'zip://gadm36_ALA_0*::https://biogeo.ucdavis.edu/data/gadm3.6/shp/'
        'gadm36_ALA_shp.zip',
    ],
    ids=['url_zip', 'url_2_shp_zip', 'url_2_shp_extract_one_from_zip'],
)
@pytest.mark.parametrize('strategy', ['simplecache', 'filecache'])
def test_different_cachings_and_url(url, strategy):
    """Read each remote shapefile URL through each fsspec caching protocol.

    The cache directory must be absent before the read and present after it.
    """
    cache_options = {'same_names': True, 'cache_storage': 'tempfile'}
    source = ShapefileSource(
        f'{strategy}::{url}',
        use_fsspec=True,
        storage_options={strategy: cache_options},
    )
    cache_dir = source.storage_options[strategy]['cache_storage']

    # Start from a clean slate so the existence check below is meaningful.
    try_clean_cache(source)
    assert not os.path.exists(cache_dir)

    source.read()
    assert os.path.exists(cache_dir)

    # Leave no cache behind for the next parametrized case.
    try_clean_cache(source)


@pytest.mark.parametrize('same_names', [False, True])
def test_same_name_required_else_error(same_names):
"""Check that reading a cached zip of shapefiles needs ``same_names=True``.

With ``same_names=False`` the read is expected to raise a ``ValueError`` whose
message mentions ``same_names=True``; with ``same_names=True`` the read is
expected to succeed.

NOTE(review): the earlier docstring also said a warning is emitted during init
when ``same_names`` is False; nothing in this test checks for one -- confirm.
"""
# Chained fsspec URL: cache locally every member of the remote zip archive.
ShapefileSource_args = {
'urlpath': 'simplecache::zip://*::http://maps.tnc.org/files/shp/MEOW-TNC.zip',
'use_fsspec': True,
'storage_options': {
# NOTE(review): 'tmpfile' here vs 'tempfile' in the other tests; presumably
# just a relative cache directory name -- confirm it is intentional.
'simplecache': {'same_names': same_names, 'cache_storage': 'tmpfile'}
},
}
item = ShapefileSource(**ShapefileSource_args)
expected_location_on_disk = item.storage_options['simplecache']['cache_storage']
# Remove any leftover cache so the existence checks below are meaningful.
try_clean_cache(item)
assert not os.path.exists(expected_location_on_disk)
if not same_names:
# fiona expects paths ending with '.zip' or '.shp'
with pytest.raises(ValueError, match="same_names=True"):
item.read()
else:
item.read()
assert os.path.exists(expected_location_on_disk)
# Clean up and verify the cache directory is gone.
try_clean_cache(item)
assert not os.path.exists(expected_location_on_disk)


@pytest.fixture
def GeoJSONSource_countries_remote():
    """Build a GeoJSONSource for the remote countries file, read via simplecache."""
    return GeoJSONSource(
        urlpath=(
            'simplecache::https://raw.githubusercontent.com/intake/'
            'intake_geopandas/master/tests/data/countries.geo.json'
        ),
        use_fsspec=True,
        storage_options={'simplecache': {'cache_storage': 'tempfile'}},
    )

@pytest.mark.skipif(
    not geopandas_version_allows_fsspec_caching,
    reason='requires geopandas release after 0.8.0',
)
@pytest.mark.parametrize('same_names', [False, True])
def test_remote_GeoJSONSource(GeoJSONSource_countries_remote, same_names):
    """GeoJSONSource works with either `same_names` True or False."""
    source = GeoJSONSource_countries_remote
    simplecache_options = source.storage_options['simplecache']
    simplecache_options['same_names'] = same_names
    cache_dir = simplecache_options['cache_storage']

    # The cache directory must not exist before the read...
    try_clean_cache(source)
    assert not os.path.exists(cache_dir)

    # ...must be created by the read...
    source.read()
    assert os.path.exists(cache_dir)

    # ...and must be removable afterwards.
    try_clean_cache(source)
    assert not os.path.exists(cache_dir)

0 comments on commit d5de6c5

Please sign in to comment.