
Commit

Merge pull request #2 from ian-r-rose/multiple-input-sources
Multiple input sources
jacobtomlinson authored Apr 11, 2019
2 parents 792eae5 + dc2629e commit 42b31d8
Showing 9 changed files with 412 additions and 74 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -6,8 +6,12 @@ intake_geopandas: [Geopandas](http://geopandas.org/) plugin for [Intake](https:/

See [Intake docs](https://intake.readthedocs.io/en/latest/overview.html).

In `intake_geopandas`, there are plugins provided for reading manifests into a geopandas dataframe
- Shape Files
In `intake_geopandas`, there are plugins provided for reading geospatial datasets into a geopandas dataframe.
It currently supports reading from the following data sources:
- GeoJSON files
- PostGIS databases
- ESRI Shapefiles
- Spatialite databases

### Installation

15 changes: 12 additions & 3 deletions intake_geopandas/__init__.py
@@ -3,6 +3,15 @@
__version__ = get_versions()['version']
del get_versions

from .geopandas import ShapeSource

import intake.container
from .geopandas import (
GeoJSONSource,
PostGISSource,
ShapefileSource,
SpatiaLiteSource
)
__all__ = [
'GeoJSONSource',
'PostGISSource',
'ShapefileSource',
'SpatiaLiteSource'
]
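
Not part of the diff, but for orientation: a minimal usage sketch of the file-based sources exported above. The Intake `read()` call and the pass-through to `geopandas.read_file` follow the docstrings added in `geopandas.py` below; the file paths, bounding box, and encoding are placeholders.

```python
from intake_geopandas import GeoJSONSource, ShapefileSource

# Read a whole shapefile into a GeoDataFrame via the standard Intake read().
states = ShapefileSource('data/states.shp').read()

# Restrict a GeoJSON read to a bounding box (minx, miny, maxx, maxy) and
# forward extra keyword arguments to geopandas.read_file.
countries = GeoJSONSource(
    'data/countries.geo.json',
    bbox=(-10.0, 35.0, 30.0, 60.0),
    geopandas_kwargs={'encoding': 'utf-8'},
).read()
print(countries.head())
```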
2 changes: 1 addition & 1 deletion intake_geopandas/_version.py
@@ -291,7 +291,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
if not mo:
# unparseable. Maybe git-describe is misbehaving?
pieces["error"] = (
"unable to parse git-describe output: '%s'" % describe_out)
"unable to parse git-describe output: '%s'" % describe_out)
return pieces

# tag
131 changes: 96 additions & 35 deletions intake_geopandas/geopandas.py
@@ -1,52 +1,30 @@
# -*- coding: utf-8 -*-
from . import __version__
from abc import ABC, abstractmethod

from intake.source.base import DataSource, Schema
import geopandas

import json
import dask.dataframe as dd
from datetime import datetime, timedelta
from . import __version__


class ShapeSource(DataSource):
"""Shape file intake source"""
name = 'shape'
class GeoPandasSource(DataSource, ABC):
"""
Base class intake source for loading GeoDataFrames.
"""
version = __version__
container = 'dataframe'
partition_access = True

def __init__(self, urlpath, bbox=None, geopandas_kwargs=None, metadata=None):
@abstractmethod
def _open_dataset(self):
"""
Parameters
----------
urlpath : str or iterable, location of data
Either the absolute or relative path to the file or URL to be opened.
Some examples:
- ``{{ CATALOG_DIR }}data/states.shp``
- ``http://some.domain.com/data/dtates.shp``
bbox : tuple | GeoDataFrame or GeoSeries, default None
Filter features by given bounding box, GeoSeries, or GeoDataFrame.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
geopandas_kwargs : dict
Any further arguments to pass to geopandas's read_file.
Open dataset using geopandas and use pattern fields to set new columns.
"""
self.urlpath = urlpath
self._bbox = bbox
self._geopandas_kwargs = geopandas_kwargs or {}
self._dataframe = None

super(ShapeSource, self).__init__(metadata=metadata)

def _open_dataset(self, urlpath):
"""Open dataset using geopandas and use pattern fields to set new columns
"""
import geopandas

self._dataframe = geopandas.read_file(
urlpath, bbox=self._bbox, **self._geopandas_kwargs)
raise NotImplementedError('GeoPandasSource is an abstract class')

def _get_schema(self):
if self._dataframe is None:
self._open_dataset(self.urlpath)
self._open_dataset()

dtypes = self._dataframe.dtypes.to_dict()
dtypes = {n: str(t) for (n, t) in dtypes.items()}
@@ -69,3 +47,86 @@ def to_dask(self):

def _close(self):
self._dataframe = None


class GeoPandasFileSource(GeoPandasSource):
def __init__(self, urlpath, bbox=None,
geopandas_kwargs=None, metadata=None):
"""
Parameters
----------
urlpath : str or iterable, location of data
Either the absolute or relative path to the file or URL to be
opened. Some examples:
- ``{{ CATALOG_DIR }}data/states.shp``
- ``http://some.domain.com/data/states.geo.json``
bbox : tuple | GeoDataFrame or GeoSeries, default None
Filter features by given bounding box, GeoSeries, or GeoDataFrame.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
geopandas_kwargs : dict
Any further arguments to pass to geopandas's read_file function.
"""
self.urlpath = urlpath
self._bbox = bbox
self._geopandas_kwargs = geopandas_kwargs or {}
self._dataframe = None

super().__init__(metadata=metadata)

def _open_dataset(self):
"""
Open dataset using geopandas and use pattern fields to set new columns.
"""
self._dataframe = geopandas.read_file(
self.urlpath, bbox=self._bbox, **self._geopandas_kwargs)


class GeoJSONSource(GeoPandasFileSource):
name = "geojson"


class ShapefileSource(GeoPandasFileSource):
name = "shapefile"


class GeoPandasSQLSource(GeoPandasSource):
def __init__(self, uri, sql_expr=None, table=None,
geopandas_kwargs=None, metadata=None):
"""
Parameters
----------
uri : str
The connection string for the PostGIS database.
sql_expr: str, optional
The SQL expression used to load from the database.
Must include either `sql_expr` or `table`.
table: str, optional
The table to load from the database.
This is ignored if `sql_expr` is provided.
geopandas_kwargs : dict
Any further arguments to pass to geopandas's read_postgis function.
"""
self.uri = uri
if sql_expr:
self.sql_expr = sql_expr
elif table:
self.sql_expr = f"SELECT * FROM {table}"
else:
raise ValueError("Must provide either a sql_expr or a table")

self._geopandas_kwargs = geopandas_kwargs or {}
self._dataframe = None

super().__init__(metadata=metadata)

def _open_dataset(self):
self._dataframe = geopandas.read_postgis(
self.sql_expr, self.uri, **self._geopandas_kwargs)


class PostGISSource(GeoPandasSQLSource):
name = "postgis"


class SpatiaLiteSource(GeoPandasSQLSource):
name = "spatialite"
5 changes: 1 addition & 4 deletions requirements.txt
@@ -1,6 +1,3 @@
intake
dask
requests
pandas
scipy
geopandas
pytest
182 changes: 182 additions & 0 deletions tests/data/countries.geo.json

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions tests/test_file_source.py
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
import os
import pytest

from intake_geopandas import GeoJSONSource, ShapefileSource


@pytest.fixture
def shape_filenames():
basedir = os.path.dirname(__file__)
return dict(
stations=os.path.join(basedir, 'data', 'stations', 'stations.shp')
)


@pytest.fixture
def shape_datasource(shape_filenames):
return ShapefileSource(shape_filenames['stations'])


@pytest.fixture
def geojson_filenames():
basedir = os.path.dirname(__file__)
return dict(countries=os.path.join(basedir, 'data', 'countries.geo.json'))


@pytest.fixture
def geojson_datasource(geojson_filenames):
return GeoJSONSource(geojson_filenames['countries'])


def test_shape_datasource(shape_datasource):
info = shape_datasource.discover()

assert info['dtype'] == {'name': 'object',
'marker-col': 'object',
'marker-sym': 'object',
'line': 'object',
'geometry': 'object'}


def test_countries_datasource(geojson_datasource):
info = geojson_datasource.discover()
print(info)
29 changes: 0 additions & 29 deletions tests/test_source.py

This file was deleted.

70 changes: 70 additions & 0 deletions tests/test_sql_source.py
@@ -0,0 +1,70 @@
"""
Tests for loading data from SQL data sources.
Modified from those in the GeoPandas test suite.
In order to run, SpatiaLite must be installed and configured.
"""
import pytest

import geopandas
from geopandas import read_file

from intake_geopandas import SpatiaLiteSource



@pytest.fixture
def df_nybb():
nybb_path = geopandas.datasets.get_path('nybb')
df = read_file(nybb_path)
return df


# Expected to fail until there is a geopandas release
# with geopandas/geopandas#856
@pytest.mark.xfail
def test_read_spatialite_null_geom(df_nybb):
"""Tests that geometry with NULL is accepted."""
try:
from geopandas.tests.util import (
connect_spatialite, create_spatialite, validate_boro_df
)
con = connect_spatialite()
except Exception:
raise pytest.skip()
else:
geom_col = df_nybb.geometry.name
df_nybb.geometry.iat[0] = None
create_spatialite(con, df_nybb)
sql = ('SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, '
'AsEWKB("{0}") AS "{0}" FROM nybb').format(geom_col)
df = SpatiaLiteSource(con, sql_expr=sql, geopandas_kwargs={
'geom_col': geom_col}).read()
validate_boro_df(df)
finally:
if 'con' in locals():
con.close()

# Expected to fail until there is a geopandas release
# with geopandas/geopandas#856
@pytest.mark.xfail
def test_read_spatialite_binary(df_nybb):
"""Tests that geometry read as binary is accepted."""
try:
from geopandas.tests.util import (
connect_spatialite, create_spatialite, validate_boro_df
)
con = connect_spatialite()
except Exception:
raise pytest.skip()
else:
geom_col = df_nybb.geometry.name
create_spatialite(con, df_nybb)
sql = ('SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, '
'ST_AsBinary("{0}") AS "{0}" FROM nybb').format(geom_col)
df = SpatiaLiteSource(con, sql_expr=sql, geopandas_kwargs={
'geom_col': geom_col}).read()
validate_boro_df(df)
finally:
if 'con' in locals():
con.close()
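
Because all loading now goes through the abstract `_open_dataset` hook, adding another backend is mostly a matter of subclassing. The sketch below is hypothetical and not part of this pull request: the `GeoPackageLayerSource` name and its `layer` parameter are illustrative only.

```python
import geopandas

from intake_geopandas.geopandas import GeoPandasFileSource


class GeoPackageLayerSource(GeoPandasFileSource):
    """Illustrative source reading one layer of a multi-layer vector file."""
    name = 'geopackage'

    def __init__(self, urlpath, layer=None, **kwargs):
        self._layer = layer
        super().__init__(urlpath, **kwargs)

    def _open_dataset(self):
        # geopandas.read_file forwards `layer` (and other kwargs) to fiona.
        self._dataframe = geopandas.read_file(
            self.urlpath, bbox=self._bbox,
            layer=self._layer, **self._geopandas_kwargs)
```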

