
Commit

Merge pull request #2 from ian-r-rose/multiple-input-sources
Multiple input sources
jacobtomlinson authored Apr 11, 2019
2 parents 792eae5 + dc2629e commit 42b31d8
Showing 9 changed files with 412 additions and 74 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -6,8 +6,12 @@ intake_geopandas: [Geopandas](http://geopandas.org/) plugin for [Intake](https:/

See [Intake docs](https://intake.readthedocs.io/en/latest/overview.html).

In `intake_geopandas`, there are plugins provided for reading manifests into a geopandas dataframe
- Shape Files
In `intake_geopandas`, there are plugins provided for reading geospatial datasets into a geopandas dataframe.
It currently supports reading from the following data sources:
- GeoJSON files
- PostGIS databases
- ESRI Shapefiles
- Spatialite databases

### Installation

15 changes: 12 additions & 3 deletions intake_geopandas/__init__.py
@@ -3,6 +3,15 @@
__version__ = get_versions()['version']
del get_versions

from .geopandas import ShapeSource

import intake.container
from .geopandas import (
GeoJSONSource,
PostGISSource,
ShapefileSource,
SpatiaLiteSource
)
__all__ = [
'GeoJSONSource',
'PostGISSource',
'ShapefileSource',
'SpatiaLiteSource'
]
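
Not part of the diff, but for orientation: a minimal usage sketch of the file-based sources exported above. The Intake `read()` call and the pass-through to `geopandas.read_file` follow the docstrings added in `geopandas.py` below; the file paths, bounding box, and encoding are placeholders.

```python
from intake_geopandas import GeoJSONSource, ShapefileSource

# Read a whole shapefile into a GeoDataFrame via the standard Intake read().
states = ShapefileSource('data/states.shp').read()

# Restrict a GeoJSON read to a bounding box (minx, miny, maxx, maxy) and
# forward extra keyword arguments to geopandas.read_file.
countries = GeoJSONSource(
    'data/countries.geo.json',
    bbox=(-10.0, 35.0, 30.0, 60.0),
    geopandas_kwargs={'encoding': 'utf-8'},
).read()
print(countries.head())
```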
2 changes: 1 addition & 1 deletion intake_geopandas/_version.py
@@ -291,7 +291,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
if not mo:
# unparseable. Maybe git-describe is misbehaving?
pieces["error"] = (
"unable to parse git-describe output: '%s'" % describe_out)
"unable to parse git-describe output: '%s'" % describe_out)
return pieces

# tag
131 changes: 96 additions & 35 deletions intake_geopandas/geopandas.py
@@ -1,52 +1,30 @@
# -*- coding: utf-8 -*-
from . import __version__
from abc import ABC, abstractmethod

from intake.source.base import DataSource, Schema
import geopandas

import json
import dask.dataframe as dd
from datetime import datetime, timedelta
from . import __version__


class ShapeSource(DataSource):
"""Shape file intake source"""
name = 'shape'
class GeoPandasSource(DataSource, ABC):
"""
Base class intake source for loading GeoDataFrames.
"""
version = __version__
container = 'dataframe'
partition_access = True

def __init__(self, urlpath, bbox=None, geopandas_kwargs=None, metadata=None):
@abstractmethod
def _open_dataset(self):
"""
Parameters
----------
urlpath : str or iterable, location of data
Either the absolute or relative path to the file or URL to be opened.
Some examples:
- ``{{ CATALOG_DIR }}data/states.shp``
- ``http://some.domain.com/data/dtates.shp``
bbox : tuple | GeoDataFrame or GeoSeries, default None
Filter features by given bounding box, GeoSeries, or GeoDataFrame.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
geopandas_kwargs : dict
Any further arguments to pass to geopandas's read_file.
Open dataset using geopandas and use pattern fields to set new columns.
"""
self.urlpath = urlpath
self._bbox = bbox
self._geopandas_kwargs = geopandas_kwargs or {}
self._dataframe = None

super(ShapeSource, self).__init__(metadata=metadata)

def _open_dataset(self, urlpath):
"""Open dataset using geopandas and use pattern fields to set new columns
"""
import geopandas

self._dataframe = geopandas.read_file(
urlpath, bbox=self._bbox, **self._geopandas_kwargs)
raise NotImplementedError('GeoPandasSource is an abstract class')

def _get_schema(self):
if self._dataframe is None:
self._open_dataset(self.urlpath)
self._open_dataset()

dtypes = self._dataframe.dtypes.to_dict()
dtypes = {n: str(t) for (n, t) in dtypes.items()}
@@ -69,3 +47,86 @@ def to_dask(self):

def _close(self):
self._dataframe = None


class GeoPandasFileSource(GeoPandasSource):
def __init__(self, urlpath, bbox=None,
geopandas_kwargs=None, metadata=None):
"""
Parameters
----------
urlpath : str or iterable, location of data
Either the absolute or relative path to the file or URL to be
opened. Some examples:
- ``{{ CATALOG_DIR }}data/states.shp``
- ``http://some.domain.com/data/states.geo.json``
bbox : tuple | GeoDataFrame or GeoSeries, default None
Filter features by given bounding box, GeoSeries, or GeoDataFrame.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
geopandas_kwargs : dict
Any further arguments to pass to geopandas's read_file function.
"""
self.urlpath = urlpath
self._bbox = bbox
self._geopandas_kwargs = geopandas_kwargs or {}
self._dataframe = None

super().__init__(metadata=metadata)

def _open_dataset(self):
"""
Open dataset using geopandas and use pattern fields to set new columns.
"""
self._dataframe = geopandas.read_file(
self.urlpath, bbox=self._bbox, **self._geopandas_kwargs)


class GeoJSONSource(GeoPandasFileSource):
name = "geojson"


class ShapefileSource(GeoPandasFileSource):
name = "shapefile"


class GeoPandasSQLSource(GeoPandasSource):
def __init__(self, uri, sql_expr=None, table=None,
geopandas_kwargs=None, metadata=None):
"""
Parameters
----------
uri : str
The connection string for the PostGIS database.
sql_expr: str, optional
The SQL expression used to load from the database.
Must include either `sql_expr` or `table`.
table: str, optional
The table to load from the database.
This is ignored if `sql_expr` is provided.
geopandas_kwargs : dict
Any further arguments to pass to geopandas's read_postgis function.
"""
self.uri = uri
if sql_expr:
self.sql_expr = sql_expr
elif table:
self.sql_expr = f"SELECT * FROM {table}"
else:
raise ValueError("Must provide either a sql_expr or a table")

self._geopandas_kwargs = geopandas_kwargs or {}
self._dataframe = None

super().__init__(metadata=metadata)

def _open_dataset(self):
self._dataframe = geopandas.read_postgis(
self.sql_expr, self.uri, **self._geopandas_kwargs)


class PostGISSource(GeoPandasSQLSource):
name = "postgis"


class SpatiaLiteSource(GeoPandasSQLSource):
name = "spatialite"
5 changes: 1 addition & 4 deletions requirements.txt
@@ -1,6 +1,3 @@
intake
dask
requests
pandas
scipy
geopandas
pytest
182 changes: 182 additions & 0 deletions tests/data/countries.geo.json

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions tests/test_file_source.py
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
import os
import pytest

from intake_geopandas import GeoJSONSource, ShapefileSource


@pytest.fixture
def shape_filenames():
basedir = os.path.dirname(__file__)
return dict(
stations=os.path.join(basedir, 'data', 'stations', 'stations.shp')
)


@pytest.fixture
def shape_datasource(shape_filenames):
return ShapefileSource(shape_filenames['stations'])


@pytest.fixture
def geojson_filenames():
basedir = os.path.dirname(__file__)
return dict(countries=os.path.join(basedir, 'data', 'countries.geo.json'))


@pytest.fixture
def geojson_datasource(geojson_filenames):
return GeoJSONSource(geojson_filenames['countries'])


def test_shape_datasource(shape_datasource):
info = shape_datasource.discover()

assert info['dtype'] == {'name': 'object',
'marker-col': 'object',
'marker-sym': 'object',
'line': 'object',
'geometry': 'object'}


def test_countries_datasource(geojson_datasource):
info = geojson_datasource.discover()
print(info)
29 changes: 0 additions & 29 deletions tests/test_source.py

This file was deleted.

70 changes: 70 additions & 0 deletions tests/test_sql_source.py
@@ -0,0 +1,70 @@
"""
Tests for loading data from SQL data sources.
Modified from those in the GeoPandas test suite.
In order to run, SpatiaLite must be installed and configured.
"""
import pytest

import geopandas
from geopandas import read_file

from intake_geopandas import SpatiaLiteSource



@pytest.fixture
def df_nybb():
nybb_path = geopandas.datasets.get_path('nybb')
df = read_file(nybb_path)
return df


# Expected to fail until there is a geopandas release
# with geopandas/geopandas#856
@pytest.mark.xfail
def test_read_spatialite_null_geom(df_nybb):
"""Tests that geometry with NULL is accepted."""
try:
from geopandas.tests.util import (
connect_spatialite, create_spatialite, validate_boro_df
)
con = connect_spatialite()
except Exception:
raise pytest.skip()
else:
geom_col = df_nybb.geometry.name
df_nybb.geometry.iat[0] = None
create_spatialite(con, df_nybb)
sql = ('SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, '
'AsEWKB("{0}") AS "{0}" FROM nybb').format(geom_col)
df = SpatiaLiteSource(con, sql_expr=sql, geopandas_kwargs={
'geom_col': geom_col}).read()
validate_boro_df(df)
finally:
if 'con' in locals():
con.close()

# Expected to fail until there is a geopandas release
# with geopandas/geopandas#856
@pytest.mark.xfail
def test_read_spatialite_binary(df_nybb):
"""Tests that geometry read as binary is accepted."""
try:
from geopandas.tests.util import (
connect_spatialite, create_spatialite, validate_boro_df
)
con = connect_spatialite()
except Exception:
raise pytest.skip()
else:
geom_col = df_nybb.geometry.name
create_spatialite(con, df_nybb)
sql = ('SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, '
'ST_AsBinary("{0}") AS "{0}" FROM nybb').format(geom_col)
df = SpatiaLiteSource(con, sql_expr=sql, geopandas_kwargs={
'geom_col': geom_col}).read()
validate_boro_df(df)
finally:
if 'con' in locals():
con.close()
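
Because all loading now goes through the abstract `_open_dataset` hook, adding another backend is mostly a matter of subclassing. The sketch below is hypothetical and not part of this pull request: the `GeoPackageLayerSource` name and its `layer` parameter are illustrative only.

```python
import geopandas

from intake_geopandas.geopandas import GeoPandasFileSource


class GeoPackageLayerSource(GeoPandasFileSource):
    """Illustrative source reading one layer of a multi-layer vector file."""
    name = 'geopackage'

    def __init__(self, urlpath, layer=None, **kwargs):
        self._layer = layer
        super().__init__(urlpath, **kwargs)

    def _open_dataset(self):
        # geopandas.read_file forwards `layer` (and other kwargs) to fiona.
        self._dataframe = geopandas.read_file(
            self.urlpath, bbox=self._bbox,
            layer=self._layer, **self._geopandas_kwargs)
```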

