Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pyogrio io #583

Merged
merged 12 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ Model
Vector
^^^^^^
- ``vector.GeoDataset.from_gdf`` can use the gdf columns as data_vars instead of external xarray. (PR #412)
- ``io.open_vector`` can now use ``pyogrio`` when reading from a non-tabular dataset. (PR #583)

Fixed
-----
Expand Down
50 changes: 50 additions & 0 deletions hydromt/gis_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
import glob
import logging
import os
from io import IOBase
from os.path import dirname
from typing import Optional, Tuple

import geopandas as gpd
import numpy as np
import xarray as xr
from pyflwdir import gis_utils as gis
from pyogrio import read_info
from pyproj import CRS
from pyproj.transformer import Transformer
from rasterio.transform import Affine
Expand Down Expand Up @@ -65,6 +67,9 @@
}
GDAL_EXT_CODE_MAP = {v: k for k, v in GDAL_DRIVER_CODE_MAP.items()}

# Type aliases for spatial filter inputs: GPD_TYPES covers the geopandas
# containers, GEOM_TYPES additionally accepts a bare shapely geometry.
# NOTE(review): the `|` between classes is evaluated at import time, which
# presumably requires Python >= 3.10 - confirm against the supported versions.
GPD_TYPES = gpd.GeoDataFrame | gpd.GeoSeries
GEOM_TYPES = GPD_TYPES | BaseGeometry
Jaapel marked this conversation as resolved.
Show resolved Hide resolved

## GEOM functions


Expand Down Expand Up @@ -553,3 +558,48 @@ def to_geographic_bbox(bbox, source_crs):
bbox = Transformer.from_crs(source_crs, target_crs).transform_bounds(*bbox)

return bbox


def bbox_from_file_and_filters(
    fn: IOBase,
    bbox: GEOM_TYPES | None = None,
    mask: GEOM_TYPES | None = None,
    crs: CRS | None = None,
) -> Tuple[float, float, float, float] | None:
    """Create a bbox from the file metadata and filter options.

    Pyogrio does not accept a mask, and requires a bbox in the same CRS as the data.
    This function takes the possible bbox filter, mask filter and crs of the input data
    and returns a bbox in the same crs as the data based on the input filters.
    As pyogrio currently does not support filtering using a mask, the mask is converted
    to a bbox and the bbox is returned so that the data has some geospatial filtering.

    Parameters
    ----------
    fn: IOBase,
        opened file (passed as-is to ``pyogrio.read_info``)
    bbox: GeoDataFrame | GeoSeries | BaseGeometry
        bounding box to filter the data while reading
    mask: GeoDataFrame | GeoSeries | BaseGeometry
        mask to filter the data while reading
    crs: pyproj.CRS
        coordinate reference system of the bounding box or geometry. If already set,
        this argument is ignored.

    Returns
    -------
    tuple of float or None
        Total bounds of the filter geometry, expressed in the CRS of the
        source data, or None if neither ``bbox`` nor ``mask`` is given.

    Raises
    ------
    ValueError
        If both ``bbox`` and ``mask`` are provided.
    """
    if bbox is not None and mask is not None:
        raise ValueError(
            "Both 'bbox' and 'mask' are provided. Please provide only one."
        )
    if bbox is None and mask is None:
        return None

    # The "crs" entry can be present but None for data without a CRS, so a
    # dict default is not enough: fall back explicitly and assume WGS84.
    source_crs = CRS(read_info(fn).get("crs") or "EPSG:4326")

    # A mask can only be applied through its bounding box.
    geom = mask if mask is not None else bbox

    # CRS of the filter geometry (assume WGS84 if not provided)
    filter_crs = crs if crs is not None else CRS.from_user_input(4326)
    if isinstance(geom, BaseGeometry):
        # bare shapely geometries carry no CRS; wrap them so they can be reprojected
        geom = gpd.GeoSeries(geom, crs=filter_crs)
    elif geom.crs is None:
        geom = geom.set_crs(filter_crs)
    return tuple(geom.to_crs(source_crs).total_bounds)
29 changes: 21 additions & 8 deletions hydromt/io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""Implementations for all of the necessary IO for HydroMT."""
"""Implementations for all of the necessary IO for HydroMT."""
import glob
import io
import io as pyio
import logging
from os.path import abspath, basename, dirname, isfile, join, splitext
from pathlib import Path
from typing import Any, Dict, Literal, Optional, Union

import dask
import fsspec
import geopandas as gpd
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -68,7 +69,7 @@ def open_raster(
kwargs.update(masked=mask_nodata, default_name="data", chunks=chunks)
if not mask_nodata: # if mask_and_scale by default True in xarray ?
kwargs.update(mask_and_scale=False)
if isinstance(filename, io.IOBase): # file-like does not handle chunks
if isinstance(filename, pyio.IOBase): # file-like does not handle chunks
logger.warning("Removing chunks to read and load remote data.")
kwargs.pop("chunks")
# keep only 2D DataArray
Expand Down Expand Up @@ -576,7 +577,7 @@ def open_vector(

Parameters
----------
fn : str
fn: str or Path-like,
path to geometry file
driver: {'csv', 'xls', 'xy', 'vector', 'parquet'}, optional
driver used to read the file: :py:func:`geopandas.read_file` for gdal vector
Expand Down Expand Up @@ -615,13 +616,25 @@ def open_vector(
gdf : geopandas.GeoDataFrame
Parsed geometry file
"""
filtered = False

def _read(f: pyio.IOBase) -> gpd.GeoDataFrame:
bbox_reader = gis_utils.bbox_from_file_and_filters(f, bbox, geom, crs)
f.seek(0)
savente93 marked this conversation as resolved.
Show resolved Hide resolved
return gpd.read_file(f, bbox=bbox_reader, mode=mode, **kwargs)

driver = driver if driver is not None else str(fn).split(".")[-1].lower()
if driver in ["csv", "parquet", "xls", "xlsx", "xy"]:
gdf = open_vector_from_table(fn, driver=driver, **kwargs)
else:
gdf = gpd.read_file(fn, bbox=bbox, mask=geom, mode=mode, **kwargs)
filtered = predicate == "intersects"
# check if pathlike
if all(
map(lambda method: hasattr(fn, method), ("seek", "close", "read", "write"))
):
Jaapel marked this conversation as resolved.
Show resolved Hide resolved
with fn.open(mode="rb") as f:
gdf = _read(f)
else:
with fsspec.open(fn, mode="rb") as f: # lose storage options here
gdf = _read(f)

# check geometry type
if assert_gtype is not None:
Expand All @@ -642,7 +655,7 @@ def open_vector(
if dst_crs is not None:
gdf = gdf.to_crs(dst_crs)
# filter points
if gdf.index.size > 0 and not filtered and (geom is not None or bbox is not None):
if gdf.index.size > 0 and (geom is not None or bbox is not None):
idx = gis_utils.filter_gdf(gdf, geom=geom, bbox=bbox, predicate=predicate)
gdf = gdf.iloc[idx, :]
return gdf
Expand Down
15 changes: 8 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,13 @@ dynamic = ['version', 'description']

[project.optional-dependencies]
io = [
"fastparquet", # parquet IO
"gcsfs", # google cloud file system
"openpyxl", # excel IO
"pillow", # image IO
"rio-vrt", # write VRT files
"s3fs", # S3 file system
"gcsfs", # google cloud file system
"openpyxl", # excel IO
"pyogrio>=0.6", # io for geopandas dataframes
"fastparquet", # parquet IO
"pillow", # image IO
"rio-vrt", # write VRT files
"s3fs", # S3 file system
]
extra = [
"matplotlib", # plotting; required for slippy tiles
Expand Down Expand Up @@ -127,7 +128,7 @@ ignore = ["D211", "D213", "E741", "D105", "E712", "B904"]
exclude = ["docs"]

[tool.ruff.per-file-ignores]
"tests/**" = ["D103", "D100", "D104"]
"tests/**" = ["D100", "D101", "D102", "D103", "D104"]
"hydromt/__init__.py" = ["E402", "F401", "F403"]
"hydromt/workflows/__init__.py" = ["F403"]
"hydromt/stats/__init__.py" = ["F403"]
Expand Down
14 changes: 11 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from os.path import abspath, dirname, join
from pathlib import Path
from tempfile import TemporaryDirectory

import geopandas as gpd
import numpy as np
Expand Down Expand Up @@ -33,6 +35,12 @@
DATADIR = join(dirname(abspath(__file__)), "data")


@pytest.fixture(scope="class")
def tmp_dir() -> Path:
    """Yield a class-scoped temporary directory as a ``Path``.

    The directory is created on first use and removed again when the
    ``TemporaryDirectory`` context exits at fixture teardown.
    """
    scratch = TemporaryDirectory()
    with scratch as dirname:
        yield Path(dirname)


@pytest.fixture()
def rioda():
return raster.full_from_transform(
Expand All @@ -56,7 +64,7 @@ def rioda_large():
return da


@pytest.fixture()
@pytest.fixture(scope="class")
Jaapel marked this conversation as resolved.
Show resolved Hide resolved
def df():
df = (
pd.DataFrame(
Expand Down Expand Up @@ -119,7 +127,7 @@ def df_time():
return df_time


@pytest.fixture()
@pytest.fixture(scope="class")
def geodf(df):
gdf = gpd.GeoDataFrame(
data=df.copy().drop(columns=["longitude", "latitude"]),
Expand All @@ -129,7 +137,7 @@ def geodf(df):
return gdf


@pytest.fixture()
@pytest.fixture(scope="session")
def world():
    """Return the Natural Earth low-resolution dataset read from DATADIR."""
    geojson_path = join(DATADIR, "naturalearth_lowres.geojson")
    return gpd.read_file(geojson_path)
Expand Down
87 changes: 87 additions & 0 deletions tests/test_gis_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
"""Tests for the hydromt.gis_utils submodule."""

import os
from pathlib import Path

import geopandas as gpd
import numpy as np
import pytest
import xarray as xr
from affine import Affine
from pyproj import CRS
from rasterio.transform import from_origin
from shapely import Polygon, box

from hydromt import gis_utils as gu
from hydromt.io import open_raster
Expand Down Expand Up @@ -127,3 +131,86 @@ def test_create_vrt(tmpdir, rioda_large):
gu.create_vrt(vrt_fn)
with pytest.raises(IOError, match="No files found at "):
gu.create_vrt(vrt_fn, files_path=os.path.join(path, "dummy_xyz", "*.abc"))


class TestBBoxFromFileAndFilters:
    """Tests for ``gis_utils.bbox_from_file_and_filters``.

    Bounds are not compared exactly. The tests only check whether the
    returned coordinates look like degrees (``abs(x) < 180``) or like
    EPSG:3857 metres (``abs(x) > 180``).
    """

    @pytest.fixture(scope="class")
    def vector_data_with_crs(self, geodf: gpd.GeoDataFrame, tmp_dir: Path) -> Path:
        """Write ``geodf`` reprojected to EPSG:3857 to a FlatGeobuf file."""
        example_data = geodf.set_crs(crs=CRS.from_user_input(4326))
        example_data.to_crs(crs=CRS.from_user_input(3857), inplace=True)
        path = tmp_dir / "test.fgb"
        example_data.to_file(path, engine="pyogrio")
        return path

    @pytest.fixture(scope="class")
    def vector_data_without_crs(self, geodf: gpd.GeoDataFrame, tmp_dir: Path) -> Path:
        """Write ``geodf`` unchanged to a GeoJSON file."""
        path = tmp_dir / "test.geojson"
        geodf.to_file(path, engine="pyogrio")
        return path

    @pytest.fixture(scope="class")
    def gdf_mask_without_crs(self, world: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """Select the row of ``world`` named "Chile" as filter geometry.

        NOTE(review): ``world`` is read from a GeoJSON file and so presumably
        already carries a CRS despite this fixture's name - confirm.
        """
        return world[world["name"] == "Chile"]

    @pytest.fixture(scope="class")
    def gdf_bbox_with_crs(
        self, gdf_mask_without_crs: gpd.GeoDataFrame
    ) -> gpd.GeoDataFrame:
        """Return the Chile frame with its CRS explicitly set to EPSG:4326."""
        return gdf_mask_without_crs.set_crs(CRS.from_user_input(4326))

    @pytest.fixture(scope="class")
    def shapely_bbox(self, gdf_mask_without_crs: gpd.GeoDataFrame) -> Polygon:
        """Return the total bounds of the Chile frame as a shapely box."""
        return box(*gdf_mask_without_crs.total_bounds)

    def test_gdf_bbox_crs_source_crs(
        self, gdf_bbox_with_crs: gpd.GeoDataFrame, vector_data_with_crs: Path
    ):
        bbox = gu.bbox_from_file_and_filters(
            vector_data_with_crs, bbox=gdf_bbox_with_crs
        )
        # assert converted to CRS of source data EPSG:3857
        assert all(abs(x) > 180 for x in bbox)

    def test_gdf_mask_no_crs_source_crs(
        self, gdf_mask_without_crs: gpd.GeoDataFrame, vector_data_with_crs: Path
    ):
        # pass the frame through ``mask`` so the mask -> bbox path is exercised
        bbox = gu.bbox_from_file_and_filters(
            vector_data_with_crs, mask=gdf_mask_without_crs
        )
        # assert converted to CRS of source data EPSG:3857
        assert all(abs(x) > 180 for x in bbox)

    def test_gdf_mask_crs_source_no_crs(
        self, gdf_mask_without_crs: gpd.GeoDataFrame, vector_data_without_crs: Path
    ):
        bbox = gu.bbox_from_file_and_filters(
            vector_data_without_crs, mask=gdf_mask_without_crs
        )
        assert all(abs(x) < 180 for x in bbox)

    def test_gdf_mask_no_crs_source_no_crs(
        self, gdf_mask_without_crs: gpd.GeoDataFrame, vector_data_without_crs: Path
    ):
        bbox = gu.bbox_from_file_and_filters(
            vector_data_without_crs, mask=gdf_mask_without_crs, crs=4326
        )
        assert all(abs(x) < 180 for x in bbox)

    def test_shapely_input(self, shapely_bbox: Polygon, vector_data_with_crs: Path):
        bbox = gu.bbox_from_file_and_filters(vector_data_with_crs, bbox=shapely_bbox)
        assert all(abs(x) > 180 for x in bbox)

    def test_does_not_filter(self, vector_data_with_crs: Path):
        bbox = gu.bbox_from_file_and_filters(vector_data_with_crs)
        assert bbox is None

    def test_raises_valueerror(
        self, vector_data_with_crs: Path, gdf_bbox_with_crs: gpd.GeoDataFrame
    ):
        with pytest.raises(
            ValueError,
            match="Both 'bbox' and 'mask' are provided. Please provide only one.",
        ):
            gu.bbox_from_file_and_filters(
                vector_data_with_crs, bbox=gdf_bbox_with_crs, mask=gdf_bbox_with_crs
            )
5 changes: 4 additions & 1 deletion tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from os.path import join
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
import pytest
Expand All @@ -16,7 +17,9 @@
from hydromt import _compat, raster


def test_open_vector(tmpdir, df, geodf, world):
@pytest.mark.parametrize("engine", ["fiona", "pyogrio"])
def test_open_vector(engine, tmpdir, df, geodf, world):
gpd.io_engine = engine
fn_csv = str(tmpdir.join("test.csv"))
fn_parquet = str(tmpdir.join("test.parquet"))
fn_xy = str(tmpdir.join("test.xy"))
Expand Down