From 14b020c770c31420902ebb5291f103950e90ba1a Mon Sep 17 00:00:00 2001 From: Wei Ji Date: Thu, 20 Aug 2020 12:58:23 +1200 Subject: [PATCH] :sparkles: GPU accelerated point in polygon using cuspatial A very fast way to find points inside polygons! This is really just a convenience function that wraps around `cuspatial.point_in_polygon`, hiding all sorts of boilerplate. Specifically, this handles: 1. Converting a geopandas geodataframe into a cuspatial friendly format, see https://github.com/rapidsai/cuspatial/issues/165 2. Hacky workaround for the 31 polygon limit using a for-loop, based on https://github.com/rapidsai/cuspatial/blob/branch-0.15/notebooks/nyc_taxi_years_correlation.ipynb 3. Outputting actual string labels from the geodataframe, instead of non-human-readable index numbers Also added tests for this in test_spatiotemporal_gpu.py, though they won't work on the CI, only locally where a GPU is available. --- deepicedrain/__init__.py | 12 ++- deepicedrain/spatiotemporal.py | 90 ++++++++++++++++++- .../tests/test_spatiotemporal_conversions.py | 2 +- deepicedrain/tests/test_spatiotemporal_gpu.py | 36 ++++++++ 4 files changed, 134 insertions(+), 6 deletions(-) create mode 100644 deepicedrain/tests/test_spatiotemporal_gpu.py diff --git a/deepicedrain/__init__.py b/deepicedrain/__init__.py index 60c15a4..c710f41 100644 --- a/deepicedrain/__init__.py +++ b/deepicedrain/__init__.py @@ -1,12 +1,16 @@ import importlib.resources import logging -import intake - import deepicedrain -from deepicedrain.deltamath import calculate_delta, nanptp, nan_linregress +import intake +from deepicedrain.deltamath import calculate_delta, nan_linregress, nanptp from deepicedrain.extraload import array_to_dataframe -from deepicedrain.spatiotemporal import Region, deltatime_to_utctime, lonlat_to_xy +from deepicedrain.spatiotemporal import ( + Region, + deltatime_to_utctime, + lonlat_to_xy, + point_in_polygon_gpu, +) __version__: str = "0.2.1" diff --git a/deepicedrain/spatiotemporal.py 
b/deepicedrain/spatiotemporal.py index fe9d4a8..c1bd3a7 100644 --- a/deepicedrain/spatiotemporal.py +++ b/deepicedrain/spatiotemporal.py @@ -4,13 +4,17 @@ """ import dataclasses import datetime +import os +import tempfile -import datashader +import geopandas as gpd import numpy as np import pandas as pd import pyproj import xarray as xr +import datashader + @dataclasses.dataclass(frozen=True) class Region: @@ -140,3 +144,87 @@ def lonlat_to_xy( ) else: return x, y + + +def point_in_polygon_gpu( + points_df, # cudf.DataFrame with x and y columns of point coordinates + poly_df: gpd.GeoDataFrame, # geopandas.GeoDataFrame with polygon shapes + points_x_col: str = "x", + points_y_col: str = "y", + poly_label_col: str = None, +): + """ + Find polygon labels for each of the input points. + This is a GPU accelerated version that requires cuspatial! + + Parameters + ---------- + points_df : cudf.DataFrame + A dataframe in GPU memory containing the x and y coordinates. + points_x_col : str + Name of the x coordinate column in points_df. Default is "x". + points_y_col : str + Name of the y coordinate column in points_df. Default is "y". + + poly_df : geopandas.GeoDataFrame + A geodataframe in CPU memory containing polygons geometries in each + row. + poly_label_col : str + Name of the column in poly_df that will be used to label the points, + e.g. "placename". Default is to automatically use the first column + unless otherwise specified. + + Returns + ------- + point_labels : cudf.Series + A column of labels that indicates which polygon the points fall into. 
+ + """ + import cudf + import cuspatial + + poly_df_: gpd.GeoDataFrame = poly_df.reset_index() + if poly_label_col is None: + # Simply use first column of geodataframe as label if not provided + poly_label_col: str = poly_df.columns[0] + point_labels: cudf.Series = cudf.Series(index=points_df.index).astype( + poly_df[poly_label_col].dtype + ) + + # Load CPU-based GeoDataFrame into a GPU-based cuspatial friendly format + # This is a workaround until the related feature request at + # https://github.com/rapidsai/cuspatial/issues/165 is implemented + with tempfile.TemporaryDirectory() as tmpdir: + # Save geodataframe to a temporary shapefile, + # so that we can load it into GPU memory using cuspatial + tmpshpfile = os.path.join(tmpdir, "poly_df.shp") + poly_df_.to_file(filename=tmpshpfile, driver="ESRI Shapefile") + + # Load polygon_offsets, ring_offsets and polygon xy points + # from temporary shapefile into GPU memory + poly_offsets, poly_ring_offsets, poly_points = cuspatial.read_polygon_shapefile( + filename=tmpshpfile + ) + + # Run the actual point in polygon algorithm! + # Note that cuspatial's point_in_polygon function has a 31 polygon limit, + # hence the for-loop code below. 
See also + # https://github.com/rapidsai/cuspatial/blob/branch-0.15/notebooks/nyc_taxi_years_correlation.ipynb + num_poly: int = len(poly_df_) + point_in_poly_iter: list = list(np.arange(0, num_poly, 31)) + [num_poly] + for i in range(len(point_in_poly_iter) - 1): + start, end = point_in_poly_iter[i], point_in_poly_iter[i + 1] + poly_labels: cudf.DataFrame = cuspatial.point_in_polygon( + test_points_x=points_df[points_x_col], + test_points_y=points_df[points_y_col], + poly_offsets=poly_offsets[start:end], + poly_ring_offsets=poly_ring_offsets, + poly_points_x=poly_points.x, + poly_points_y=poly_points.y, + ) + + # Label each point with polygon they fall in + for label in poly_labels.columns: + point_labels.loc[poly_labels[label]] = poly_df_.loc[label][poly_label_col] + + return point_labels diff --git a/deepicedrain/tests/test_spatiotemporal_conversions.py b/deepicedrain/tests/test_spatiotemporal_conversions.py index 7d1cace..c844f01 100644 --- a/deepicedrain/tests/test_spatiotemporal_conversions.py +++ b/deepicedrain/tests/test_spatiotemporal_conversions.py @@ -3,12 +3,12 @@ """ import datetime -import dask import numpy as np import numpy.testing as npt import pandas as pd import xarray as xr +import dask from deepicedrain import catalog, deltatime_to_utctime, lonlat_to_xy diff --git a/deepicedrain/tests/test_spatiotemporal_gpu.py b/deepicedrain/tests/test_spatiotemporal_gpu.py new file mode 100644 index 0000000..696d8c6 --- /dev/null +++ b/deepicedrain/tests/test_spatiotemporal_gpu.py @@ -0,0 +1,36 @@ +""" +Tests GPU accelerated spatial algorithms +""" + +import geopandas as gpd +import numpy as np +import pytest +import shapely.geometry + +from deepicedrain import point_in_polygon_gpu + +cudf = pytest.importorskip(modname="cudf") + + +def test_point_in_polygon_gpu(): + """ + Tests that the Point in Polygon GPU algorithm works + """ + points_df: cudf.DataFrame = cudf.DataFrame( + data={ + "x": np.linspace(start=-200, stop=200, num=50), + "y": 
np.linspace(start=-160, stop=160, num=50), + } + ) + polygon = { + "placename": ["South Pole"], + "geometry": shapely.geometry.box(minx=-5, maxx=5, miny=-5, maxy=5).buffer(100), + } + poly_df: gpd.GeoDataFrame = gpd.GeoDataFrame(polygon) + + point_labels = point_in_polygon_gpu( + points_df=points_df, poly_df=poly_df, poly_label_col="placename" + ) + assert isinstance(point_labels, cudf.Series) + assert point_labels.count() == 20 # Count non-NaN labels + assert list(point_labels.unique().to_pandas()) == [None, "South Pole"]