Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better support for binary predicates with large inputs. #1166

Merged
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3e9e10b
Add files that support binary_predicates.ipynb PR.
thomcom Jun 1, 2023
3332ab9
Add large binpred tests.
thomcom Jun 1, 2023
fa9bbe6
Fix two indexing issues.
thomcom Jun 1, 2023
2117883
Fix issue with polygon.contains(linestring) not being generic enough.
thomcom Jun 1, 2023
93b3b92
Fix issue with linestring.geom_equals(linestring) having bad input_ty…
thomcom Jun 1, 2023
3af37c3
Rest of bugfixes for large datasets.
thomcom Jun 5, 2023
6dc2169
Add test_feature_groups which tests via binpred dispatch but runs eac…
thomcom Jun 5, 2023
875ad9f
Merge branch 'branch-23.08' into bug/binpred-fail-with-large-datasets
thomcom Jun 5, 2023
46cdb8c
Fix test import statement.
thomcom Jun 5, 2023
7f950dd
Clean up contains changes and comments.
thomcom Jun 5, 2023
06e8f06
Merge branch 'branch-23.08' into bug/binpred-fail-with-large-datasets
thomcom Jun 5, 2023
60391dd
Merge branch 'branch-23.08' into bug/binpred-fail-with-large-datasets
thomcom Jun 6, 2023
217a99a
Implement a couple of review suggestions and drop large tests for car…
thomcom Jun 6, 2023
bf77069
Update python/cuspatial/cuspatial/core/binpreds/contains.py
thomcom Jun 7, 2023
c8cb7ea
Update python/cuspatial/cuspatial/core/binpreds/contains.py
thomcom Jun 7, 2023
2bf49e3
Update python/cuspatial/cuspatial/core/binpreds/contains.py
thomcom Jun 7, 2023
285b0c7
Update python/cuspatial/cuspatial/core/binpreds/feature_contains_prop…
thomcom Jun 7, 2023
7889603
Update python/cuspatial/cuspatial/core/binpreds/feature_disjoint.py
thomcom Jun 7, 2023
a28ec99
Update python/cuspatial/cuspatial/core/binpreds/feature_disjoint.py
thomcom Jun 7, 2023
a94c232
Update python/cuspatial/cuspatial/core/binpreds/feature_intersects.py
thomcom Jun 7, 2023
8e38e8e
Update python/cuspatial/cuspatial/core/binpreds/feature_intersects.py
thomcom Jun 7, 2023
6e9884f
Update python/cuspatial/cuspatial/core/binpreds/feature_intersects.py
thomcom Jun 7, 2023
d0d1888
Update python/cuspatial/cuspatial/core/binpreds/feature_intersects.py
thomcom Jun 7, 2023
aa49915
Optimize result processing for brute_force and pairwise.
thomcom Jun 7, 2023
6d1bd1b
Drop unused function.
thomcom Jun 7, 2023
dce140a
Merge branch 'branch-23.08' into bug/binpred-fail-with-large-datasets
thomcom Jun 7, 2023
29abbbf
Flake8 issues unfixed due to github commits.
thomcom Jun 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 64 additions & 3 deletions python/cuspatial/cuspatial/core/binpreds/contains.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

from math import ceil, sqrt

import cudf
from cudf import DataFrame, Series
from cudf.core.column import as_column

import cuspatial
from cuspatial._lib.pairwise_point_in_polygon import (
pairwise_point_in_polygon as cpp_pairwise_point_in_polygon,
)
from cuspatial._lib.point_in_polygon import (
point_in_polygon as cpp_byte_point_in_polygon,
)
Expand Down Expand Up @@ -35,7 +39,7 @@ def _quadtree_contains_properly(points, polygons):
within its corresponding polygon.
"""

scale = -1
# Set the scale to the default minimum scale without triggering a warning.
max_depth = 15
min_size = ceil(sqrt(len(points)))
if len(polygons) == 0:
Expand All @@ -44,6 +48,7 @@ def _quadtree_contains_properly(points, polygons):
x_min = polygons.polygons.x.min()
y_max = polygons.polygons.y.max()
y_min = polygons.polygons.y.min()
scale = max(x_max - x_min, y_max - y_min) / ((1 << max_depth) + 2)
point_indices, quadtree = cuspatial.quadtree_on_points(
points,
x_min,
Expand Down Expand Up @@ -115,9 +120,65 @@ def _brute_force_contains_properly(points, polygons):
return final_result


def contains_properly(polygons, points, quadtree=True):
if quadtree:
def _pairwise_contains_properly(points, polygons):
"""Compute from a series of points and a series of polygons which points
thomcom marked this conversation as resolved.
Show resolved Hide resolved
are properly contained within the corresponding polygon. Polygon A contains
Point B properly if B intersects the interior of A but not the boundary (or
exterior).

Note that polygons must be closed: the first and last vertex of each
polygon must be the same.

This version provides support for a very large number of points with
an equal number of polygons.
thomcom marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
points : GeoSeries
A GeoSeries of points.
polygons : GeoSeries
A GeoSeries of polygons.

Returns
-------
result : cudf.DataFrame
A DataFrame of boolean values indicating whether each point falls
within its corresponding polygon.
"""
pip_result = cpp_pairwise_point_in_polygon(
as_column(points.points.x),
as_column(points.points.y),
as_column(polygons.polygons.part_offset),
as_column(polygons.polygons.ring_offset),
as_column(polygons.polygons.x),
as_column(polygons.polygons.y),
)
# Pairwise returns a boolean column where the point and polygon index
# always correspond. We can use this to create a dataframe with the
# same shape as the quadtree result. Finally all the False results
# are dropped, as quadtree doesn't report False results.
thomcom marked this conversation as resolved.
Show resolved Hide resolved
quadtree_shaped_result = (
thomcom marked this conversation as resolved.
Show resolved Hide resolved
cudf.Series(pip_result).reset_index().reset_index()
thomcom marked this conversation as resolved.
Show resolved Hide resolved
)
quadtree_shaped_result.columns = [
"pairwise_index",
"point_index",
"result",
]
result = quadtree_shaped_result[["point_index", "pairwise_index"]][
quadtree_shaped_result["result"].astype("bool")
]
result = result.sort_values(["point_index", "pairwise_index"]).reset_index(
drop=True
)
thomcom marked this conversation as resolved.
Show resolved Hide resolved
thomcom marked this conversation as resolved.
Show resolved Hide resolved
return result


def contains_properly(polygons, points, mode="pairwise"):
if mode == "quadtree":
return _quadtree_contains_properly(points, polygons)
elif mode == "pairwise":
return _pairwise_contains_properly(points, polygons)
else:
# Use stack to convert the result to the same shape as quadtree's
# result, name the columns appropriately, and return the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ def _preprocess_multipoint_rhs(self, lhs, rhs):
if contains_only_linestrings(rhs):
# condition for linestrings
geom = rhs.lines
elif contains_only_polygons(rhs) is True:
elif contains_only_polygons(rhs):
# polygon in polygon
geom = rhs.polygons
elif contains_only_multipoints(rhs) is True:
elif contains_only_multipoints(rhs):
# mpoint in polygon
geom = rhs.multipoints
else:
Expand Down Expand Up @@ -150,6 +150,7 @@ def _reindex_allpairs(self, lhs, op_result) -> DataFrame:
# once their index is converted to a polygon index.
allpairs_result = polygon_indices.drop_duplicates()

# TODO: This is slow and needs optimization
# Replace the polygon index with the original index
allpairs_result["polygon_index"] = allpairs_result[
"polygon_index"
Expand Down Expand Up @@ -212,7 +213,6 @@ def _postprocess_multipoint_rhs(
result_df = hits.reset_index().merge(
expected_count.reset_index(), on="rhs_index"
)

# Handling for the basic predicates
if mode == "basic_none":
none_result = _true_series(len(rhs))
Expand Down
62 changes: 39 additions & 23 deletions python/cuspatial/cuspatial/core/binpreds/feature_contains.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,35 +81,51 @@ def _compute_polygon_polygon_contains(self, lhs, rhs, preprocessor_result):
polygon_size_reduction = rhs.polygons.part_offset.take(
rhs.polygons.geometry_offset[1:]
) - rhs.polygons.part_offset.take(rhs.polygons.geometry_offset[:-1])
thomcom marked this conversation as resolved.
Show resolved Hide resolved
return contains + intersects >= rhs.sizes - polygon_size_reduction
result = contains + intersects >= rhs.sizes - polygon_size_reduction
return result

def _test_interior(self, lhs, rhs):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wish there's a more intuitive name, what does this function do?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is addressed in #1186, wait till this is merged and review it and this function is removed. :)

# We only need to test linestrings that are length 2.
# Divide the linestring in half and test the point for containment
# in the polygon.
size_two = rhs.sizes == 2
if (size_two).any():
center_points = _linestrings_to_center_point(rhs[size_two])
size_two_results = _false_series(len(lhs))
size_two_results.iloc[rhs.index[size_two]] = (
_basic_contains_count(lhs, center_points) > 0
)
return size_two_results
else:
return _false_series(len(lhs))

def _compute_polygon_linestring_contains(
self, lhs, rhs, preprocessor_result
):
contains = _basic_contains_count(lhs, rhs).reset_index(drop=True)
intersects = self._intersection_results_for_contains(lhs, rhs)
if (contains == 0).all() and (intersects != 0).all():
# The hardest case. We need to check if the linestring is
# contained in the boundary of the polygon, the interior,
# or the exterior.
# We only need to test linestrings that are length 2.
# Divide the linestring in half and test the point for containment
# in the polygon.

if (rhs.sizes == 2).any():
center_points = _linestrings_to_center_point(
rhs[rhs.sizes == 2]
)
size_two_results = _false_series(len(lhs))
size_two_results[rhs.sizes == 2] = (
_basic_contains_count(lhs, center_points) > 0
)
return size_two_results
else:
line_intersections = _false_series(len(lhs))
line_intersections[intersects == rhs.sizes] = True
return line_intersections
return contains + intersects >= rhs.sizes

# If a linestring has intersection but not containment, we need to
# test if the linestring is in the interior of the polygon.
final_result = _false_series(len(lhs))
intersection_with_no_containment = (contains == 0) & (intersects != 0)
interior_tests = self._test_interior(
lhs[intersection_with_no_containment].reset_index(drop=True),
rhs[intersection_with_no_containment].reset_index(drop=True),
)
interior_tests.index = intersection_with_no_containment[
intersection_with_no_containment
].index
# LineStrings that have intersection but no containment are set
# according to the `intersection_with_no_containment` mask.
final_result[intersection_with_no_containment] = interior_tests
# LineStrings that do not are contained if the sum of intersecting
# and containing points is greater than or equal to the number of
# points that make up the linestring.
Comment on lines +125 to +127
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am having trouble parsing this comment (and all of the code in this function). Can it be made clearer?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have written an improvement to this function in #1186, please review when this is merged?

final_result[~intersection_with_no_containment] = (
contains + intersects >= rhs.sizes
)
return final_result

def _compute_predicate(self, lhs, rhs, preprocessor_result):
if contains_only_points(rhs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

from typing import TypeVar

import cupy as cp

import cudf

from cuspatial.core.binpreds.basic_predicates import (
_basic_equals_all,
_basic_intersects,
Expand Down Expand Up @@ -60,7 +64,7 @@ def _preprocess(self, lhs, rhs):
preprocessor_result = super()._preprocess_multipoint_rhs(lhs, rhs)
return self._compute_predicate(lhs, rhs, preprocessor_result)

def _should_use_quadtree(self, lhs):
def _pip_mode(self, lhs, rhs):
"""Determine if the quadtree should be used for the binary predicate.

Returns
Expand All @@ -70,18 +74,21 @@ def _should_use_quadtree(self, lhs):

Notes
-----
1. Quadtree is always used if user requests `allpairs=True`.
2. If the number of polygons in the lhs is less than 32, we use the
1. If the number of polygons in the lhs is less than 32, we use the
brute-force algorithm because it is faster and has less memory
overhead.
3. If the lhs contains more than 32 polygons, we use the quadtree
because it does not have a polygon-count limit.
4. If the lhs contains multipolygons, we use quadtree because the
performance between quadtree and brute-force is similar, but
code complexity would be higher if we did multipolygon
reconstruction on both code paths.
2. If the lhs contains multipolygons, or `allpairs=True` is specified,
we use quadtree because the quadtree code path already handles
multipolygons.
3. Otherwise pairwise is defaulted to since the default GeoPandas
behavior is to use the pairwise algorithm.
thomcom marked this conversation as resolved.
Show resolved Hide resolved
"""
return len(lhs) >= 32 or has_multipolygons(lhs) or self.config.allpairs
if len(lhs) <= 31:
return "brute_force"
elif self.config.allpairs or has_multipolygons(lhs):
return "quadtree"
else:
return "pairwise"

def _compute_predicate(
self,
Expand All @@ -97,10 +104,30 @@ def _compute_predicate(
raise TypeError(
"`.contains` can only be called with polygon series."
)
use_quadtree = self._should_use_quadtree(lhs)
mode = self._pip_mode(lhs, preprocessor_result.final_rhs)
lhs_indices = lhs.index
# Duplicates the lhs polygon for each point in the final_rhs result
# that was computed by _preprocess. Will always ensure that the
# number of points in the rhs is equal to the number of polygons in the
# lhs.
if mode == "pairwise":
lhs_indices = preprocessor_result.point_indices
pip_result = contains_properly(
lhs, preprocessor_result.final_rhs, quadtree=use_quadtree
lhs[lhs_indices], preprocessor_result.final_rhs, mode=mode
)
# If the mode is pairwise, we need to replace the `pairwise_index`
# of each repeated polygon with the `part_index` from the
# preprocessor result.
if mode == "pairwise":
pairwise_index_df = cudf.DataFrame(
{
"pairwise_index": cp.arange(len(lhs_indices)),
"part_index": rhs.point_indices,
}
)
pip_result = pip_result.merge(
pairwise_index_df, on="pairwise_index"
)[["part_index", "point_index"]]
op_result = ContainsOpResult(pip_result, preprocessor_result)
return self._postprocess(lhs, rhs, preprocessor_result, op_result)

Expand Down Expand Up @@ -168,7 +195,7 @@ def _preprocess(self, lhs, rhs):
left and right hand side types. """
DispatchDict = {
(Point, Point): ContainsProperlyByIntersection,
(Point, MultiPoint): ImpossiblePredicate,
(Point, MultiPoint): ContainsProperlyByIntersection,
(Point, LineString): ImpossiblePredicate,
(Point, Polygon): ImpossiblePredicate,
(MultiPoint, Point): NotImplementedPredicate,
Expand Down
11 changes: 6 additions & 5 deletions python/cuspatial/cuspatial/core/binpreds/feature_crosses.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,12 @@ def _compute_predicate(self, lhs, rhs, preprocessor_result):
# intersection are in the boundary of the other
pli = _basic_intersects_pli(rhs, lhs)
intersections = _points_and_lines_to_multipoints(pli[1], pli[0])
equals = (_basic_equals_count(intersections, lhs) > 0) | (
_basic_equals_count(intersections, rhs) > 0
)
intersects = _basic_intersects_count(rhs, lhs) > 0
return intersects & ~equals
equals_lhs_count = _basic_equals_count(intersections, lhs)
equals_rhs_count = _basic_equals_count(intersections, rhs)
equals_lhs = equals_lhs_count != intersections.sizes
equals_rhs = equals_rhs_count != intersections.sizes
equals = equals_lhs & equals_rhs
return equals


class LineStringPolygonCrosses(BinPred):
Expand Down
15 changes: 9 additions & 6 deletions python/cuspatial/cuspatial/core/binpreds/feature_disjoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from cuspatial.core.binpreds.basic_predicates import (
_basic_contains_any,
_basic_equals_any,
_basic_intersects,
)
from cuspatial.core.binpreds.binpred_interface import (
Expand All @@ -23,13 +24,17 @@ def _preprocess(self, lhs, rhs):
and then negate the result.

Used by:
(Point, Point)
(Point, Polygon)
(Polygon, Point)
"""
return ~_basic_contains_any(lhs, rhs)


class PointPointDisjoint(BinPred):
def _preprocess(self, lhs, rhs):
return ~_basic_equals_any(lhs, rhs)


class PointLineStringDisjoint(BinPred):
def _preprocess(self, lhs, rhs):
"""Disjoint is the opposite of intersects, so just implement intersects
Expand All @@ -40,9 +45,8 @@ def _preprocess(self, lhs, rhs):

class PointPolygonDisjoint(BinPred):
def _preprocess(self, lhs, rhs):
intersects = _basic_intersects(lhs, rhs)
contains = _basic_contains_any(lhs, rhs)
return ~intersects & ~contains
return ~contains
thomcom marked this conversation as resolved.
Show resolved Hide resolved


class LineStringPointDisjoint(PointLineStringDisjoint):
Expand All @@ -61,9 +65,8 @@ def _postprocess(self, lhs, rhs, op_result):

class LineStringPolygonDisjoint(BinPred):
def _preprocess(self, lhs, rhs):
intersects = _basic_intersects(lhs, rhs)
contains = _basic_contains_any(rhs, lhs)
return ~intersects & ~contains
return ~contains
thomcom marked this conversation as resolved.
Show resolved Hide resolved


class PolygonPolygonDisjoint(BinPred):
Expand All @@ -72,7 +75,7 @@ def _preprocess(self, lhs, rhs):


DispatchDict = {
(Point, Point): DisjointByWayOfContains,
(Point, Point): PointPointDisjoint,
(Point, MultiPoint): NotImplementedPredicate,
(Point, LineString): PointLineStringDisjoint,
(Point, Polygon): PointPolygonDisjoint,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,11 @@ def _compute_predicate(self, lhs, rhs, preprocessor_result):
lhs_reversed, rhs_lengths_equal.lines.xy
)
result = forward_result | reverse_result
original_point_indices = cudf.Series(
lhs_lengths_equal.point_indices
).replace(cudf.Series(lhs_lengths_equal.index))
return self._postprocess(
lhs, rhs, EqualsOpResult(result, lhs_lengths_equal.point_indices)
lhs, rhs, EqualsOpResult(result, original_point_indices)
)


Expand Down
Loading