Skip to content

Commit

Permalink
invertedidx: derive geog/geom filters which enable inverted index scan
Browse files Browse the repository at this point in the history
The query normalization rules FoldEqZeroSTDistance,
FoldCmpSTDistanceLeft, FoldCmpSTDistanceRight, FoldCmpSTMaxDistanceLeft
and FoldCmpSTMaxDistanceRight replace binary comparison operations,
=, <, >, <=, >= involving the `st_distance` or `st_maxdistance` functions
with equivalent function calls which may enable inverted index scan, for
example st_dwithin, and st_dfullywithin. For non-null and non-empty
geographies and geometries, this rewrite is semantically equivalent, but
for null or empty inputs, `st_distance` uses three-valued boolean logic
and returns a null. `st_dwithin` always returns true or false, so may
return a different value when the expression is negated. For example,
the following test returns `true`, but should return `null`:
```sql
CREATE TABLE g1 (geog GEOGRAPHY);
INSERT INTO g1 VALUES (ST_GeogFromText('MULTILINESTRING EMPTY'));
SELECT ST_Distance('POINT(0 0)'::GEOGRAPHY, geog) > 1 AS predicate_result FROM g1;
  predicate_result
--------------------
         t
```

The solution is to move all of the logic in the above normalization
rules into `geoFilterPlanner.extractInvertedFilterConditionFromLeaf`,
and use the derived predicate only for the purposes of finding inverted
index scan spans. The spans will always include all rows qualified for
the original predicate, but may include additional rows. The original
filter is then applied to the results of the inverted index scan to
filter out any results with nulls or empty geographies and geometries.

Epic: none
Fixes: cockroachdb#103616

Release note (bug fix): This patch fixes a bug in geospatial queries,
where a query filter of the form `ST_Distance(geog1, geog2) > constant`,
or `ST_MaxDistance(geom1, geom2) > constant`, where the operator is one
of >, <, >=, <=, or a filter of the form `ST_Distance(geog1, geog2, false) = 0`
may mistakenly evaluate to `true` when one or both of the inputs is null
or an empty geography/geometry. More rows could be returned by the query
than expected.
  • Loading branch information
Mark Sirek committed Aug 18, 2023
1 parent 673bde5 commit 04eed83
Show file tree
Hide file tree
Showing 6 changed files with 811 additions and 216 deletions.
155 changes: 155 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/geospatial
Original file line number Diff line number Diff line change
Expand Up @@ -6008,3 +6008,158 @@ SELECT ST_AsText(ST_AsMVTGeom(
ST_GeomFromText('LINESTRING (0 0, 10.6 0.4, 10.6 5.4, 0 -5, 0 0)'),
ST_MakeBox2D(ST_Point(0, 0), ST_Point(20, 20)),
20, -1, false))

subtest regression_103616

# Regression test for #103616
statement ok
CREATE TABLE t103616 (
tag string,
geog GEOGRAPHY,
geom GEOMETRY,
INVERTED INDEX geog_idx (geog),
INVERTED INDEX geom_idx (geom)
);

statement ok
insert into t103616 values ('null', null, null);

statement ok
insert into t103616 values ('point (0 0)', 'POINT(0 0)'::GEOGRAPHY, 'POINT(0 0)'::GEOMETRY);

statement ok
insert into t103616 values ('point (10 10)', 'POINT(10 10)'::GEOGRAPHY, 'POINT(10 10)'::GEOMETRY);

statement ok
insert into t103616 values ('empty 1', ST_GeogFromText('POLYGON EMPTY'), ST_GeomFromText('POLYGON EMPTY'));

statement ok
insert into t103616 values ('empty 2', ST_GeogFromText('GEOMETRYCOLLECTION EMPTY'), ST_GeomFromText('GEOMETRYCOLLECTION EMPTY'));

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geog_idx
WHERE
st_distance(geog, 'POINT(0 0)'::GEOGRAPHY, false) = 0;
----
point (0 0)

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geog_idx
WHERE
st_distance(geog, 'POINT(0 0)'::GEOGRAPHY, false) = 0;
----
point (0 0)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geog_idx
WHERE
NOT 1.2345678901234566e-43 < st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (0 0)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
NOT 1.2345678901234566e-43 > st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (10 10)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geog_idx
WHERE
1.2345678901234566e-43 > st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (0 0)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
1.2345678901234566e-43 < st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (10 10)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
0 <= st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (0 0)
point (10 10)

# Only null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
(st_distance(geog, 'POINT(0 0)'::GEOGRAPHY, false) = 0) IS NULL;
----
null
empty 1
empty 2

# Only null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
(0 <= st_distance(geog, 'POINT(0 0)'::GEOGRAPHY)) IS NULL;
----
null
empty 1
empty 2

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geom_idx
WHERE
NOT 1.2345678901234566e-43 < st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (0 0)

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
NOT 1.2345678901234566e-43 > st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (10 10)

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geom_idx
WHERE
1.2345678901234566e-43 > st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (0 0)

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
1.2345678901234566e-43 < st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (10 10)

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
0 <= st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (0 0)
point (10 10)

# Only null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
(0 <= st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY)) IS NULL;
----
null
empty 1
empty 2
103 changes: 101 additions & 2 deletions pkg/sql/opt/invertedidx/geo.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,99 @@ type geoFilterPlanner struct {

var _ invertedFilterPlanner = &geoFilterPlanner{}

// maybeDeriveUsefulInvertedFilterCondition identifies an expression of the
// form: 'st_distance(a, b, bool) = 0', 'st_distance(...) <= x' or
// 'st_maxdistance(...) <= x', and returns a function call to st_dwithin,
// st_dwithinexclusive, st_dfullywithin, st_dfullywithinexclusive or
// st_intersects which is almost equivalent to the original expression, except
// for empty or null geography/geometry inputs (it may evaluate to true in more
// cases). The derived expression may enable use of an inverted index scan. See
// the MakeSTDWithin, MakeSTDFullyWithin or MakeIntersectionFunction method for
// the specific function that is used to replace expressions with different
// comparison operators (e.g. '<' vs '<='). Note that the `st_distance` or
// `st_maxdistance` may be on the left or right of the comparison operation (LT,
// GT, LE, GE).
func (g *geoFilterPlanner) maybeDeriveUsefulInvertedFilterCondition(
expr opt.ScalarExpr,
) (opt.ScalarExpr, bool) {
var op opt.Operator
var left, right opt.ScalarExpr
var function *memo.FunctionExpr
leftIsFunction := false
rightIsFunction := false
switch t := expr.(type) {
case *memo.EqExpr:
left = t.Left
right = t.Right
function, leftIsFunction = left.(*memo.FunctionExpr)
if !leftIsFunction {
return expr, false
}
private := &function.FunctionPrivate
if private.Name != "st_distance" {
return expr, false
}
if g.factory.CustomFuncs().STDistanceUseSpheroid(function.Args) {
return expr, false
}
constant, rightIsConstant := right.(*memo.ConstExpr)
if !rightIsConstant {
return expr, false
}
value := constant.Value
if !g.factory.CustomFuncs().IsFloatDatum(value) {
return expr, false
}
if !g.factory.CustomFuncs().DatumsEqual(value, tree.NewDInt(0)) {
return expr, false
}
return g.factory.CustomFuncs().MakeIntersectionFunction(function.Args), true
case *memo.LtExpr, *memo.GtExpr, *memo.LeExpr, *memo.GeExpr:
left = t.Child(0).(opt.ScalarExpr)
right = t.Child(1).(opt.ScalarExpr)
function, leftIsFunction = left.(*memo.FunctionExpr)
if !leftIsFunction {
function, rightIsFunction = right.(*memo.FunctionExpr)
if !rightIsFunction {
return expr, false
}
}
// Combinations which result in a `NOT st_d*` function would not enable
// inverted index scan, so no need to derive filters for these cases.
if leftIsFunction && (t.Op() == opt.GtOp || t.Op() == opt.GeOp) {
return expr, false
} else if rightIsFunction && (t.Op() == opt.LtOp || t.Op() == opt.LeOp) {
return expr, false
}
op = t.Op()
// Main logic below to eliminate a code nesting level.
default:
return expr, false
}

if function == nil {
return expr, false
}
args := function.Args
private := &function.FunctionPrivate
if private.Name != "st_distance" && private.Name != "st_maxdistance" {
return expr, false
}
var makeSTDWithinFunc func(op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr, fnIsLeftArg bool) opt.ScalarExpr
var boundExpr opt.ScalarExpr
if leftIsFunction {
boundExpr = right
} else {
boundExpr = left
}
if private.Name == "st_distance" {
makeSTDWithinFunc = g.factory.CustomFuncs().MakeSTDWithin
} else {
makeSTDWithinFunc = g.factory.CustomFuncs().MakeSTDFullyWithin
}
return makeSTDWithinFunc(op, args, boundExpr, leftIsFunction), true
}

// extractInvertedFilterConditionFromLeaf is part of the invertedFilterPlanner
// interface.
func (g *geoFilterPlanner) extractInvertedFilterConditionFromLeaf(
Expand All @@ -344,6 +437,9 @@ func (g *geoFilterPlanner) extractInvertedFilterConditionFromLeaf(
_ *invertedexpr.PreFiltererStateForInvertedFilterer,
) {
var args memo.ScalarListExpr
filterIsDerived := false
originalExpr := expr
expr, filterIsDerived = g.maybeDeriveUsefulInvertedFilterCondition(expr)
switch t := expr.(type) {
case *memo.FunctionExpr:
args = t.Args
Expand Down Expand Up @@ -382,8 +478,11 @@ func (g *geoFilterPlanner) extractInvertedFilterConditionFromLeaf(
ctx, g.factory, expr, args, true /* commuteArgs */, g.tabID, g.index, g.getSpanExpr,
)
}
if !invertedExpr.IsTight() {
remainingFilters = expr
// A derived filter may not be semantically equivalent to the original, so we
// need to apply the original filter in that case, the same as when the
// inverted expression is not tight.
if !invertedExpr.IsTight() || filterIsDerived {
remainingFilters = originalExpr
}
return invertedExpr, remainingFilters, pfState
}
Expand Down
45 changes: 11 additions & 34 deletions pkg/sql/opt/norm/comp_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,47 +177,24 @@ func (c *CustomFuncs) MakeIntersectionFunction(args memo.ScalarListExpr) opt.Sca
)
}

// MakeSTDWithinLeft returns an ST_DWithin function that replaces an expression
// of the following form: ST_Distance(a,b) <= x. Note that the ST_Distance
// function is on the left side of the inequality.
func (c *CustomFuncs) MakeSTDWithinLeft(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr,
// MakeSTDWithin returns an ST_DWithin function that replaces an expression
// of the form: ST_Distance(a,b) <= x or x <= ST_Distance(a,b). fnIsLeftArg should
// be passed as true if the ST_Distance is on the left, otherwise false.
func (c *CustomFuncs) MakeSTDWithin(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr, fnIsLeftArg bool,
) opt.ScalarExpr {
const fnName = "st_dwithin"
const fnIsLeftArg = true
return c.makeSTDWithin(op, args, bound, fnName, fnIsLeftArg)
}

// MakeSTDWithinRight returns an ST_DWithin function that replaces an expression
// of the following form: x <= ST_Distance(a,b). Note that the ST_Distance
// function is on the right side of the inequality.
func (c *CustomFuncs) MakeSTDWithinRight(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr,
) opt.ScalarExpr {
const fnName = "st_dwithin"
const fnIsLeftArg = false
return c.makeSTDWithin(op, args, bound, fnName, fnIsLeftArg)
}

// MakeSTDFullyWithinLeft returns an ST_DFullyWithin function that replaces an
// expression of the following form: ST_MaxDistance(a,b) <= x. Note that the
// ST_MaxDistance function is on the left side of the inequality.
func (c *CustomFuncs) MakeSTDFullyWithinLeft(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr,
) opt.ScalarExpr {
const fnName = "st_dfullywithin"
const fnIsLeftArg = true
return c.makeSTDWithin(op, args, bound, fnName, fnIsLeftArg)
}

// MakeSTDFullyWithinRight returns an ST_DFullyWithin function that replaces an
// expression of the following form: x <= ST_MaxDistance(a,b). Note that the
// ST_MaxDistance function is on the right side of the inequality.
func (c *CustomFuncs) MakeSTDFullyWithinRight(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr,
// MakeSTDFullyWithin returns an ST_DFullyWithin function that replaces an
// expression of the form: ST_MaxDistance(a,b) <= x or x <= ST_MaxDistance(a,b).
// fnIsLeftArg should be passed as true if the ST_MaxDistance is on the left,
// otherwise false.
func (c *CustomFuncs) MakeSTDFullyWithin(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr, fnIsLeftArg bool,
) opt.ScalarExpr {
const fnName = "st_dfullywithin"
const fnIsLeftArg = false
return c.makeSTDWithin(op, args, bound, fnName, fnIsLeftArg)
}

Expand Down
Loading

0 comments on commit 04eed83

Please sign in to comment.