Skip to content

Commit

Permalink
invertedidx: derive geog/geom filters which enable inverted index scan
Browse files Browse the repository at this point in the history
The query normalization rules FoldEqZeroSTDistance,
FoldCmpSTDistanceLeft, FoldCmpSTDistanceRight, FoldCmpSTMaxDistanceLeft
and FoldCmpSTMaxDistanceRight replace binary comparison operations,
=, <, >, <=, >= involving the `st_distance` or `st_maxdistance` functions
with equivalent function calls which may enable inverted index scan, for
example st_dwithin, and st_dfullywithin. For non-null and non-empty
geographies and geometries, this rewrite is semantically equivalent, but
for null or empty inputs, `st_distance` uses three-valued boolean logic
and returns a null. `st_dwithin` always returns true or false, so may
return a different value when the expression is negated. For example,
the following test returns `true`, but should return `null`:
```sql
CREATE TABLE g1 (geog GEOGRAPHY);
INSERT INTO g1 VALUES (ST_GeogFromText('MULTILINESTRING EMPTY'));
SELECT ST_Distance('POINT(0 0)'::GEOGRAPHY, geog) > 1 AS predicate_result FROM g1;
  predicate_result
--------------------
         t
```

The solution is to move all of the logic in the above normalization
rules into `geoFilterPlanner.extractInvertedFilterConditionFromLeaf`,
and use the derived predicate only for the purposes of finding inverted
index scan spans. The spans will always include all rows qualified for
the original predicate, but may include additional rows. The original
filter is then applied to the results of the inverted index scan to
filter out any results with nulls or empty geographies and geometries.

Epic: none
Fixes: cockroachdb#103616

Release note (bug fix): This patch fixes a bug in geospatial queries,
where a query filter of the form `ST_Distance(geog1, geog2) > constant`,
or `ST_MaxDistance(geom1, geom2) > constant`, where the operator is one
of >, <, >=, <=, or a filter of the form `ST_Distance(geog1, geog2, false) = 0`
may mistakenly evaluate to `true` when one or both of the inputs is null
or an empty geography/geometry. More rows could be returned by the query
than expected.
  • Loading branch information
Mark Sirek committed Aug 24, 2023
1 parent ba4b4e7 commit da8caaf
Show file tree
Hide file tree
Showing 6 changed files with 825 additions and 244 deletions.
156 changes: 156 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/geospatial
Original file line number Diff line number Diff line change
Expand Up @@ -5969,3 +5969,159 @@ POLYGON ((30.010000000000002 50.009999999999998, 30.010000000000002 52.009999999

statement error 'GeometryCollection' geometry type not supported
SELECT st_asencodedpolyline(geomfromewkb(('01070000A0E61000000300000001060000A0E6100000040000000103000080010000000B0000006F9AF3F7D81863C057397A0B3ACD42C006ED79D13D2DF24130E1CBF62A4A45C01B6DC0D279F452C022F8C672D2C1F9C140DB9D23050141C0B53FBA6E396656C0C0A1842E2B53EC413A8DC94C568D61401BEF0352091651C05C681DE175D2FD41A048C859DF7C5440C00E327AC97FF63F68BBA8144D0EF8416E6042F340DD6040805791CE56602740DC6C3922676401C29C6D4009B1334E40625E855D7A7E5140D4241F7D86D2D5C14B1E4BCA077A58C07806BF9D4D084D4074E74BE4DCAA00C2C11D020045A051C0D81F8E3DD0AE3740B8315FF7F63AF4C1BAD97494BA9860C07411EFAF0A6C3D4038E26DE7A194F4C16F9AF3F7D81863C057397A0B3ACD42C006ED79D13D2DF2410103000080010000000B000000610D90AE22A55FC0B8265672F7562940CE3E91EB7A320042CC96C05617B649C085229B1A09E153C0D84FA7998764EB4190FEB5A7C71B2DC03EBBCB6B588F4DC02891E1ED4A7DE6417C0F7BBC14C75840E08D4D210E2F21C04180E453EDEDFFC112E57185B2435240F0D20FC2D7C11640F8BF2E3AB98DD2C1D0A9A94D6EA14140FCABD1E5BF074B400010C2F75AA16F4166BC301BE6FB55C002F9F182DE3D53400038B551BC0495C199463681FA6E5CC0184D3E0AE179554030323F0F2525F4415791FBE04FEA61C0361E8733C77A5240C84AEBEAD466EEC1D5012A75E04F63C0A8BF403F623D4240182CFEFA9071E0C1610D90AE22A55FC0B8265672F7562940CE3E91EB7A3200420103000080010000000C000000004CADA6B302E0BF3EE5FA281ACE3BC0E051ABD6580901429B65A22CCBDC53C003DEBC27260E4AC07E45691D3E7AF541408E6FAB9A1008C016E33E2631FD40C0EEF80F0FF3E8F341A8C5EAA8B0C836409E85B8193CE952C0F4448FFF9B31F24190F9A9FCD2AC604004F99C51ED4356C0903A1F652902D4C1C01B737BD389554080315427550738C05C51AE075475F64124F6F92382235C40148AB5FED33D38C048960D3AC6D7E8412C2E310EABA5514058EA10C427562EC0A82D4D1A69D7FE41621FC77070C26540FE84B78FE5BE51400962385E53FAFEC1483899F770B14DC020BD4BBBAB604D401C08306EA143E84140337962764325C0002F089A1DF3F83FAC9B66F32C2BE841004CADA6B302E0BF3EE5FA281ACE3BC0E051ABD65809014201030000800100000006000000D03FF366A2C95C40E59BFCECDD7553C0D028B038BB63D041A40AEF5CB0195840006FB928107F084020CA0A39A4AED34100F09D4126336140E8AC65DAC4954C4024406D4040D1EE41F04B3F3DA6C75740106594C0ADCE4440A498B6372039FD4155D08A1E777452C084CC004170E949409C87DF53EB9AFDC1D03FF366A2C95C40E59BFCECDD7553C0D028B038BB63D04101020000A0E61000000200000074F55D784E5F50408F7DE14351CD41C0FCCCE7B1B166F541583C392FCD7554409022EAF3EFCC39C03CDE1C0E29A7EB4101040000A0E6100000060000000101000080E8801DFC8A2130C04401B4AE182F50C0BC13340EB7E2E1C1010100008010DC3F808D824D406C9927CDAE5144409A464D93BBD7F94101010000806F0682A50FE253C030E1964D47F74BC00E752ECB60B0F441010100008014F5242FC2C444C01257DB8E51FA51404449E2FB7D2AFFC1010100008069945BA1FD2E50C0501CA74B7A661C40CB83CB9A4894F1C10101000080C0BBBC345A4B51C0E08F6EBB2F250E400681A78F4FFCF741'::GEOGRAPHY)::BYTEA)::GEOMETRY);

subtest regression_103616

# Regression test for #103616
statement ok
CREATE TABLE t103616 (
tag string,
geog GEOGRAPHY,
geom GEOMETRY,
INVERTED INDEX geog_idx (geog),
INVERTED INDEX geom_idx (geom)
);

statement ok
insert into t103616 values ('null', null, null);

statement ok
insert into t103616 values ('point (0 0)', 'POINT(0 0)'::GEOGRAPHY, 'POINT(0 0)'::GEOMETRY);

statement ok
insert into t103616 values ('point (10 10)', 'POINT(10 10)'::GEOGRAPHY, 'POINT(10 10)'::GEOMETRY);

statement ok
insert into t103616 values ('empty 1', ST_GeogFromText('POLYGON EMPTY'), ST_GeomFromText('POLYGON EMPTY'));

statement ok
insert into t103616 values ('empty 2', ST_GeogFromText('GEOMETRYCOLLECTION EMPTY'), ST_GeomFromText('GEOMETRYCOLLECTION EMPTY'));

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geog_idx
WHERE
st_distance(geog, 'POINT(0 0)'::GEOGRAPHY, false) = 0;
----
point (0 0)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geog_idx
WHERE
st_distance(geog, 'POINT(0 0)'::GEOGRAPHY, false) = 0;
----
point (0 0)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geog_idx
WHERE
NOT 1.2345678901234566e-43 < st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (0 0)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
NOT 1.2345678901234566e-43 > st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (10 10)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geog_idx
WHERE
1.2345678901234566e-43 > st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (0 0)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
1.2345678901234566e-43 < st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (10 10)

# Null and empty geog values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
0 <= st_distance(geog, 'POINT(0 0)'::GEOGRAPHY);
----
point (0 0)
point (10 10)

# Only null and empty geog values should appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
(st_distance(geog, 'POINT(0 0)'::GEOGRAPHY, false) = 0) IS NULL;
----
null
empty 1
empty 2

# Only null and empty geog values should appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
(0 <= st_distance(geog, 'POINT(0 0)'::GEOGRAPHY)) IS NULL;
----
null
empty 1
empty 2

# Null and empty geom values should appear in the output.
query T rowsort
SELECT tag FROM t103616@geom_idx
WHERE
NOT 1.2345678901234566e-43 < st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (0 0)

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
NOT 1.2345678901234566e-43 > st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (10 10)

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616@geom_idx
WHERE
1.2345678901234566e-43 > st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (0 0)

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
1.2345678901234566e-43 < st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (10 10)

# Null and empty geom values should not appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
0 <= st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY);
----
point (0 0)
point (10 10)

# Only null and empty geog values should appear in the output.
query T rowsort
SELECT tag FROM t103616
WHERE
(0 <= st_maxdistance(geom, 'POINT(0 0)'::GEOMETRY)) IS NULL;
----
null
empty 1
empty 2

98 changes: 96 additions & 2 deletions pkg/sql/opt/invertedidx/geo.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,94 @@ type geoFilterPlanner struct {

var _ invertedFilterPlanner = &geoFilterPlanner{}

// maybeDeriveUsefulInvertedFilterCondition identifies an expression of the
// form: 'st_distance(a, b, bool) = 0', 'st_distance(...) <= x' or
// 'st_maxdistance(...) <= x', and returns a function call to st_dwithin,
// st_dwithinexclusive, st_dfullywithin, st_dfullywithinexclusive or
// st_intersects which is almost equivalent to the original expression, except
// for empty or null geography/geometry inputs (it may evaluate to true in more
// cases). The derived expression may enable use of an inverted index scan. See
// the MaybeMakeSTDWithin or MakeIntersectionFunction method for the specific
// function that is used to replace expressions with different comparison
// operators (e.g. '<' vs '<='). Note that the `st_distance` or `st_maxdistance`
// may be on the left or right of the comparison operation (LT, GT, LE, GE).
func (g *geoFilterPlanner) maybeDeriveUsefulInvertedFilterCondition(
expr opt.ScalarExpr,
) (opt.ScalarExpr, bool) {
var left, right opt.ScalarExpr
var function *memo.FunctionExpr
leftIsFunction := false
rightIsFunction := false
c := g.factory.CustomFuncs()
switch t := expr.(type) {
case *memo.EqExpr:
left = t.Left
right = t.Right
function, leftIsFunction = left.(*memo.FunctionExpr)
if !leftIsFunction {
return expr, false
}
private := &function.FunctionPrivate
if private.Name != "st_distance" {
return expr, false
}
if c.STDistanceUseSpheroid(function.Args) {
return expr, false
}
constant, rightIsConstant := right.(*memo.ConstExpr)
if !rightIsConstant {
return expr, false
}
value := constant.Value
if !c.IsFloatDatum(value) {
return expr, false
}
if !c.DatumsEqual(value, tree.NewDInt(0)) {
return expr, false
}
return c.MakeIntersectionFunction(function.Args), true
case *memo.LtExpr, *memo.GtExpr, *memo.LeExpr, *memo.GeExpr:
left = t.Child(0).(opt.ScalarExpr)
right = t.Child(1).(opt.ScalarExpr)
function, leftIsFunction = left.(*memo.FunctionExpr)
if !leftIsFunction {
function, rightIsFunction = right.(*memo.FunctionExpr)
if !rightIsFunction {
return expr, false
}
}
// Combinations which result in a `NOT st_d*` function would not enable
// inverted index scan, so no need to derive filters for these cases.
if leftIsFunction && (t.Op() == opt.GtOp || t.Op() == opt.GeOp) {
return expr, false
} else if rightIsFunction && (t.Op() == opt.LtOp || t.Op() == opt.LeOp) {
return expr, false
}
// Main logic below to eliminate a code nesting level.
default:
return expr, false
}

if function == nil {
return expr, false
}
args := function.Args
private := &function.FunctionPrivate
if private.Name != "st_distance" && private.Name != "st_maxdistance" {
return expr, false
}
var boundExpr opt.ScalarExpr
if leftIsFunction {
boundExpr = right
} else {
boundExpr = left
}
if private.Name == "st_distance" {
return c.MaybeMakeSTDWithin(expr, args, boundExpr, leftIsFunction, false /* fullyWithin */)
}
return c.MaybeMakeSTDWithin(expr, args, boundExpr, leftIsFunction, true /* fullyWithin */)
}

// extractInvertedFilterConditionFromLeaf is part of the invertedFilterPlanner
// interface.
func (g *geoFilterPlanner) extractInvertedFilterConditionFromLeaf(
Expand All @@ -344,6 +432,9 @@ func (g *geoFilterPlanner) extractInvertedFilterConditionFromLeaf(
_ *invertedexpr.PreFiltererStateForInvertedFilterer,
) {
var args memo.ScalarListExpr
filterIsDerived := false
originalExpr := expr
expr, filterIsDerived = g.maybeDeriveUsefulInvertedFilterCondition(expr)
switch t := expr.(type) {
case *memo.FunctionExpr:
args = t.Args
Expand Down Expand Up @@ -382,8 +473,11 @@ func (g *geoFilterPlanner) extractInvertedFilterConditionFromLeaf(
ctx, g.factory, expr, args, true /* commuteArgs */, g.tabID, g.index, g.getSpanExpr,
)
}
if !invertedExpr.IsTight() {
remainingFilters = expr
// A derived filter may not be semantically equivalent to the original, so we
// need to apply the original filter in that case, the same as when the
// inverted expression is not tight.
if !invertedExpr.IsTight() || filterIsDerived {
remainingFilters = originalExpr
}
return invertedExpr, remainingFilters, pfState
}
Expand Down
91 changes: 29 additions & 62 deletions pkg/sql/opt/norm/comp_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,61 +177,27 @@ func (c *CustomFuncs) MakeIntersectionFunction(args memo.ScalarListExpr) opt.Sca
)
}

// MakeSTDWithinLeft returns an ST_DWithin function that replaces an expression
// of the following form: ST_Distance(a,b) <= x. Note that the ST_Distance
// function is on the left side of the inequality.
func (c *CustomFuncs) MakeSTDWithinLeft(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr,
) opt.ScalarExpr {
const fnName = "st_dwithin"
const fnIsLeftArg = true
return c.makeSTDWithin(op, args, bound, fnName, fnIsLeftArg)
}

// MakeSTDWithinRight returns an ST_DWithin function that replaces an expression
// of the following form: x <= ST_Distance(a,b). Note that the ST_Distance
// function is on the right side of the inequality.
func (c *CustomFuncs) MakeSTDWithinRight(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr,
) opt.ScalarExpr {
const fnName = "st_dwithin"
const fnIsLeftArg = false
return c.makeSTDWithin(op, args, bound, fnName, fnIsLeftArg)
}

// MakeSTDFullyWithinLeft returns an ST_DFullyWithin function that replaces an
// expression of the following form: ST_MaxDistance(a,b) <= x. Note that the
// ST_MaxDistance function is on the left side of the inequality.
func (c *CustomFuncs) MakeSTDFullyWithinLeft(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr,
) opt.ScalarExpr {
const fnName = "st_dfullywithin"
const fnIsLeftArg = true
return c.makeSTDWithin(op, args, bound, fnName, fnIsLeftArg)
}

// MakeSTDFullyWithinRight returns an ST_DFullyWithin function that replaces an
// expression of the following form: x <= ST_MaxDistance(a,b). Note that the
// ST_MaxDistance function is on the right side of the inequality.
func (c *CustomFuncs) MakeSTDFullyWithinRight(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr,
) opt.ScalarExpr {
const fnName = "st_dfullywithin"
const fnIsLeftArg = false
return c.makeSTDWithin(op, args, bound, fnName, fnIsLeftArg)
}

// makeSTDWithin returns an ST_DWithin (or ST_DFullyWithin) function that
// replaces an expression of the following form: ST_Distance(a,b) <= x. The
// ST_Distance function can be on either side of the inequality, and the
// inequality can be one of the following: '<', '<=', '>', '>='. This
// replacement allows early-exit behavior, and may enable use of an inverted
// index scan.
func (c *CustomFuncs) makeSTDWithin(
op opt.Operator, args memo.ScalarListExpr, bound opt.ScalarExpr, fnName string, fnIsLeftArg bool,
) opt.ScalarExpr {
// MaybeMakeSTDWithin attempts to derive an ST_DWithin (or ST_DFullyWithin)
// function that is similar to an expression of the following form:
// ST_Distance(a,b) <= x. The ST_Distance function can be on either side of the
// inequality, and the inequality can be one of the following: '<', '<=', '>',
// '>='. This replacement allows early-exit behavior, and may enable use of an
// inverted index scan. If the derived expression would need to be negated with
// a NotExpr, so the derivation fails, and expr, ok=false is returned.
func (c *CustomFuncs) MaybeMakeSTDWithin(
expr opt.ScalarExpr,
args memo.ScalarListExpr,
bound opt.ScalarExpr,
fnIsLeftArg bool,
fullyWithin bool,
) (derivedExpr opt.ScalarExpr, ok bool) {
op := expr.Op()
var not bool
var name string
fnName := "st_dwithin"
if fullyWithin {
fnName = "st_dfullywithin"
}
incName := fnName
exName := fnName + "exclusive"
switch op {
Expand Down Expand Up @@ -279,7 +245,15 @@ func (c *CustomFuncs) makeSTDWithin(
name = incName
}
}

if not {
// ST_DWithin and ST_DWithinExclusive are equivalent to ST_Distance <= x and
// ST_Distance < x respectively. The comparison operator in the matched
// expression (if ST_Distance is normalized to be on the left) is either '>'
// or '>='. Therefore, we would have to take the opposite of within. This
// would not result in a useful expression for inverted index scan, so return
// ok=false.
return expr, false
}
newArgs := make(memo.ScalarListExpr, len(args)+1)
const distanceIdx, useSpheroidIdx = 2, 3
copy(newArgs, args[:distanceIdx])
Expand All @@ -302,12 +276,5 @@ func (c *CustomFuncs) makeSTDWithin(
Properties: props,
Overload: overload,
})
if not {
// ST_DWithin and ST_DWithinExclusive are equivalent to ST_Distance <= x and
// ST_Distance < x respectively. The comparison operator in the matched
// expression (if ST_Distance is normalized to be on the left) is either '>'
// or '>='. Therefore, we have to take the opposite of within.
within = c.f.ConstructNot(within)
}
return within
return within, true
}
Loading

0 comments on commit da8caaf

Please sign in to comment.