Skip to content

Commit

Permalink
Fixes case handling in regex simplification. (#1950)
Browse files Browse the repository at this point in the history
Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>
  • Loading branch information
cyriltovena authored Apr 16, 2020
1 parent e4d78b9 commit 15c4ebb
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 51 deletions.
43 changes: 29 additions & 14 deletions pkg/logql/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,21 +116,33 @@ func (r regexpFilter) Filter(line []byte) bool {
return r.Match(line)
}

type containsFilter []byte
type containsFilter struct {
match []byte
caseInsensitive bool
}

func (l containsFilter) Filter(line []byte) bool {
return bytes.Contains(line, l)
if l.caseInsensitive {
line = bytes.ToLower(line)
}
return bytes.Contains(line, l.match)
}

func (l containsFilter) String() string {
return string(l)
return string(l.match)
}

func newContainsFilter(match string) LineFilter {
if match == "" {
func newContainsFilter(match []byte, caseInsensitive bool) LineFilter {
if len(match) == 0 {
return TrueFilter
}
return containsFilter(match)
if caseInsensitive {
match = bytes.ToLower(match)
}
return containsFilter{
match: match,
caseInsensitive: caseInsensitive,
}
}

// newFilter creates a new line filter from a match string and type.
Expand All @@ -141,9 +153,9 @@ func newFilter(match string, mt labels.MatchType) (LineFilter, error) {
case labels.MatchNotRegexp:
return parseRegexpFilter(match, false)
case labels.MatchEqual:
return newContainsFilter(match), nil
return newContainsFilter([]byte(match), false), nil
case labels.MatchNotEqual:
return newNotFilter(newContainsFilter(match)), nil
return newNotFilter(newContainsFilter([]byte(match), false)), nil
default:
return nil, fmt.Errorf("unknown matcher: %v", match)
}
Expand Down Expand Up @@ -181,7 +193,7 @@ func simplify(reg *syntax.Regexp) (LineFilter, bool) {
clearCapture(reg)
return simplify(reg)
case syntax.OpLiteral:
return containsFilter(string(reg.Rune)), true
return newContainsFilter([]byte(string((reg.Rune))), isCaseInsensitive(reg)), true
case syntax.OpStar:
if reg.Sub[0].Op == syntax.OpAnyCharNotNL {
return TrueFilter, true
Expand All @@ -192,6 +204,10 @@ func simplify(reg *syntax.Regexp) (LineFilter, bool) {
return nil, false
}

func isCaseInsensitive(reg *syntax.Regexp) bool {
return (reg.Flags & syntax.FoldCase) != 0
}

// clearCapture removes capture operation as they are not used for filtering.
func clearCapture(regs ...*syntax.Regexp) {
for _, r := range regs {
Expand Down Expand Up @@ -267,8 +283,7 @@ func simplifyConcat(reg *syntax.Regexp, baseLiteral []byte) (LineFilter, bool) {

// if we have only a concat with literals.
if baseLiteral != nil {
return containsFilter(baseLiteral), true

return newContainsFilter(baseLiteral, isCaseInsensitive(reg)), true
}

return nil, false
Expand All @@ -282,14 +297,14 @@ func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr LineFilter
for _, alt := range reg.Sub {
switch alt.Op {
case syntax.OpEmptyMatch:
curr = chainOrFilter(curr, containsFilter(literal))
curr = chainOrFilter(curr, newContainsFilter(literal, isCaseInsensitive(reg)))
case syntax.OpLiteral:
// concat the root literal with the alternate one.
altBytes := []byte(string(alt.Rune))
altLiteral := make([]byte, 0, len(literal)+len(altBytes))
altLiteral = append(altLiteral, literal...)
altLiteral = append(altLiteral, altBytes...)
curr = chainOrFilter(curr, containsFilter(altLiteral))
curr = chainOrFilter(curr, newContainsFilter(altLiteral, isCaseInsensitive(reg)))
case syntax.OpConcat:
f, ok := simplifyConcat(alt, literal)
if !ok {
Expand All @@ -300,7 +315,7 @@ func simplifyConcatAlternate(reg *syntax.Regexp, literal []byte, curr LineFilter
if alt.Sub[0].Op != syntax.OpAnyCharNotNL {
return nil, false
}
curr = chainOrFilter(curr, containsFilter(literal))
curr = chainOrFilter(curr, newContainsFilter(literal, isCaseInsensitive(reg)))
default:
return nil, false
}
Expand Down
77 changes: 40 additions & 37 deletions pkg/logql/filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (

func Test_SimplifiedRegex(t *testing.T) {
fixtures := []string{
"foo", "foobar", "bar", "foobuzz", "buzz", "f", " ", "fba", "foofoofoo", "b", "foob", "bfoo",
"foo", "foobar", "bar", "foobuzz", "buzz", "f", " ", "fba", "foofoofoo", "b", "foob", "bfoo", "FoO",
}
for _, test := range []struct {
re string
Expand All @@ -18,40 +18,41 @@ func Test_SimplifiedRegex(t *testing.T) {
match bool
}{
// regex we intend to support.
{"foo", true, containsFilter("foo"), true},
{"not", true, newNotFilter(containsFilter("not")), false},
{"(foo)", true, containsFilter("foo"), true},
{"(foo|ba)", true, newOrFilter(containsFilter("foo"), containsFilter("ba")), true},
{"(foo|ba|ar)", true, newOrFilter(newOrFilter(containsFilter("foo"), containsFilter("ba")), containsFilter("ar")), true},
{"(foo|(ba|ar))", true, newOrFilter(containsFilter("foo"), newOrFilter(containsFilter("ba"), containsFilter("ar"))), true},
{"foo.*", true, containsFilter("foo"), true},
{".*foo", true, newNotFilter(containsFilter("foo")), false},
{".*foo.*", true, containsFilter("foo"), true},
{"(.*)(foo).*", true, containsFilter("foo"), true},
{"(foo.*|.*ba)", true, newOrFilter(containsFilter("foo"), containsFilter("ba")), true},
{"(foo.*|.*bar.*)", true, newNotFilter(newOrFilter(containsFilter("foo"), containsFilter("bar"))), false},
{".*foo.*|bar", true, newNotFilter(newOrFilter(containsFilter("foo"), containsFilter("bar"))), false},
{".*foo|bar", true, newNotFilter(newOrFilter(containsFilter("foo"), containsFilter("bar"))), false},
{"foo", true, newContainsFilter([]byte("foo"), false), true},
{"not", true, newNotFilter(newContainsFilter([]byte("not"), false)), false},
{"(foo)", true, newContainsFilter([]byte("foo"), false), true},
{"(foo|ba)", true, newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("ba"), false)), true},
{"(foo|ba|ar)", true, newOrFilter(newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("ba"), false)), newContainsFilter([]byte("ar"), false)), true},
{"(foo|(ba|ar))", true, newOrFilter(newContainsFilter([]byte("foo"), false), newOrFilter(newContainsFilter([]byte("ba"), false), newContainsFilter([]byte("ar"), false))), true},
{"foo.*", true, newContainsFilter([]byte("foo"), false), true},
{".*foo", true, newNotFilter(newContainsFilter([]byte("foo"), false)), false},
{".*foo.*", true, newContainsFilter([]byte("foo"), false), true},
{"(.*)(foo).*", true, newContainsFilter([]byte("foo"), false), true},
{"(foo.*|.*ba)", true, newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("ba"), false)), true},
{"(foo.*|.*bar.*)", true, newNotFilter(newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("bar"), false))), false},
{".*foo.*|bar", true, newNotFilter(newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("bar"), false))), false},
{".*foo|bar", true, newNotFilter(newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("bar"), false))), false},
// This construct is similar to (...), but won't create a capture group.
{"(?:.*foo.*|bar)", true, newOrFilter(containsFilter("foo"), containsFilter("bar")), true},
{"(?:.*foo.*|bar)", true, newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("bar"), false)), true},
// named capture group
{"(?P<foo>.*foo.*|bar)", true, newOrFilter(containsFilter("foo"), containsFilter("bar")), true},
{"(?P<foo>.*foo.*|bar)", true, newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("bar"), false)), true},
// parsed as (?-s:.)*foo(?-s:.)*|b(?:ar|uzz)
{".*foo.*|bar|buzz", true, newOrFilter(containsFilter("foo"), newOrFilter(containsFilter("bar"), containsFilter("buzz"))), true},
{".*foo.*|bar|buzz", true, newOrFilter(newContainsFilter([]byte("foo"), false), newOrFilter(newContainsFilter([]byte("bar"), false), newContainsFilter([]byte("buzz"), false))), true},
// parsed as (?-s:.)*foo(?-s:.)*|bar|uzz
{".*foo.*|bar|uzz", true, newOrFilter(newOrFilter(containsFilter("foo"), containsFilter("bar")), containsFilter("uzz")), true},
{".*foo.*|bar|uzz", true, newOrFilter(newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("bar"), false)), newContainsFilter([]byte("uzz"), false)), true},
// parsed as foo|b(?:ar|(?:)|uzz)|zz
{"foo|bar|b|buzz|zz", true, newOrFilter(newOrFilter(containsFilter("foo"), newOrFilter(newOrFilter(containsFilter("bar"), containsFilter("b")), containsFilter("buzz"))), containsFilter("zz")), true},
{"foo|bar|b|buzz|zz", true, newOrFilter(newOrFilter(newContainsFilter([]byte("foo"), false), newOrFilter(newOrFilter(newContainsFilter([]byte("bar"), false), newContainsFilter([]byte("b"), false)), newContainsFilter([]byte("buzz"), false))), newContainsFilter([]byte("zz"), false)), true},
// parsed as f(?:(?:)|oo(?:(?:)|bar))
{"f|foo|foobar", true, newOrFilter(containsFilter("f"), newOrFilter(containsFilter("foo"), containsFilter("foobar"))), true},
{"f|foo|foobar", true, newOrFilter(newContainsFilter([]byte("f"), false), newOrFilter(newContainsFilter([]byte("foo"), false), newContainsFilter([]byte("foobar"), false))), true},
// parsed as f(?:(?-s:.)*|oobar(?-s:.)*)|(?-s:.)*buzz
{"f.*|foobar.*|.*buzz", true, newOrFilter(newOrFilter(containsFilter("f"), containsFilter("foobar")), containsFilter("buzz")), true},
{"f.*|foobar.*|.*buzz", true, newOrFilter(newOrFilter(newContainsFilter([]byte("f"), false), newContainsFilter([]byte("foobar"), false)), newContainsFilter([]byte("buzz"), false)), true},
// parsed as ((f(?-s:.)*)|foobar(?-s:.)*)|(?-s:.)*buzz
{"((f.*)|foobar.*)|.*buzz", true, newOrFilter(newOrFilter(containsFilter("f"), containsFilter("foobar")), containsFilter("buzz")), true},
{"((f.*)|foobar.*)|.*buzz", true, newOrFilter(newOrFilter(newContainsFilter([]byte("f"), false), newContainsFilter([]byte("foobar"), false)), newContainsFilter([]byte("buzz"), false)), true},
{".*", true, TrueFilter, true},
{".*|.*", true, TrueFilter, true},
{".*||||", true, TrueFilter, true},
{"", true, TrueFilter, true},
{"(?i)foo", true, newContainsFilter([]byte("foo"), true), true},

// regex we are not supporting.
{"[a-z]+foo", false, nil, false},
Expand Down Expand Up @@ -93,26 +94,27 @@ func Test_SimplifiedRegex(t *testing.T) {
}

func Test_TrueFilter(t *testing.T) {
empty := []byte("")
for _, test := range []struct {
name string
f LineFilter
expectTrue bool
}{
{"empty match", newContainsFilter(""), true},
{"not empty match", newNotFilter(newContainsFilter("")), false},
{"match", newContainsFilter("foo"), false},
{"empty match and", newAndFilter(newContainsFilter(""), newContainsFilter("")), true},
{"empty match or", newOrFilter(newContainsFilter(""), newContainsFilter("")), true},
{"nil right and", newAndFilter(newContainsFilter(""), nil), true},
{"nil left or", newOrFilter(nil, newContainsFilter("")), true},
{"nil right and not empty", newAndFilter(newContainsFilter("foo"), nil), false},
{"nil left or not empty", newOrFilter(nil, newContainsFilter("foo")), false},
{"empty match", newContainsFilter(empty, false), true},
{"not empty match", newNotFilter(newContainsFilter(empty, true)), false},
{"match", newContainsFilter([]byte("foo"), false), false},
{"empty match and", newAndFilter(newContainsFilter(empty, false), newContainsFilter(empty, false)), true},
{"empty match or", newOrFilter(newContainsFilter(empty, false), newContainsFilter(empty, false)), true},
{"nil right and", newAndFilter(newContainsFilter(empty, false), nil), true},
{"nil left or", newOrFilter(nil, newContainsFilter(empty, false)), true},
{"nil right and not empty", newAndFilter(newContainsFilter([]byte("foo"), false), nil), false},
{"nil left or not empty", newOrFilter(nil, newContainsFilter([]byte("foo"), false)), false},
{"nil both and", newAndFilter(nil, nil), true},
{"nil both or", newOrFilter(nil, nil), true},
{"empty match and chained", newAndFilter(newContainsFilter(""), newAndFilter(newContainsFilter(""), newAndFilter(newContainsFilter(""), newContainsFilter("")))), true},
{"empty match or chained", newOrFilter(newContainsFilter(""), newOrFilter(newContainsFilter(""), newOrFilter(newContainsFilter(""), newContainsFilter("")))), true},
{"empty match and", newNotFilter(newAndFilter(newContainsFilter(""), newContainsFilter(""))), false},
{"empty match or", newNotFilter(newOrFilter(newContainsFilter(""), newContainsFilter(""))), false},
{"empty match and chained", newAndFilter(newContainsFilter(empty, false), newAndFilter(newContainsFilter(empty, false), newAndFilter(newContainsFilter(empty, false), newContainsFilter(empty, false)))), true},
{"empty match or chained", newOrFilter(newContainsFilter(empty, false), newOrFilter(newContainsFilter(empty, true), newOrFilter(newContainsFilter(empty, false), newContainsFilter(empty, false)))), true},
{"empty match and", newNotFilter(newAndFilter(newContainsFilter(empty, false), newContainsFilter(empty, false))), false},
{"empty match or", newNotFilter(newOrFilter(newContainsFilter(empty, false), newContainsFilter(empty, false))), false},
} {
t.Run(test.name, func(t *testing.T) {
if test.expectTrue {
Expand All @@ -131,6 +133,7 @@ func Benchmark_LineFilter(b *testing.B) {
re string
}{
{"foo.*"},
{"(?i)foo"},
{".*foo.*"},
{".*foo"},
{"foo|bar"},
Expand All @@ -147,7 +150,7 @@ func Benchmark_LineFilter(b *testing.B) {
}

// see https://dave.cheney.net/2013/06/30/how-to-write-benchmarks-in-go
// A note on compiler optimisations
// A note on compiler optimizations
var res bool

func benchmarkRegex(b *testing.B, re, line string, match bool) {
Expand Down

0 comments on commit 15c4ebb

Please sign in to comment.