diff --git a/libbeat/common/match/cmp.go b/libbeat/common/match/cmp.go
new file mode 100644
index 00000000000..cabf800b7b4
--- /dev/null
+++ b/libbeat/common/match/cmp.go
@@ -0,0 +1,285 @@
+package match
+
+import "regexp/syntax"
+
+// common predefined patterns
+var (
+	patDotStar          = mustParse(`.*`)
+	patNullBeginDotStar = mustParse(`^.*`)
+	patNullEndDotStar   = mustParse(`.*$`)
+
+	patEmptyText      = mustParse(`^$`)
+	patEmptyWhiteText = mustParse(`^\s*$`)
+
+	// patterns matching any content
+	patAny1 = patDotStar
+	patAny2 = mustParse(`^.*`)
+	patAny3 = mustParse(`^.*$`)
+	patAny4 = mustParse(`.*$`)
+
+	patBeginText = mustParse(`^`)
+	patEndText   = mustParse(`$`)
+
+	patDigits = mustParse(`\d`)
+)
+
+// isCaseSensitiveLiteral checks the term being a literal that can be compared
+// byte by byte. Literals flagged case-insensitive (like `(?i)abc`) match more
+// than one spelling and must be left to the regexp package.
+func isCaseSensitiveLiteral(r *syntax.Regexp) bool {
+	return r.Op == syntax.OpLiteral && r.Flags&syntax.FoldCase == 0
+}
+
+// isPrefixLiteral checks regular expression being literal checking string
+// starting with literal pattern (like '^PATTERN')
+func isPrefixLiteral(r *syntax.Regexp) bool {
+	return r.Op == syntax.OpConcat &&
+		len(r.Sub) == 2 &&
+		r.Sub[0].Op == syntax.OpBeginText &&
+		isCaseSensitiveLiteral(r.Sub[1])
+}
+
+// isAltLiterals checks regular expression being an alternation of
+// case-sensitive literals only (like 'A|B|C').
+func isAltLiterals(r *syntax.Regexp) bool {
+	if r.Op != syntax.OpAlternate {
+		return false
+	}
+
+	for _, sub := range r.Sub {
+		if !isCaseSensitiveLiteral(sub) {
+			return false
+		}
+	}
+	return true
+}
+
+// isExactLiteral checks regular expression being a literal anchored at both
+// ends (like '^PATTERN$').
+func isExactLiteral(r *syntax.Regexp) bool {
+	return r.Op == syntax.OpConcat &&
+		len(r.Sub) == 3 &&
+		r.Sub[0].Op == syntax.OpBeginText &&
+		isCaseSensitiveLiteral(r.Sub[1]) &&
+		r.Sub[2].Op == syntax.OpEndText
+}
+
+// isOneOfLiterals checks regular expression being an alternation of literals
+// anchored at both ends (like '^(A|B|C)$').
+func isOneOfLiterals(r *syntax.Regexp) bool {
+	return r.Op == syntax.OpConcat &&
+		len(r.Sub) == 3 &&
+		r.Sub[0].Op == syntax.OpBeginText &&
+		isAltLiterals(r.Sub[1]) &&
+		r.Sub[2].Op == syntax.OpEndText
+}
+
+// isPrefixAltLiterals checks regular expression being alternative literals
+// anchored at the beginning of the string (like '^(A|B|C)').
+func isPrefixAltLiterals(r *syntax.Regexp) bool {
+	return r.Op == syntax.OpConcat &&
+		len(r.Sub) == 2 &&
+		r.Sub[0].Op == syntax.OpBeginText &&
+		isAltLiterals(r.Sub[1])
+}
+
+// isPrefixNumDate checks regular expression being a numeric date-like prefix:
+// '^', an optional literal, then fixed-width digit groups separated by
+// literals (like '^\d{4}-\d{2}-\d{2}' or '^20\d{2}-\d{2}').
+//
+// Every separator must be followed by another digit group. Patterns ending in
+// a separator (like '^\d{2}-') are rejected instead of indexing past r.Sub.
+func isPrefixNumDate(r *syntax.Regexp) bool {
+	if r.Op != syntax.OpConcat || len(r.Sub) < 2 || r.Sub[0].Op != syntax.OpBeginText {
+		return false
+	}
+
+	i := 1
+	if isCaseSensitiveLiteral(r.Sub[i]) {
+		i++
+	}
+
+	// check digits
+	if i >= len(r.Sub) || !isMultiDigits(r.Sub[i]) {
+		return false
+	}
+	i++
+
+	for i < len(r.Sub) {
+		// check separator
+		if !isCaseSensitiveLiteral(r.Sub[i]) {
+			return false
+		}
+		i++
+
+		// check digits, a separator must not end the pattern
+		if i >= len(r.Sub) || !isMultiDigits(r.Sub[i]) {
+			return false
+		}
+		i++
+	}
+
+	return true
+}
+
+// isdotStar checks the term being `.*`.
+func isdotStar(r *syntax.Regexp) bool {
+	return eqRegex(r, patDotStar)
+}
+
+// isEmptyText checks the term being `^$`.
+func isEmptyText(r *syntax.Regexp) bool {
+	return eqRegex(r, patEmptyText)
+}
+
+// isEmptyTextWithWhitespace checks the term being `^\s*$`.
+func isEmptyTextWithWhitespace(r *syntax.Regexp) bool {
+	return eqRegex(r, patEmptyWhiteText)
+}
+
+// isAnyMatch checks the term matching any content, with or without anchors
+// (`.*`, `^.*`, `^.*$`, `.*$`).
+func isAnyMatch(r *syntax.Regexp) bool {
+	return eqRegex(r, patAny1) ||
+		eqRegex(r, patAny2) ||
+		eqRegex(r, patAny3) ||
+		eqRegex(r, patAny4)
+}
+
+// isDigitMatch checks the term being `\d`.
+func isDigitMatch(r *syntax.Regexp) bool {
+	return eqRegex(r, patDigits)
+}
+
+// isMultiDigits checks the term being `\d` repeated a fixed number of times,
+// given as a concatenation repeating the very same `\d` term.
+func isMultiDigits(r *syntax.Regexp) bool {
+	return isConcatRepetition(r) && isDigitMatch(r.Sub[0])
+}
+
+// isConcatRepetition checks the term being a concatenation repeating one and
+// the same sub-expression. An empty concatenation is no repetition.
+func isConcatRepetition(r *syntax.Regexp) bool {
+	if r.Op != syntax.OpConcat || len(r.Sub) == 0 {
+		return false
+	}
+
+	first := r.Sub[0]
+	for _, other := range r.Sub {
+		if other != first { // concat repetitions reuse references => compare pointers
+			return false
+		}
+	}
+
+	return true
+}
+
+// eqRegex checks r and proto being structurally equal: same operator, flags,
+// repetition bounds, runes and (recursively) sub-expressions.
+func eqRegex(r, proto *syntax.Regexp) bool {
+	unmatchable := r.Op != proto.Op || r.Flags != proto.Flags ||
+		(r.Min != proto.Min) || (r.Max != proto.Max) ||
+		(len(r.Sub) != len(proto.Sub)) ||
+		(len(r.Rune) != len(proto.Rune))
+
+	if unmatchable {
+		return false
+	}
+
+	for i := range r.Sub {
+		if !eqRegex(r.Sub[i], proto.Sub[i]) {
+			return false
+		}
+	}
+
+	for i := range r.Rune {
+		if r.Rune[i] != proto.Rune[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// eqPrefixAnyRegex checks the concatenation r starting with any of protos.
+func eqPrefixAnyRegex(r *syntax.Regexp, protos ...*syntax.Regexp) bool {
+	for _, proto := range protos {
+		if eqPrefixRegex(r, proto) {
+			return true
+		}
+	}
+	return false
+}
+
+// eqPrefixRegex checks the concatenation r starting with proto. If proto is a
+// concatenation itself, its terms must equal the leading terms of r in order.
+func eqPrefixRegex(r, proto *syntax.Regexp) bool {
+	if r.Op != syntax.OpConcat {
+		return false
+	}
+
+	if proto.Op != syntax.OpConcat {
+		if len(r.Sub) == 0 {
+			return false
+		}
+		return eqRegex(r.Sub[0], proto)
+	}
+
+	if len(r.Sub) < len(proto.Sub) {
+		return false
+	}
+
+	for i := range proto.Sub {
+		if !eqRegex(r.Sub[i], proto.Sub[i]) {
+			return false
+		}
+	}
+	return true
+}
+
+// eqSuffixAnyRegex checks the concatenation r ending with any of protos.
+func eqSuffixAnyRegex(r *syntax.Regexp, protos ...*syntax.Regexp) bool {
+	for _, proto := range protos {
+		if eqSuffixRegex(r, proto) {
+			return true
+		}
+	}
+	return false
+}
+
+// eqSuffixRegex checks the concatenation r ending with proto. If proto is a
+// concatenation itself, its terms must equal the trailing terms of r in order.
+func eqSuffixRegex(r, proto *syntax.Regexp) bool {
+	if r.Op != syntax.OpConcat {
+		return false
+	}
+
+	if proto.Op != syntax.OpConcat {
+		i := len(r.Sub) - 1
+		if i < 0 {
+			return false
+		}
+		return eqRegex(r.Sub[i], proto)
+	}
+
+	if len(r.Sub) < len(proto.Sub) {
+		return false
+	}
+
+	d := len(r.Sub) - len(proto.Sub)
+	for i := range proto.Sub {
+		if !eqRegex(r.Sub[d+i], proto.Sub[i]) {
+			return false
+		}
+	}
+	return true
+}
+
+// mustParse parses pattern using Perl syntax and panics on invalid patterns.
+// It is meant for the predefined patterns of this package only.
+func mustParse(pattern string) *syntax.Regexp {
+	r, err := syntax.Parse(pattern, syntax.Perl)
+	if err != nil {
+		panic(err)
+	}
+	return r
+}
diff --git a/libbeat/common/match/compile.go b/libbeat/common/match/compile.go
new file mode 100644
index 00000000000..dc0b663b039
--- /dev/null
+++ b/libbeat/common/match/compile.go
@@ -0,0 +1,114 @@
+package match
+
+import (
+	"regexp"
+	"regexp/syntax"
+)
+
+// compile selects the cheapest stringMatcher implementation for the optimized
+// regular expression r. Patterns without a dedicated matcher are compiled by
+// the regexp package.
+func compile(r *syntax.Regexp) (stringMatcher, error) {
+	switch {
+	case isCaseSensitiveLiteral(r):
+		s := string(r.Rune)
+		return &substringMatcher{s, []byte(s)}, nil
+
+	case isExactLiteral(r):
+		s := string(r.Sub[1].Rune)
+		return &equalsMatcher{s, []byte(s)}, nil
+
+	case isAltLiterals(r):
+		return &altSubstringMatcher{literalBytes(r.Sub)}, nil
+
+	case isOneOfLiterals(r):
+		return &oneOfMatcher{literalBytes(r.Sub[1].Sub)}, nil
+
+	case isPrefixLiteral(r):
+		s := []byte(string(r.Sub[1].Rune))
+		return &prefixMatcher{s}, nil
+
+	case isPrefixAltLiterals(r):
+		return &altPrefixMatcher{literalBytes(r.Sub[1].Sub)}, nil
+
+	case isPrefixNumDate(r):
+		return compilePrefixNumDate(r)
+
+	case isEmptyText(r):
+		// stateless matcher, its methods never dereference the nil receiver
+		var m *emptyStringMatcher
+		return m, nil
+
+	case isEmptyTextWithWhitespace(r):
+		var m *emptyWhiteStringMatcher
+		return m, nil
+
+	case isAnyMatch(r):
+		var m *matchAny
+		return m, nil
+
+	default:
+		re, err := regexp.Compile(r.String())
+		if err != nil {
+			return nil, err
+		}
+		return re, nil
+	}
+}
+
+// literalBytes returns the UTF-8 encoded content of the given literal terms.
+func literalBytes(literals []*syntax.Regexp) [][]byte {
+	bs := make([][]byte, 0, len(literals))
+	for _, lit := range literals {
+		bs = append(bs, []byte(string(lit.Rune)))
+	}
+	return bs
+}
+
+// compilePrefixNumDate builds the prefixNumDate matcher for r. r must have
+// been accepted by isPrefixNumDate, which guarantees every separator to be
+// followed by a digit group.
+func compilePrefixNumDate(r *syntax.Regexp) (stringMatcher, error) {
+	m := &prefixNumDate{}
+
+	i := 1
+	if r.Sub[i].Op == syntax.OpLiteral {
+		m.prefix = []byte(string(r.Sub[i].Rune))
+		i++
+	}
+
+	digitLen := func(r *syntax.Regexp) int {
+		if r.Op == syntax.OpConcat {
+			return len(r.Sub)
+		}
+		return 1
+	}
+
+	var digits []int
+	var seps [][]byte
+
+	digits = append(digits, digitLen(r.Sub[i]))
+	i++
+
+	for i < len(r.Sub) {
+		seps = append(seps, []byte(string(r.Sub[i].Rune)))
+		i++
+
+		digits = append(digits, digitLen(r.Sub[i]))
+		i++
+	}
+
+	minLen := len(m.prefix)
+	for _, d := range digits {
+		minLen += d
+	}
+	for _, sep := range seps {
+		minLen += len(sep)
+	}
+
+	m.digits = digits
+	m.seps = seps
+	m.minLen = minLen
+
+	return m, nil
+}
diff --git a/libbeat/common/match/matcher.go b/libbeat/common/match/matcher.go
new file mode 100644
index 00000000000..b848e198135
--- /dev/null
+++ b/libbeat/common/match/matcher.go
@@ -0,0 +1,119 @@
+package match
+
+import "regexp/syntax"
+
+// Matcher matches strings and byte slices against a pattern. Unless the
+// pattern is anchored, a match anywhere in the input is sufficient.
+type Matcher struct {
+	stringMatcher
+}
+
+// ExactMatcher matches strings and byte slices against a pattern that must
+// match the complete input.
+type ExactMatcher struct {
+	stringMatcher
+}
+
+// stringMatcher is the common interface of all matcher implementations.
+type stringMatcher interface {
+	// MatchString tries to find a matching substring.
+	MatchString(s string) (matched bool)
+
+	// Match tries to find a matching substring.
+	Match(bs []byte) (matched bool)
+
+	// String describes the matcher.
+	String() string
+}
+
+// MustCompile is like Compile, but panics if pattern can not be compiled.
+func MustCompile(pattern string) Matcher {
+	m, err := Compile(pattern)
+	if err != nil {
+		panic(err)
+	}
+	return m
+}
+
+// MustCompileExact is like CompileExact, but panics if pattern can not be compiled.
+func MustCompileExact(pattern string) ExactMatcher {
+	m, err := CompileExact(pattern)
+	if err != nil {
+		panic(err)
+	}
+	return m
+}
+
+// Compile regular expression to string matcher. String matcher by default uses
+// regular expressions as provided by regexp library, but tries to optimize some
+// common cases, replacing expensive patterns with cheaper custom implementations
+// or removing terms not necessary for string matching.
+func Compile(pattern string) (Matcher, error) {
+	regex, err := syntax.Parse(pattern, syntax.Perl)
+	if err != nil {
+		return Matcher{}, err
+	}
+
+	regex = optimize(regex).Simplify()
+	m, err := compile(regex)
+	return Matcher{m}, err
+}
+
+// CompileExact is like Compile, but the pattern must match the complete input.
+// The pattern gets anchored with '^' and '$' if not anchored already.
+func CompileExact(pattern string) (ExactMatcher, error) {
+	regex, err := syntax.Parse(pattern, syntax.Perl)
+	if err != nil {
+		return ExactMatcher{}, err
+	}
+
+	regex = regex.Simplify()
+	if regex.Op != syntax.OpConcat {
+		// wrap the single term into '^term$'
+		regex = &syntax.Regexp{
+			Op: syntax.OpConcat,
+			Sub: []*syntax.Regexp{
+				patBeginText,
+				regex,
+				patEndText,
+			},
+			Flags: regex.Flags,
+		}
+	} else {
+		// add the anchors missing in the existing concatenation
+		if !eqPrefixRegex(regex, patBeginText) {
+			regex.Sub = append([]*syntax.Regexp{patBeginText}, regex.Sub...)
+		}
+		if !eqSuffixRegex(regex, patEndText) {
+			regex.Sub = append(regex.Sub, patEndText)
+		}
+	}
+
+	regex = optimize(regex).Simplify()
+	m, err := compile(regex)
+	return ExactMatcher{m}, err
+}
+
+// Unpack compiles s and replaces m with the resulting matcher. On error m is
+// left unchanged.
+func (m *Matcher) Unpack(s string) error {
+	tmp, err := Compile(s)
+	if err != nil {
+		return err
+	}
+
+	*m = tmp
+	return nil
+}
+
+// Unpack compiles s and replaces m with the resulting exact matcher. On error
+// m is left unchanged.
+func (m *ExactMatcher) Unpack(s string) error {
+	tmp, err := CompileExact(s)
+	if err != nil {
+		return err
+	}
+
+	*m = tmp
+	return nil
+}
diff --git a/libbeat/common/match/matcher_bench_test.go b/libbeat/common/match/matcher_bench_test.go
new file mode 100644
index 00000000000..41f3d408f1a
--- /dev/null
+++ b/libbeat/common/match/matcher_bench_test.go
@@ -0,0 +1,144 @@
+package match
+
+import (
+	"bytes"
+	"fmt"
+	"regexp"
+	"testing"
+)
+
+type testContent struct {
+	name  string
+	lines [][]byte
+}
+
+type benchRunner struct {
+	title string
+	f     func(*testing.B)
+}
+
+var allContents = []testContent{
+	mixedContent,
+	logContent,
+	logContent2,
+	logContentLevel,
+}
+
+var mixedContent = makeContent("mixed", `Lorem ipsum dolor sit amet,
+PATTERN consectetur adipiscing elit. Nam vitae turpis augue.
+ Quisque euismod erat tortor, posuere auctor elit fermentum vel. Proin in odio
+
+23-08-2016 eleifend, maximus turpis non, lacinia ligula. Nullam vel pharetra quam, id egestas
+
+massa. Sed a vestibulum libero. Sed tellus lorem, imperdiet non nisl ac,
+ aliquet placerat magna. Sed PATTERN in bibendum eros. Curabitur ut pretium neque. Sed
+23-08-2016 egestas elit et leo consectetur, nec dignissim arcu ultricies. Sed molestie tempor
+
+erat, a maximus sapien rutrum ut. Curabitur congue condimentum dignissim.
+ Mauris hendrerit, velit nec accumsan egestas, augue justo tincidunt risus,
+
+a facilisis nulla augue PATTERN eu metus. Duis vel neque sit amet nunc elementum viverra
+eu ut ligula. Mauris et libero lacus.`)
+
+var logContent = makeContent("simple_log", `23-08-2016 15:10:01 - Lorem ipsum dolor sit amet,
+23-08-2016 15:10:02 - PATTERN consectetur adipiscing elit. Nam vitae turpis augue.
+23-08-2016 15:10:03 - Quisque euismod erat tortor, posuere auctor elit fermentum vel. Proin in odio
+23-08-2016 15:10:05 - 23-08-2016 eleifend, maximus turpis non, lacinia ligula. Nullam vel pharetra quam, id egestas
+23-08-2016 15:10:07 - massa. Sed a vestibulum libero. Sed tellus lorem, imperdiet non nisl ac,
+23-08-2016 15:10:08 - aliquet placerat magna. Sed PATTERN in bibendum eros. Curabitur ut pretium neque. Sed
+23-08-2016 15:10:09 - 23-08-2016 egestas elit et leo consectetur, nec dignissim arcu ultricies. Sed molestie tempor
+23-08-2016 15:10:11 - erat, a maximus sapien rutrum ut. Curabitur congue condimentum dignissim.
+23-08-2016 15:10:12 - Mauris hendrerit, velit nec accumsan egestas, augue justo tincidunt risus,
+23-08-2016 15:10:14 - a facilisis nulla augue PATTERN eu metus. Duis vel neque sit amet nunc elementum viverra
+eu ut ligula. Mauris et libero lacus.
+`)
+
+var logContent2 = makeContent("simple_log2", `2016-08-23 15:10:01 - DEBUG - Lorem ipsum dolor sit amet,
+2016-08-23 15:10:02 - INFO - PATTERN consectetur adipiscing elit. Nam vitae turpis augue.
+2016-08-23 15:10:03 - DEBUG - Quisque euismod erat tortor, posuere auctor elit fermentum vel. Proin in odio
+2016-08-23 15:10:05 - ERROR - 23-08-2016 eleifend, maximus turpis non, lacinia ligula. Nullam vel pharetra quam, id egestas
+2016-08-23 15:10:07 - WARN - massa. Sed a vestibulum libero. Sed tellus lorem, imperdiet non nisl ac,
+2016-08-23 15:10:08 - CRIT - aliquet placerat magna. Sed PATTERN in bibendum eros. Curabitur ut pretium neque. Sed
+2016-08-23 15:10:09 - DEBUG - 23-08-2016 egestas elit et leo consectetur, nec dignissim arcu ultricies. Sed molestie tempor
+2016-08-23 15:10:11 - ERROR - erat, a maximus sapien rutrum ut. Curabitur congue condimentum dignissim.
+2016-08-23 15:10:12 - INFO - Mauris hendrerit, velit nec accumsan egestas, augue justo tincidunt risus,
+2016-08-23 15:10:14 - INFO - a facilisis nulla augue PATTERN eu metus. Duis vel neque sit amet nunc elementum viverra eu ut ligula. Mauris et libero lacus.
+`)
+
+var logContentLevel = makeContent("simple_log_with_level", `DEBUG - 2016-08-23 15:10:01 - Lorem ipsum dolor sit amet,
+INFO - 2016-08-23 15:10:02 - PATTERN consectetur adipiscing elit. Nam vitae turpis augue.
+DEBUG - 2016-08-23 15:10:03 - Quisque euismod erat tortor, posuere auctor elit fermentum vel. Proin in odio
+ERROR - 2016-08-23 15:10:05 - 23-08-2016 eleifend, maximus turpis non, lacinia ligula. Nullam vel pharetra quam, id egestas
+WARN - 2016-08-23 15:10:07 - massa. Sed a vestibulum libero. Sed tellus lorem, imperdiet non nisl ac,
+CRIT - 2016-08-23 15:10:08 - aliquet placerat magna. Sed PATTERN in bibendum eros. Curabitur ut pretium neque. Sed
+DEBUG - 2016-08-23 15:10:09 - 23-08-2016 egestas elit et leo consectetur, nec dignissim arcu ultricies. Sed molestie tempor
+ERROR -2016-08-23 15:10:11 - erat, a maximus sapien rutrum ut. Curabitur congue condimentum dignissim.
+DEBUG - 2016-08-23 15:10:12 - Mauris hendrerit, velit nec accumsan egestas, augue justo tincidunt risus,
+INFO - 2016-08-23 15:10:14 - a facilisis nulla augue PATTERN eu metus. Duis vel neque sit amet nunc elementum viverra
+eu ut ligula. Mauris et libero lacus.
+`)
+
+func BenchmarkPatterns(b *testing.B) {
+	patterns := []struct {
+		title string
+		regex string
+	}{
+		{"match any 1", `^.*$`},
+		{"match any 2", `.*`},
+		{"startsWith 'PATTERN'", `^PATTERN`},
+		{"startsWith ' '", `^ `},
+		{"startsWithDate", `^\d{2}-\d{2}-\d{4}`},
+		{"startsWithDate2", `^\d{4}-\d{2}-\d{2}`},
+		{"startsWithDate3", `^20\d{2}-\d{2}-\d{2}`},
+		{"startsWithLevel", `^(DEBUG|INFO|WARN|ERR|CRIT)`},
+		{"hasLevel", `(DEBUG|INFO|WARN|ERR|CRIT)`},
+		{"contains 'PATTERN'", `PATTERN`},
+		{"contains 'PATTERN' with '.*", `.*PATTERN.*`},
+		{"empty line", `^$`},
+		{"empty line with optional whitespace", `^\s*$`},
+	}
+
+	runTitle := func(matcher, name, content string) string {
+		return fmt.Sprintf("Name=%v, Matcher=%v, Content=%v", name, matcher, content)
+	}
+
+	for i, pattern := range patterns {
+		b.Logf("benchmark (%v): %v", i, pattern.title)
+
+		regex := regexp.MustCompile(pattern.regex)
+		matcher := MustCompile(pattern.regex)
+
+		b.Logf("regex: %v", regex)
+		b.Logf("matcher: %v", matcher)
+
+		for _, content := range allContents {
+			title := runTitle("Regex", pattern.title, content.name)
+			runner := makeRunner(title, content.lines, regex.Match)
+			b.Run(runner.title, runner.f)
+
+			title = runTitle("Match", pattern.title, content.name)
+			runner = makeRunner(title, content.lines, matcher.Match)
+			b.Run(runner.title, runner.f)
+		}
+	}
+}
+
+func makeRunner(title string, content [][]byte, m func([]byte) bool) benchRunner {
+	return benchRunner{
+		title,
+		func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				for _, line := range content {
+					m(line)
+				}
+			}
+		},
+	}
+}
+
+func makeContent(name, s string) testContent {
+	return testContent{
+		name,
+		bytes.Split([]byte(s), []byte("\n")),
+	}
+}
diff --git a/libbeat/common/match/matcher_test.go b/libbeat/common/match/matcher_test.go
new file mode 100644
index 00000000000..9e15b1eca56
--- /dev/null
+++ b/libbeat/common/match/matcher_test.go
@@ -0,0 +1,349 @@
+package match
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestMatchers(t *testing.T) {
+	typeOf := func(v interface{}) reflect.Type {
+		return reflect.TypeOf(v)
+	}
+
+	tests := []struct {
+		pattern     string
+		matcherType reflect.Type
+		matches     []string
+		noMatches   []string
+	}{
+		{
+			`.*`,
+			typeOf((*matchAny)(nil)),
+			[]string{
+				"any matches always",
+			},
+			nil,
+		},
+		{
+			`^$`,
+			typeOf((*emptyStringMatcher)(nil)),
+			[]string{""},
+			[]string{"not empty"},
+		},
+		{
+			`^\s*$`,
+			typeOf((*emptyWhiteStringMatcher)(nil)),
+			[]string{"", " ", " ", "\t", "\n"},
+			[]string{"not empty"},
+		},
+		{
+			`substring`,
+			typeOf((*substringMatcher)(nil)),
+			[]string{
+				"has substring in middle",
+				"substring at beginning",
+				"ends with substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`^.*substring`,
+			typeOf((*substringMatcher)(nil)),
+			[]string{
+				"has substring in middle",
+				"substring at beginning",
+				"ends with substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`substring.*$`,
+			typeOf((*substringMatcher)(nil)),
+			[]string{
+				"has substring in middle",
+				"substring at beginning",
+				"ends with substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`^.*substring.*$`,
+			typeOf((*substringMatcher)(nil)),
+			[]string{
+				"has substring in middle",
+				"substring at beginning",
+				"ends with substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`^equals$`,
+			typeOf((*equalsMatcher)(nil)),
+			[]string{"equals"},
+			[]string{"not equals"},
+		},
+		{
+			`(alt|substring)`,
+			typeOf((*altSubstringMatcher)(nil)),
+			[]string{
+				"has alt in middle",
+				"alt at beginning",
+				"uses substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`alt|substring`,
+			typeOf((*altSubstringMatcher)(nil)),
+			[]string{
+				"has alt in middle",
+				"alt at beginning",
+				"uses substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`^prefix`,
+			typeOf((*prefixMatcher)(nil)),
+			[]string{"prefix string match"},
+			[]string{"missing prefix string"},
+		},
+		{
+			`^(DEBUG|INFO|ERROR)`,
+			typeOf((*altPrefixMatcher)(nil)),
+			[]string{
+				"DEBUG - should match",
+				"INFO - should match too",
+				"ERROR - yep",
+			},
+			[]string{
+				"This should not match",
+			},
+		},
+		{
+			`^\d{4}-\d{2}-\d{2}`,
+			typeOf((*prefixNumDate)(nil)),
+			[]string{
+				"2017-01-02 should match",
+				"2017-01-03 should also match",
+			},
+			[]string{
+				"- 2017-01-02 should not match",
+				"fail",
+			},
+		},
+		{
+			`^20\d{2}-\d{2}-\d{2}`,
+			typeOf((*prefixNumDate)(nil)),
+			[]string{
+				"2017-01-02 should match",
+				"2017-01-03 should also match",
+			},
+			[]string{
+				"- 2017-01-02 should not match",
+				"fail",
+			},
+		},
+		{
+			`^20\d{2}-\d{2}-\d{2} \d{2}:\d{2}`,
+			typeOf((*prefixNumDate)(nil)),
+			[]string{
+				"2017-01-02 10:10 should match",
+				"2017-01-03 10:11 should also match",
+			},
+			[]string{
+				"- 2017-01-02 should not match",
+				"fail",
+			},
+		},
+	}
+
+	for i, test := range tests {
+		t.Logf("run test (%v): %v", i, test.pattern)
+
+		matcher, err := Compile(test.pattern)
+		if err != nil {
+			t.Error(err)
+			continue
+		}
+
+		t.Logf(" matcher: %v", matcher)
+
+		matcherType := reflect.TypeOf(matcher.stringMatcher)
+		if matcherType != test.matcherType {
+			t.Errorf(" Matcher type mismatch (expected=%v, actual=%v)",
+				test.matcherType,
+				matcherType,
+			)
+		}
+
+		for _, content := range test.matches {
+			if !matcher.MatchString(content) {
+				t.Errorf(" failed to match string: '%v'", content)
+				continue
+			}
+
+			if !matcher.Match([]byte(content)) {
+				t.Errorf(" failed to match byte string: '%v'", content)
+				continue
+			}
+		}
+
+		for _, content := range test.noMatches {
+			if matcher.MatchString(content) {
+				t.Errorf(" should not match string: '%v'", content)
+				continue
+			}
+
+			if matcher.Match([]byte(content)) {
+				t.Errorf(" should not match string: '%v'", content)
+				continue
+			}
+		}
+	}
+}
+
+func TestExactMatchers(t *testing.T) {
+	typeOf := func(v interface{}) reflect.Type {
+		return reflect.TypeOf(v)
+	}
+
+	tests := []struct {
+		pattern     string
+		matcherType reflect.Type
+		matches     []string
+		noMatches   []string
+	}{
+		{
+			`.*`,
+			typeOf((*matchAny)(nil)),
+			[]string{
+				"any matches always",
+			},
+			nil,
+		},
+		{
+			`^$`,
+			typeOf((*emptyStringMatcher)(nil)),
+			[]string{""},
+			[]string{"not empty"},
+		},
+		{
+			`^\s*$`,
+			typeOf((*emptyWhiteStringMatcher)(nil)),
+			[]string{"", " ", " ", "\t", "\n"},
+			[]string{"not empty"},
+		},
+		{
+			`.*substring.*`,
+			typeOf((*substringMatcher)(nil)),
+			[]string{
+				"has substring in middle",
+				"substring at beginning",
+				"ends with substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`^.*substring.*`,
+			typeOf((*substringMatcher)(nil)),
+			[]string{
+				"has substring in middle",
+				"substring at beginning",
+				"ends with substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`.*substring.*$`,
+			typeOf((*substringMatcher)(nil)),
+			[]string{
+				"has substring in middle",
+				"substring at beginning",
+				"ends with substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`^.*substring.*$`,
+			typeOf((*substringMatcher)(nil)),
+			[]string{
+				"has substring in middle",
+				"substring at beginning",
+				"ends with substring",
+			},
+			[]string{"missing sub-string"},
+		},
+		{
+			`equals`,
+			typeOf((*equalsMatcher)(nil)),
+			[]string{"equals"},
+			[]string{"not equals"},
+		},
+		{
+			`^equals`,
+			typeOf((*equalsMatcher)(nil)),
+			[]string{"equals"},
+			[]string{"not equals"},
+		},
+		{
+			`equals$`,
+			typeOf((*equalsMatcher)(nil)),
+			[]string{"equals"},
+			[]string{"not equals"},
+		},
+		{
+			`DEBUG|INFO`,
+			typeOf((*oneOfMatcher)(nil)),
+			[]string{
+				"DEBUG",
+				"INFO",
+			},
+			[]string{"none"},
+		},
+	}
+
+	for i, test := range tests {
+		t.Logf("run test (%v): %v", i, test.pattern)
+
+		matcher, err := CompileExact(test.pattern)
+		if err != nil {
+			t.Error(err)
+			continue
+		}
+
+		t.Logf(" matcher: %v", matcher)
+
+		matcherType := reflect.TypeOf(matcher.stringMatcher)
+		if matcherType != test.matcherType {
+			t.Errorf(" Matcher type mismatch (expected=%v, actual=%v)",
+				test.matcherType,
+				matcherType,
+			)
+		}
+
+		for _, content := range test.matches {
+			if !matcher.MatchString(content) {
+				t.Errorf(" failed to match string: '%v'", content)
+				continue
+			}
+
+			if !matcher.Match([]byte(content)) {
+				t.Errorf(" failed to match byte string: '%v'", content)
+				continue
+			}
+		}
+
+		for _, content := range test.noMatches {
+			if matcher.MatchString(content) {
+				t.Errorf(" should not match string: '%v'", content)
+				continue
+			}
+
+			if matcher.Match([]byte(content)) {
+				t.Errorf(" should not match string: '%v'", content)
+				continue
+			}
+		}
+	}
+}
diff --git a/libbeat/common/match/matchers.go b/libbeat/common/match/matchers.go
new file mode 100644
index 00000000000..7ec6813b883
--- /dev/null
+++ b/libbeat/common/match/matchers.go
@@ -0,0 +1,259 @@
+package match
+
+import (
+	"bytes"
+	"fmt"
+	"reflect"
+	"runtime"
+	"strings"
+	"unsafe"
+)
+
+// equalsMatcher matches inputs being equal to a literal.
+type equalsMatcher struct {
+	s  string
+	bs []byte
+}
+
+// substringMatcher matches inputs containing a literal.
+type substringMatcher struct {
+	s  string
+	bs []byte
+}
+
+// altSubstringMatcher matches inputs containing any of the literals.
+type altSubstringMatcher struct {
+	literals [][]byte
+}
+
+// oneOfMatcher matches inputs being equal to any of the literals.
+type oneOfMatcher struct {
+	literals [][]byte
+}
+
+// prefixMatcher matches inputs starting with a literal.
+type prefixMatcher struct {
+	s []byte
+}
+
+// altPrefixMatcher matches inputs starting with any of the literals.
+type altPrefixMatcher struct {
+	literals [][]byte
+}
+
+// prefixNumDate matches inputs starting with an optional literal prefix,
+// followed by groups of ASCII digits separated by literals.
+type prefixNumDate struct {
+	minLen int      // total length of prefix, digit groups and separators
+	digits []int    // number of digits per group
+	prefix []byte   // optional literal in front of the first digit group
+	seps   [][]byte // separators, seps[i-1] precedes digit group i
+}
+
+// emptyStringMatcher matches the empty input only.
+type emptyStringMatcher struct{}
+
+// emptyWhiteStringMatcher matches inputs consisting of whitespace only,
+// including the empty input.
+type emptyWhiteStringMatcher struct{}
+
+// matchAny matches any input.
+type matchAny struct{}
+
+func (m *equalsMatcher) MatchString(s string) bool {
+	return m.s == s
+}
+
+func (m *equalsMatcher) Match(bs []byte) bool {
+	return bytes.Equal(bs, m.bs)
+}
+
+func (m *equalsMatcher) String() string {
+	return fmt.Sprintf("<string '%v'>", m.s)
+}
+
+func (m *substringMatcher) MatchString(s string) bool {
+	return strings.Contains(s, m.s)
+}
+
+func (m *substringMatcher) Match(bs []byte) bool {
+	return bytes.Contains(bs, m.bs)
+}
+
+func (m *substringMatcher) String() string {
+	return fmt.Sprintf("<substring '%v'>", m.s)
+}
+
+func (m *altSubstringMatcher) MatchString(s string) bool {
+	return m.Match(stringToBytes(s))
+}
+
+func (m *altSubstringMatcher) Match(in []byte) bool {
+	for _, literal := range m.literals {
+		if bytes.Contains(in, literal) {
+			return true
+		}
+	}
+	return false
+}
+
+func (m *altSubstringMatcher) String() string {
+	return fmt.Sprintf("<alt substring '%s'>", bytes.Join(m.literals, []byte(",")))
+}
+
+func (m *oneOfMatcher) MatchString(s string) bool {
+	return m.Match(stringToBytes(s))
+}
+
+func (m *oneOfMatcher) Match(in []byte) bool {
+	for _, literal := range m.literals {
+		if bytes.Equal(in, literal) {
+			return true
+		}
+	}
+	return false
+}
+
+func (m *oneOfMatcher) String() string {
+	return fmt.Sprintf("<one of '%s'>", bytes.Join(m.literals, []byte(",")))
+}
+
+func (m *prefixMatcher) MatchString(s string) bool {
+	return len(s) >= len(m.s) && s[0:len(m.s)] == string(m.s)
+}
+
+func (m *prefixMatcher) Match(bs []byte) bool {
+	return bytes.HasPrefix(bs, m.s)
+}
+
+func (m *prefixMatcher) String() string {
+	return fmt.Sprintf("<prefix string '%v'>", string(m.s))
+}
+
+func (m *altPrefixMatcher) MatchString(in string) bool {
+	for _, s := range m.literals {
+		if len(in) >= len(s) && in[0:len(s)] == string(s) {
+			return true
+		}
+	}
+	return false
+}
+
+func (m *altPrefixMatcher) Match(bs []byte) bool {
+	for _, s := range m.literals {
+		if bytes.HasPrefix(bs, s) {
+			return true
+		}
+	}
+	return false
+}
+
+func (m *altPrefixMatcher) String() string {
+	return fmt.Sprintf("<alt prefix string '%s'>", bytes.Join(m.literals, []byte(",")))
+}
+
+func (m *prefixNumDate) MatchString(in string) bool {
+	return m.Match(stringToBytes(in))
+}
+
+func (m *prefixNumDate) Match(in []byte) bool {
+	// minLen guarantees all index operations below to be in bounds
+	if len(in) < m.minLen {
+		return false
+	}
+
+	pos := len(m.prefix)
+	if !bytes.Equal(in[:pos], m.prefix) {
+		return false
+	}
+
+	for i, cnt := range m.digits {
+		if i > 0 {
+			sep := m.seps[i-1]
+			if !bytes.Equal(in[pos:pos+len(sep)], sep) {
+				return false
+			}
+			pos += len(sep)
+		}
+
+		for end := pos + cnt; pos < end; pos++ {
+			if v := in[pos]; v < '0' || v > '9' {
+				return false
+			}
+		}
+	}
+
+	return true
+}
+
+func (m *prefixNumDate) String() string {
+	return "<prefix num date>"
+}
+
+func (m *emptyStringMatcher) MatchString(s string) bool {
+	return len(s) == 0
+}
+
+func (m *emptyStringMatcher) Match(bs []byte) bool {
+	return len(bs) == 0
+}
+
+func (m *emptyStringMatcher) String() string {
+	return "<empty>"
+}
+
+func (m *emptyWhiteStringMatcher) MatchString(s string) bool {
+	for _, r := range s {
+		if !isPerlSpace(r) {
+			return false
+		}
+	}
+	return true
+}
+
+func (m *emptyWhiteStringMatcher) Match(bs []byte) bool {
+	for _, r := range bytesToString(bs) {
+		if !isPerlSpace(r) {
+			return false
+		}
+	}
+	return true
+}
+
+func (m *emptyWhiteStringMatcher) String() string {
+	return "<empty whitespace>"
+}
+
+// isPerlSpace reports whether r is matched by the Perl character class `\s`
+// (tab, newline, form feed, carriage return or space).
+func isPerlSpace(r rune) bool {
+	switch r {
+	case '\t', '\n', '\f', '\r', ' ':
+		return true
+	}
+	return false
+}
+
+func (m *matchAny) Match(_ []byte) bool       { return true }
+func (m *matchAny) MatchString(_ string) bool { return true }
+func (m *matchAny) String() string            { return "<any>" }
+
+// bytesToString converts b to a string without copying. The string shares
+// memory with b, so b must not be modified while the string is in use.
+func bytesToString(b []byte) string {
+	return *(*string)(unsafe.Pointer(&b))
+}
+
+// stringToBytes converts s to a byte slice without copying. The slice shares
+// memory with s and must be treated as read-only.
+func stringToBytes(s string) []byte {
+	var b []byte
+	// Fill the header of a real slice instead of declaring a standalone
+	// reflect.SliceHeader, whose Data field is no reference for the garbage
+	// collector (see package unsafe, pointer conversion rule 6).
+	sh := (*reflect.StringHeader)(unsafe.Pointer(&s))
+	bh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
+	bh.Data, bh.Len, bh.Cap = sh.Data, sh.Len, sh.Len
+	runtime.KeepAlive(s)
+	return b
+}
diff --git a/libbeat/common/match/optimize.go b/libbeat/common/match/optimize.go
new file mode 100644
index 00000000000..cb485c63397
--- /dev/null
+++ b/libbeat/common/match/optimize.go
@@ -0,0 +1,206 @@
+package match
+
+import "regexp/syntax"
+
+// trans is a single optimization step. It returns the (possibly rewritten)
+// expression and reports whether the expression has been changed.
+type trans func(*syntax.Regexp) (bool, *syntax.Regexp)
+
+// transformations are applied in order by optimize.
+var transformations = []trans{
+	simplify,
+	uncapture,
+	trimLeft,
+	trimRight,
+	unconcat,
+	concatRepetition,
+}
+
+// optimize runs minimal regular expression optimizations
+// until fix-point.
+func optimize(r *syntax.Regexp) *syntax.Regexp {
+	for {
+		changed := false
+		for _, t := range transformations {
+			var upd bool
+			upd, r = t(r)
+			changed = changed || upd
+		}
+
+		if !changed {
+			return r
+		}
+	}
+}
+
+// simplify rewrites the regular expression using Simplify from stdlib. It
+// never reports a change, so it does not keep optimize from terminating.
+func simplify(r *syntax.Regexp) (bool, *syntax.Regexp) {
+	return false, r.Simplify()
+}
+
+// uncapture optimizes regular expression by removing capture groups from
+// regular expression potentially allocating memory when executed.
+func uncapture(r *syntax.Regexp) (bool, *syntax.Regexp) {
+	if r.Op == syntax.OpCapture {
+		// try to uncapture
+		if len(r.Sub) == 1 {
+			_, sub := uncapture(r.Sub[0])
+			return true, sub
+		}
+
+		tmp := *r
+		tmp.Op = syntax.OpConcat
+		r = &tmp
+	}
+
+	sub := make([]*syntax.Regexp, len(r.Sub))
+	modified := false
+	for i := range r.Sub {
+		var m bool
+		m, sub[i] = uncapture(r.Sub[i])
+		modified = modified || m
+	}
+
+	if !modified {
+		return false, r
+	}
+
+	tmp := *r
+	tmp.Sub = sub
+	return true, &tmp
+}
+
+// trimLeft removes not required '.*' from beginning of regular expressions.
+func trimLeft(r *syntax.Regexp) (bool, *syntax.Regexp) {
+	if eqPrefixAnyRegex(r, patDotStar, patNullBeginDotStar) {
+		tmp := *r
+		tmp.Sub = tmp.Sub[1:]
+		return true, &tmp
+	}
+
+	return false, r
+}
+
+// trimRight removes not required '.*' from end of regular expressions.
+func trimRight(r *syntax.Regexp) (bool, *syntax.Regexp) {
+	if eqSuffixAnyRegex(r, patDotStar, patNullEndDotStar) {
+		i := len(r.Sub) - 1
+		tmp := *r
+		tmp.Sub = tmp.Sub[0:i]
+		return true, &tmp
+	}
+
+	return false, r
+}
+
+// unconcat removes intermediate regular expression concatenations generated by
+// parser if concatenation contains only 1 element. Removal of object from
+// parse-tree can enable other optimization to fire. An empty concatenation
+// becomes an empty match, and a repetition of exactly 1 is replaced by its
+// operand.
+func unconcat(r *syntax.Regexp) (bool, *syntax.Regexp) {
+	switch {
+	case r.Op == syntax.OpConcat && len(r.Sub) <= 1:
+		if len(r.Sub) == 1 {
+			return true, r.Sub[0]
+		}
+
+		return true, &syntax.Regexp{
+			Op:    syntax.OpEmptyMatch,
+			Flags: r.Flags,
+		}
+
+	case r.Op == syntax.OpRepeat && r.Min == r.Max && r.Min == 1:
+		return true, r.Sub[0]
+	}
+
+	return false, r
+}
+
+// concatRepetition concatenates multiple repeated sub-patterns into
+// a repetition of exactly N (like `xx` into `x{2}`). Only adjacent equal
+// terms of a concatenation are merged, one pair per term and call.
+func concatRepetition(r *syntax.Regexp) (bool, *syntax.Regexp) {
+	if r.Op != syntax.OpConcat || len(r.Sub) == 0 {
+		// don't iterate sub-expressions if top-level is no (non-empty) OpConcat
+		return false, r
+	}
+
+	// check if concatenated op is already a repetition
+	if isConcatRepetition(r) {
+		return false, r
+	}
+
+	// concatenate repetitions in sub-expressions first
+	subs := make([]*syntax.Regexp, 0, len(r.Sub))
+	changed := false
+	for _, sub := range r.Sub {
+		changedSub, tmp := concatRepetition(sub)
+		changed = changed || changedSub
+		subs = append(subs, tmp)
+	}
+
+	var concat []*syntax.Regexp
+	lastMerged := -1
+	for i, j := 0, 1; j < len(subs); i, j = j, j+1 {
+		cur, next := subs[i], subs[j]
+
+		var merged *syntax.Regexp
+		switch {
+		case cur.Op == syntax.OpRepeat && eqRegex(cur.Sub[0], next):
+			// x{n,m}x => x{n+1,m+1}
+			merged = &syntax.Regexp{
+				Op:    syntax.OpRepeat,
+				Sub:   cur.Sub,
+				Min:   cur.Min + 1,
+				Max:   cur.Max + 1,
+				Flags: cur.Flags,
+			}
+
+		case isConcatRepetition(cur) && eqRegex(cur.Sub[0], next):
+			// (xx)x => (xxx). Copy the terms, appending to cur.Sub directly
+			// could write into a backing array shared with other expressions.
+			sub := make([]*syntax.Regexp, 0, len(cur.Sub)+1)
+			sub = append(append(sub, cur.Sub...), cur.Sub[0])
+			merged = &syntax.Regexp{
+				Op:    syntax.OpConcat,
+				Sub:   sub,
+				Flags: cur.Flags,
+			}
+
+		case eqRegex(cur, next):
+			// xx => x{2}
+			merged = &syntax.Regexp{
+				Op:    syntax.OpRepeat,
+				Sub:   []*syntax.Regexp{cur},
+				Min:   2,
+				Max:   2,
+				Flags: cur.Flags,
+			}
+		}
+
+		if merged == nil {
+			concat = append(concat, cur)
+			continue
+		}
+
+		// next has been consumed by the merge, skip it
+		concat = append(concat, merged)
+		lastMerged = j
+		changed = true
+		j++
+	}
+
+	// the last term has not been consumed by a merge
+	if lastMerged+1 != len(subs) {
+		concat = append(concat, subs[len(subs)-1])
+	}
+
+	r = &syntax.Regexp{
+		Op:    syntax.OpConcat,
+		Sub:   concat,
+		Flags: r.Flags,
+	}
+	return changed, r
+}