From c443adb9cd8fd5533beb669c6a0d8d4fb4d13385 Mon Sep 17 00:00:00 2001 From: Wade McEwen Date: Wed, 27 Jul 2016 16:37:11 -0500 Subject: [PATCH 1/2] AC-1604 sparkql built-in conjunction evaluator Here is a rewrite of the processing for the boolean algebra in sparkql evaluation. It takes a poorly modeled implementation that tracks nesting levels, to one that only cares about sparkql 'block_groups'. It turns out, these groups can be evaluated in a very consistent manner, and cleaned up this logic a ton. My first approach involved rebuilding the AST. That started to look expensive, and the original implementation kept things nice and compact, so I went back that direction, even though it's a little harder to debug. As far as my testing goes, this solution works for all the things the api supports, including the reported problems, and some dropped field resolution we were not doing before. --- lib/sparkql.rb | 2 + lib/sparkql/evaluator.rb | 152 ++++++++++++++++++ lib/sparkql/expression_resolver.rb | 11 ++ lib/sparkql/parser_tools.rb | 9 +- .../boolean_or_bust_expression_resolver.rb | 12 ++ test/unit/evaluator_test.rb | 84 ++++++++++ 6 files changed, 266 insertions(+), 4 deletions(-) create mode 100644 lib/sparkql/evaluator.rb create mode 100644 lib/sparkql/expression_resolver.rb create mode 100644 test/support/boolean_or_bust_expression_resolver.rb create mode 100644 test/unit/evaluator_test.rb diff --git a/lib/sparkql.rb b/lib/sparkql.rb index 4c81734..bba4494 100644 --- a/lib/sparkql.rb +++ b/lib/sparkql.rb @@ -2,6 +2,8 @@ require "sparkql/token" require "sparkql/errors" require "sparkql/expression_state" +require "sparkql/expression_resolver" +require "sparkql/evaluator" require "sparkql/lexer" require "sparkql/function_resolver" require "sparkql/parser_tools" diff --git a/lib/sparkql/evaluator.rb b/lib/sparkql/evaluator.rb new file mode 100644 index 0000000..ee30770 --- /dev/null +++ b/lib/sparkql/evaluator.rb @@ -0,0 +1,152 @@ +# Using an external source for resolving the individual sparkql expressions, +# this class will evaluate the rest of a parsed sparkql string to true or false. +# Namely, this class will handle all the nesting, boolean algebra, and dropped +# fields. Plus, it has some optimizations built in to skip the processing for +# any expressions that don't contribute to the net result of the filter. +class Sparkql::Evaluator + + attr_reader :processed_count + + def initialize expression_resolver + @resolver = expression_resolver + end + + def evaluate(expressions) + @processed_count = 0 + @index = { + level: 0, + block_group: 0, + conjunction: "And", + conjunction_level: 0, + match: true, + good_ors: false, + expressions: 0 + } + @groups = [@index] + expressions.each do |expression| + handle_group(expression) + adjust_expression_for_dropped_field(expression) + check_for_good_ors(expression) + next if skip?(expression) + result = evaluate_expression(expression) + end + cleanup + return @index[:match] + end + + private + + # prepare the group stack for the next expression + def handle_group(expression) + if @index[:block_group] == expression[:block_group] + # Noop + elsif @index[:block_group] < expression[:block_group] + @index = new_group(expression) + @groups.push(@index) + else + # Turn the group into an expression, resolve down to previous group(s) + smoosh_group(expression) + end + end + + # Here's the real meat. We use an internal stack to represent the result of + # each block_group. This logic is re-used when merging the final result of one + # block group with the previous. + def evaluate_expression(expression) + @processed_count += 1 + evaluate_node(expression, @resolver.resolve(expression)) + end + def evaluate_node(node, result) + if result == :drop + @dropped_expression = node + return result + end + if node[:unary] == "Not" + result = !result + end + if node[:conjunction] == 'Not' && + (node[:conjunction_level] == node[:level] || + node[:conjunction_level] == @index[:level]) + @index[:match] = !result + elsif node[:conjunction] == 'And' || @index[:expressions] == 0 + @index[:match] = result if @index[:match] + elsif node[:conjunction] == 'Or' && result + @index[:match] = result + end + @index[:expressions] += 1 + result + end + + # Optimization logic, once we find any set of And'd expressions that pass and + # run into an Or at the same level, we can skip further processing at that + # level. + def check_for_good_ors(expression) + if expression[:conjunction] == 'Or' + good_index = @index + unless expression[:conjunction_level] == @index[:level] + good_index = nil + # Well crap, now we need to go back and find that level by hand + @groups.reverse_each do |i| + if i[:level] == expression[:conjunction_level] + good_index = i + end + end + end + if !good_index.nil? && good_index[:expressions] > 0 && good_index[:match] + good_index[:good_ors] = true + end + end + end + + # We can skip further expression processing when And-d with a false expression + # or a "good Or" was already encountered. + def skip?(expression) + @index[:good_ors] || + !@index[:match] && expression[:conjunction] == 'And' + end + + def new_group(expression) + { + level: expression[:level], + block_group: expression[:block_group], + conjunction: expression[:conjunction], + conjunction_level: expression[:conjunction_level], + match: true, + good_ors: false, + expressions: 0 + } + end + + # When the last expression was dropped, we need to repair the filter by + # stealing the conjunction of that dropped field. + def adjust_expression_for_dropped_field(expression) + if @dropped_expression.nil? + return + elsif @dropped_expression[:block_group] == expression[:block_group] + expression[:conjunction] = @dropped_expression[:conjunction] + expression[:conjunction_level] = @dropped_expression[:conjunction_level] + end + @dropped_expression = nil + end + + # This is similar to the cleanup step, but happens when we return from a + # nesting level. Before we can proceed, we need wrap up the result of the + # nested group. + def smoosh_group(expression) + until @groups.last[:block_group] == expression[:block_group] + last = @groups.pop + @index = @groups.last + evaluate_node(last, last[:match]) + end + end + + # pop off the group stack, evaluating each group with the previous as we go. + def cleanup + while @groups.size > 1 + last = @groups.pop + @index = @groups.last + evaluate_node(last, last[:match]) + end + @groups.last[:match] + end +end diff --git a/lib/sparkql/expression_resolver.rb b/lib/sparkql/expression_resolver.rb new file mode 100644 index 0000000..0d841e2 --- /dev/null +++ b/lib/sparkql/expression_resolver.rb @@ -0,0 +1,11 @@ +# Base class for handling expression resolution +class Sparkql::ExpressionResolver + + VALID_RESULTS = [true, false, :drop] + + # Evaluate the result of this expression Allows for any of the values in + # VALID_RESULTS + def resolve(expression) + true + end +end diff --git a/lib/sparkql/parser_tools.rb b/lib/sparkql/parser_tools.rb index ca382a9..695b93e 100644 --- a/lib/sparkql/parser_tools.rb +++ b/lib/sparkql/parser_tools.rb @@ -60,16 +60,17 @@ def tokenize_conjunction(exp1, conj, exp2) end def tokenize_unary_conjunction(conj, exp) - # Handles the case when a SparkQL filter string # begins with a unary operator, and is nested, such as: - # Not (Not Field Eq 1) + # Not (Not Field Eq 1) + # In this instance we treat the outer unary as a conjunction. if @expression_count == 1 && @lexer.level > 0 - exp.first[:conjunction] = conj + exp.first[:conjunction] = conj + exp.first[:conjunction_level] = @lexer.level - 1 end - exp.first[:unary] = conj exp.first[:unary_level] = @lexer.level + exp end diff --git a/test/support/boolean_or_bust_expression_resolver.rb b/test/support/boolean_or_bust_expression_resolver.rb new file mode 100644 index 0000000..2cdf044 --- /dev/null +++ b/test/support/boolean_or_bust_expression_resolver.rb @@ -0,0 +1,12 @@ +# A super simple expression resolver for testing... returns the boolean value as +# the result for the expression, or when not a boolean, drops the expression. +class BooleanOrBustExpressionResolver < Sparkql::ExpressionResolver + + def resolve(expression) + if expression[:type] == :boolean + "true" == expression[:value] + else + :drop + end + end +end diff --git a/test/unit/evaluator_test.rb b/test/unit/evaluator_test.rb new file mode 100644 index 0000000..36b5c76 --- /dev/null +++ b/test/unit/evaluator_test.rb @@ -0,0 +1,84 @@ +require 'test_helper' +require 'support/boolean_or_bust_expression_resolver' + +class EvaluatorTest < Test::Unit::TestCase + include Sparkql + + def test_simple + assert sample('Test Eq true') + assert !sample('Test Eq false') + assert sample("Test Eq 'Drop'") + end + + def test_conjunction + assert sample('Test Eq true And Test Eq true') + assert !sample('Test Eq false And Test Eq true') + assert !sample('Test Eq false And Test Eq false') + # Ors + assert sample("Test Eq true Or Test Eq true") + assert sample("Test Eq true Or Test Eq false") + assert sample("Test Eq false Or Test Eq true") + assert !sample("Test Eq false Or Test Eq false") + end + + def test_dropped_field_handling + assert sample("Test Eq 'Drop' And Test Eq true") + assert !sample("Test Eq 'Drop' And Test Eq false") + assert !sample("Test Eq 'Drop' Or Test Eq false") + assert sample("Test Eq 'Drop' Or Test Eq true") + assert sample("Test Eq false And Test Eq 'Drop' Or Test Eq true") + assert sample("Test Eq false Or (Test Eq 'Drop' And Test Eq true)") + end + + def test_nesting + assert sample("Test Eq true Or (Test Eq true) And Test Eq false And (Test Eq true)") + assert sample("Test Eq true Or ((Test Eq false) And Test Eq false) And (Test Eq false)") + assert sample("(Test Eq false Or Test Eq true) Or (Test Eq false Or Test Eq false)") + assert sample("(Test Eq true And Test Eq true) Or (Test Eq false)") + assert sample("(Test Eq true And Test Eq true) Or (Test Eq false And Test Eq true)") + assert !sample("(Test Eq false And Test Eq true) Or (Test Eq false)") + assert sample("Test Eq true And ((Test Eq true And Test Eq false) Or Test Eq true)") + assert !sample("Test Eq true And ((Test Eq true And Test Eq false) Or Test Eq false) And Test Eq true") + assert !sample("Test Eq true And ((Test Eq true And Test Eq false) Or Test Eq false) Or Test Eq false") + assert sample("Test Eq true And ((Test Eq true And Test Eq false) Or Test Eq false) Or Test Eq true") + end + + def test_nots + assert !sample("Not Test Eq true") + assert sample("Not Test Eq false") + assert !sample("Not (Test Eq true)") + assert sample("Not (Test Eq false)") + assert sample("Test Eq true Not Test Eq false") + assert !sample("Test Eq true Not Test Eq true") + assert sample("Test Eq true Not (Test Eq false Or Test Eq false)") + assert sample("Test Eq true Not (Test Eq false And Test Eq false)") + assert !sample("Test Eq true Not (Test Eq false Or Test Eq true)") + assert !sample("Test Eq true Not (Test Eq true Or Test Eq false)") + assert !sample("Test Eq true Not (Not Test Eq false)") + assert sample("Not (Not Test Eq true)") + assert sample("Not (Not(Not Test Eq true))") + end + + def test_optimizations + assert sample("Test Eq true Or Test Eq false And Test Eq false") + assert_equal 1, @evaluator.processed_count + assert sample("Test Eq false Or Test Eq true And Test Eq true") + assert_equal 3, @evaluator.processed_count + assert sample("(Test Eq true Or Test Eq false) And Test Eq true") + assert_equal 2, @evaluator.processed_count + assert sample("(Test Eq false Or Test Eq true) And Test Eq true") + assert_equal 3, @evaluator.processed_count + end + + # Here's some examples from prospector's tests that have been simplified a bit. + def test_advanced + assert !sample("MlsStatus Eq false And PropertyType Eq true And (City Eq true Or City Eq false)") + end + + def sample filter + @parser = Parser.new + @expressions = @parser.parse(filter) + @evaluator = Evaluator.new(BooleanOrBustExpressionResolver.new()) + @evaluator.evaluate(@expressions) + end +end From 75275373271dd9577110658a5a6be0a1ea7c0578 Mon Sep 17 00:00:00 2001 From: Wade McEwen Date: Thu, 28 Jul 2016 10:51:57 -0500 Subject: [PATCH 2/2] Minor release --- CHANGELOG.md | 4 ++++ VERSION | 2 +- lib/sparkql/evaluator.rb | 4 ++-- lib/sparkql/expression_resolver.rb | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34937c7..df1d8c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +v1.1.0, 2016-07-28 ([changes](https://github.com/sparkapi/sparkql/compare/v1.0.3...v1.1.0)) +------------------- + * [IMPROVEMENT] Evaluation class for sparkql boolean algebra processing + v1.0.3, 2016-06-06 ([changes](https://github.com/sparkapi/sparkql/compare/v1.0.2...v1.0.3)) ------------------- * [IMPROVEMENT] Expression limit lifted to 75 expressions diff --git a/VERSION b/VERSION index 21e8796..9084fa2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.0.3 +1.1.0 diff --git a/lib/sparkql/evaluator.rb b/lib/sparkql/evaluator.rb index ee30770..077e1c6 100644 --- a/lib/sparkql/evaluator.rb +++ b/lib/sparkql/evaluator.rb @@ -1,4 +1,4 @@ -# Using an external source for resolving the individual sparkql expressions, +# Using an instance of ExpressionResolver to resolve the individual expressions, # this class will evaluate the rest of a parsed sparkql string to true or false. # Namely, this class will handle all the nesting, boolean algebra, and dropped # fields. Plus, it has some optimizations built in to skip the processing for @@ -28,7 +28,7 @@ def evaluate(expressions) adjust_expression_for_dropped_field(expression) check_for_good_ors(expression) next if skip?(expression) - result = evaluate_expression(expression) + evaluate_expression(expression) end cleanup return @index[:match] diff --git a/lib/sparkql/expression_resolver.rb b/lib/sparkql/expression_resolver.rb index 0d841e2..e837288 100644 --- a/lib/sparkql/expression_resolver.rb +++ b/lib/sparkql/expression_resolver.rb @@ -3,7 +3,7 @@ class Sparkql::ExpressionResolver VALID_RESULTS = [true, false, :drop] - # Evaluate the result of this expression Allows for any of the values in + # Evaluate the result of this expression. Allows for any of the values in # VALID_RESULTS def resolve(expression) true