Merge pull request #42 from wlmcewen/master

AC-1604 sparkql built-in conjunction evaluator
sparkapi · Jul 28, 2016 · c4b7187 · c4b7187
2 parents 46f516a + 7527537
commit c4b7187
Show file tree

Hide file tree

Showing 8 changed files with 271 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+v1.1.0, 2016-07-28 ([changes](https://github.com/sparkapi/sparkql/compare/v1.0.3...v1.1.0))
+-------------------
+  * [IMPROVEMENT] Evaluation class for sparkql boolean algebra processing
+
 v1.0.3, 2016-06-06 ([changes](https://github.com/sparkapi/sparkql/compare/v1.0.2...v1.0.3))
 -------------------
   * [IMPROVEMENT] Expression limit lifted to 75 expressions

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.0.3
+1.1.0
diff --git a/lib/sparkql.rb b/lib/sparkql.rb
@@ -2,6 +2,8 @@
 require "sparkql/token"
 require "sparkql/errors"
 require "sparkql/expression_state"
+require "sparkql/expression_resolver"
+require "sparkql/evaluator"
 require "sparkql/lexer"
 require "sparkql/function_resolver"
 require "sparkql/parser_tools"

diff --git a/lib/sparkql/evaluator.rb b/lib/sparkql/evaluator.rb
@@ -0,0 +1,152 @@
+# Using an instance of ExpressionResolver to resolve the individual expressions,
+# this class will evaluate the rest of a parsed sparkql string to true or false.
+# Namely, this class will handle all the nesting, boolean algebra, and dropped 
+# fields. Plus, it has some optimizations built in to skip the processing for
+# any expressions that don't contribute to the net result of the filter.
+class Sparkql::Evaluator
+
+  attr_reader :processed_count
+
+  def initialize expression_resolver
+    @resolver = expression_resolver
+  end
+
+  def evaluate(expressions)
+    @processed_count = 0
+    @index = {
+      level: 0,
+      block_group: 0,
+      conjunction: "And",
+      conjunction_level: 0,
+      match: true,
+      good_ors: false,
+      expressions: 0
+    }
+    @groups = [@index]
+    expressions.each do |expression|
+      handle_group(expression)
+      adjust_expression_for_dropped_field(expression)
+      check_for_good_ors(expression)
+      next if skip?(expression)
+      evaluate_expression(expression)
+    end
+    cleanup
+    return @index[:match]
+  end
+
+  private
+
+  # prepare the group stack for the next expression
+  def handle_group(expression)
+    if @index[:block_group] == expression[:block_group]
+      # Noop
+    elsif @index[:block_group] < expression[:block_group]
+      @index = new_group(expression)
+      @groups.push(@index)
+    else
+      # Turn the group into an expression, resolve down to previous group(s)
+      smoosh_group(expression)
+    end
+  end
+
+  # Here's the real meat. We use an internal stack to represent the result of
+  # each block_group. This logic is re-used when merging the final result of one
+  # block group with the previous.
+  def evaluate_expression(expression)
+      @processed_count += 1
+    evaluate_node(expression, @resolver.resolve(expression))
+  end
+  def evaluate_node(node, result)
+    if result == :drop
+      @dropped_expression = node
+      return result
+    end
+    if node[:unary] == "Not"
+      result = !result
+    end
+    if node[:conjunction] == 'Not' &&
+       (node[:conjunction_level] == node[:level] ||
+      node[:conjunction_level] == @index[:level])
+      @index[:match] = !result
+    elsif node[:conjunction] == 'And' || @index[:expressions] == 0
+      @index[:match] = result if @index[:match]
+    elsif node[:conjunction] == 'Or' && result
+      @index[:match] = result
+    end
+    @index[:expressions] += 1
+    result
+  end
+
+  # Optimization logic, once we find any set of And'd expressions that pass and
+  # run into an Or at the same level, we can skip further processing at that
+  # level.
+  def check_for_good_ors(expression)
+    if expression[:conjunction] == 'Or'
+      good_index = @index
+      unless expression[:conjunction_level] == @index[:level]
+        good_index = nil
+        # Well crap, now we need to go back and find that level by hand
+        @groups.reverse_each do |i|
+          if i[:level] == expression[:conjunction_level]
+            good_index = i
+          end
+        end
+      end
+      if !good_index.nil? && good_index[:expressions] > 0 && good_index[:match]
+        good_index[:good_ors] = true
+      end
+    end
+  end
+
+  # We can skip further expression processing when And-d with a false expression
+  # or a "good Or" was already encountered.
+  def skip?(expression)
+    @index[:good_ors] ||
+      !@index[:match] && expression[:conjunction] == 'And'
+  end
+
+  def new_group(expression)
+    {
+      level: expression[:level],
+      block_group: expression[:block_group],
+      conjunction: expression[:conjunction],
+      conjunction_level: expression[:conjunction_level],
+      match: true,
+      good_ors: false,
+      expressions: 0
+    }
+  end
+
+  # When the last expression was dropped, we need to repair the filter by 
+  # stealing the conjunction of that dropped field.
+  def adjust_expression_for_dropped_field(expression)
+    if @dropped_expression.nil?
+      return
+    elsif @dropped_expression[:block_group] == expression[:block_group]
+      expression[:conjunction] = @dropped_expression[:conjunction]
+      expression[:conjunction_level] = @dropped_expression[:conjunction_level]
+    end
+    @dropped_expression = nil
+  end
+
+  # This is similar to the cleanup step, but happens when we return from a 
+  # nesting level. Before we can proceed, we need wrap up the result of the 
+  # nested group.
+  def smoosh_group(expression)
+    until  @groups.last[:block_group] == expression[:block_group]
+      last = @groups.pop
+      @index = @groups.last
+      evaluate_node(last, last[:match])
+    end
+  end
+
+  # pop off the group stack, evaluating each group with the previous as we go.
+  def cleanup
+    while @groups.size > 1
+      last = @groups.pop
+      @index = @groups.last
+      evaluate_node(last, last[:match])
+    end
+    @groups.last[:match]
+  end
+end
diff --git a/lib/sparkql/expression_resolver.rb b/lib/sparkql/expression_resolver.rb
@@ -0,0 +1,11 @@
+# Base class for handling expression resolution
+class Sparkql::ExpressionResolver
+
+  VALID_RESULTS = [true, false, :drop]
+
+  # Evaluate the result of this expression. Allows for any of the values in
+  # VALID_RESULTS
+  def resolve(expression)
+    true
+  end
+end
diff --git a/lib/sparkql/parser_tools.rb b/lib/sparkql/parser_tools.rb
@@ -60,16 +60,17 @@ def tokenize_conjunction(exp1, conj, exp2)
   end
 
   def tokenize_unary_conjunction(conj, exp)
-
     # Handles the case when a SparkQL filter string
     # begins with a unary operator, and is nested, such as:
-    # Not (Not Field Eq 1)
+    #   Not (Not Field Eq 1)
+    # In this instance we treat the outer unary as a conjunction.
     if @expression_count == 1 && @lexer.level > 0
-      exp.first[:conjunction] = conj 
+      exp.first[:conjunction] = conj
+      exp.first[:conjunction_level] = @lexer.level - 1
     end
-
     exp.first[:unary] = conj
     exp.first[:unary_level] = @lexer.level
+
     exp
   end
 

diff --git a/test/support/boolean_or_bust_expression_resolver.rb b/test/support/boolean_or_bust_expression_resolver.rb
@@ -0,0 +1,12 @@
+# A super simple expression resolver for testing... returns the boolean value as
+# the result for the expression, or when not a boolean, drops the expression.
+class BooleanOrBustExpressionResolver < Sparkql::ExpressionResolver
+
+  def resolve(expression)
+    if expression[:type] == :boolean
+      "true" == expression[:value]
+    else
+      :drop
+    end
+  end
+end
diff --git a/test/unit/evaluator_test.rb b/test/unit/evaluator_test.rb
@@ -0,0 +1,84 @@
+require 'test_helper'
+require 'support/boolean_or_bust_expression_resolver'
+
+class EvaluatorTest < Test::Unit::TestCase
+  include Sparkql
+
+  def test_simple
+    assert sample('Test Eq true')
+    assert !sample('Test Eq false')
+    assert sample("Test Eq 'Drop'")
+  end
+
+  def test_conjunction
+    assert sample('Test Eq true And Test Eq true')
+    assert !sample('Test Eq false And Test Eq true')
+    assert !sample('Test Eq false And Test Eq false')
+    # Ors
+    assert sample("Test Eq true Or Test Eq true")
+    assert sample("Test Eq true Or Test Eq false")
+    assert sample("Test Eq false Or Test Eq true")
+    assert !sample("Test Eq false Or Test Eq false")
+  end
+
+  def test_dropped_field_handling
+    assert sample("Test Eq 'Drop' And Test Eq true")
+    assert !sample("Test Eq 'Drop' And Test Eq false")
+    assert !sample("Test Eq 'Drop' Or Test Eq false")
+    assert sample("Test Eq 'Drop' Or Test Eq true")
+    assert sample("Test Eq false And Test Eq 'Drop' Or Test Eq true")
+    assert sample("Test Eq false Or (Test Eq 'Drop' And Test Eq true)")
+  end
+
+  def test_nesting
+    assert sample("Test Eq true Or (Test Eq true) And Test Eq false And (Test Eq true)")
+    assert sample("Test Eq true Or ((Test Eq false) And Test Eq false) And (Test Eq false)")
+    assert sample("(Test Eq false Or Test Eq true) Or (Test Eq false Or Test Eq false)")
+    assert sample("(Test Eq true And Test Eq true) Or (Test Eq false)")
+    assert sample("(Test Eq true And Test Eq true) Or (Test Eq false And Test Eq true)")
+    assert !sample("(Test Eq false And Test Eq true) Or (Test Eq false)")
+    assert sample("Test Eq true And ((Test Eq true And Test Eq false) Or Test Eq true)")
+    assert !sample("Test Eq true And ((Test Eq true And Test Eq false) Or Test Eq false) And Test Eq true")
+    assert !sample("Test Eq true And ((Test Eq true And Test Eq false) Or Test Eq false) Or Test Eq false")
+    assert sample("Test Eq true And ((Test Eq true And Test Eq false) Or Test Eq false) Or Test Eq true")
+  end
+
+  def test_nots
+    assert !sample("Not Test Eq true")
+    assert sample("Not Test Eq false")
+    assert !sample("Not (Test Eq true)")
+    assert sample("Not (Test Eq false)")
+    assert sample("Test Eq true Not Test Eq false")
+    assert !sample("Test Eq true Not Test Eq true")
+    assert sample("Test Eq true Not (Test Eq false Or Test Eq false)")
+    assert sample("Test Eq true Not (Test Eq false And Test Eq false)")
+    assert !sample("Test Eq true Not (Test Eq false Or Test Eq true)")
+    assert !sample("Test Eq true Not (Test Eq true Or Test Eq false)")
+    assert !sample("Test Eq true Not (Not Test Eq false)")
+    assert sample("Not (Not Test Eq true)")
+    assert sample("Not (Not(Not Test Eq true))")
+  end
+
+  def test_optimizations
+    assert sample("Test Eq true Or Test Eq false And Test Eq false")
+    assert_equal 1, @evaluator.processed_count
+    assert sample("Test Eq false Or Test Eq true And Test Eq true")
+    assert_equal 3, @evaluator.processed_count
+    assert sample("(Test Eq true Or Test Eq false) And Test Eq true")
+    assert_equal 2, @evaluator.processed_count
+    assert sample("(Test Eq false Or Test Eq true) And Test Eq true")
+    assert_equal 3, @evaluator.processed_count
+  end
+
+  # Here's some examples from prospector's tests that have been simplified a bit.
+  def test_advanced
+    assert !sample("MlsStatus Eq false And PropertyType Eq true And (City Eq true Or City Eq false)")
+  end
+
+  def sample filter
+    @parser = Parser.new
+    @expressions = @parser.parse(filter)
+    @evaluator = Evaluator.new(BooleanOrBustExpressionResolver.new())
+    @evaluator.evaluate(@expressions)
+  end
+end