From 6fbab3067b1e249cc90917cee37d625c8a030f5d Mon Sep 17 00:00:00 2001
From: Alexa Grey <alexa@castironcoding.com>
Date: Tue, 23 Jul 2024 16:46:30 -0700
Subject: [PATCH] [B] Refactor AdoptOrOrphan job

* Add uniqueness to the job to prevent multiple adoptions for a single
  annotation
* Add cyclical detections and execution time limits
  * Restrict an attempt to locate the character positions from
    iterating endlessly
  * Monitor execution time on various iterations / recursions and
    bail out if it is taking too long
* Any bailout action will orphan the annotation

Resolves RET-1728
---
 .../annotation_jobs/adopt_or_orphan_job.rb    |  10 +-
 .../services/annotations/adopt_or_orphan.rb   | 277 ++++++++++++------
 .../annotations/adoption_assignment.rb        |  36 +++
 api/config/sidekiq.yml                        |   1 +
 4 files changed, 237 insertions(+), 87 deletions(-)
 create mode 100644 api/app/services/annotations/adoption_assignment.rb
diff --git a/api/app/jobs/annotation_jobs/adopt_or_orphan_job.rb b/api/app/jobs/annotation_jobs/adopt_or_orphan_job.rb
index 2fda8a5d6e..ea5f55ff33 100644
--- a/api/app/jobs/annotation_jobs/adopt_or_orphan_job.rb
+++ b/api/app/jobs/annotation_jobs/adopt_or_orphan_job.rb
@@ -1,8 +1,16 @@
+# frozen_string_literal: true
+
 module AnnotationJobs
+  # @see Annotations::AdoptOrOrphan
   class AdoptOrOrphanJob < ApplicationJob
+    queue_as :annotations
+
+    unique :until_executed, lock_ttl: 15.minutes, on_conflict: :log
 
+    # @param [Annotation] annotation
+    # @return [void]
     def perform(annotation)
-      Annotations::AdoptOrOrphan.run annotation: annotation
+      Annotations::AdoptOrOrphan.run! annotation: annotation
     end
   end
 end
diff --git a/api/app/services/annotations/adopt_or_orphan.rb b/api/app/services/annotations/adopt_or_orphan.rb
index 04ff952d6a..c1af4602c4 100644
--- a/api/app/services/annotations/adopt_or_orphan.rb
+++ b/api/app/services/annotations/adopt_or_orphan.rb
@@ -1,47 +1,136 @@
+# frozen_string_literal: true
+
 module Annotations
   # This service attempts to reassign an annotation's start/end positions based on
   # subject content. To keep things sane, we only assume a node is correct
   # if a block of its content is is an exact match to a piece of the subject.
+  # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
   class AdoptOrOrphan < ActiveInteraction::Base
-
     extend Memoist
 
+    # The maximum amount of time (elapsed) this service can run, per annotation.
+    # Each iteration is checked.
+    MAX_EXECUTION_TIME = 60.seconds
+
     record :annotation
+
     delegate :text_section, to: :annotation
-    delegate :text_nodes, to: :text_section
 
+    # @return [Boolean]
+    attr_reader :adoptable
+
+    alias adoptable? adoptable
+
+    # @return [Annotations::AdoptionAssignment]
+    attr_reader :adoption_assignment
+
+    # @return [Integer]
+    attr_reader :final_index
+
+    # @return [String]
+    attr_reader :haystack
+
+    # @return [Boolean]
+    attr_reader :multiple_occurrences
+
+    # @return [String]
+    attr_reader :needle
+
+    # @return [<String>]
+    attr_reader :occurrences
+
+    # @return [Integer]
+    attr_reader :start_index
+
+    # @return [ActiveSupport::TimeWithZone]
+    attr_reader :started_at
+
+    # @return [<Hash>]
+    attr_reader :text_nodes
+
+    alias multiple_occurrences? multiple_occurrences
+
+    # @return [Annotation]
     def execute
-      return orphan_annotation unless adoptable?
+      prepare!
+
+      derive_facts!
+
+      return orphan_annotation! unless adoptable?
+
+      derive_possible_adoption!
+
+      return orphan_annotation! unless adoption_assignment.valid?
+
+      if adoption_assignment.valid?
+        adoption_assignment.adopt! annotation
+      else
+        orphan_annotation!
+      end
+    rescue HaltError
+      # :nocov:
+      orphan_annotation!
+      # :nocov:
+    end
+
+    private
+
+    # @!group Steps
+
+    # @return [void]
+    def prepare!
+      @text_nodes = text_section.text_nodes
+      @needle = collapse(annotation.subject)
+
+      @haystack = collapse(text_nodes.pluck(:content).join)
 
+      pattern = /#{Regexp.escape(needle)}/i
+
+      @start_index = haystack.index pattern
+      @final_index = start_index + needle.size - 1 if start_index
+      @occurrences = haystack.scan pattern
+
+      @started_at = Time.current
+    end
+
+    # @return [void]
+    def derive_facts!
+      @multiple_occurrences = @occurrences.many?
+      @adoptable = start_index.present? && final_index.present?
+    end
+
+    # @return [void]
+    def derive_possible_adoption!
       # iterate through the haystack with all whitespace collapsed to assign start and end nodes
       # if the needle has more than one match, attempt to locate it by node uuids
-      node_updates = if multiple_occurrences?
-                       maybe_locate_occurence
-                     else
-                       find_needle_in_haystack(text_nodes, start_index, final_index)
-                     end
+      node_updates =
+        if multiple_occurrences?
+          maybe_locate_occurrence
+        else
+          find_needle_in_haystack(text_nodes, start_index, final_index)
+        end
 
       # iterate through the text content of the new nodes accounting for whitespace to find the correct start and end chars
       char_updates = place_subject_in_nodes(node_updates)
 
-      updates = node_updates.merge(char_updates)
-
-      return orphan_annotation unless all_present? updates
-
-      adopt_annotation(updates)
+      @adoption_assignment = Annotations::AdoptionAssignment.new(**node_updates, **char_updates)
     end
 
-    private
+    # @!endgroup
 
-    # rubocop:disable Metrics/MethodLength
-    def find_needle_in_haystack(a_haystack, first_index, last_index)
+    # @param [<Hash>] node_subset
+    # @param [Integer] first_index
+    # @param [Integer] last_index
+    # @return [Hash]
+    def find_needle_in_haystack(node_subset, first_index, last_index)
       updates = {
         start_node: nil,
         end_node: nil,
       }
 
       haystack_iterator = 0
-      a_haystack.each do |node|
+
+      node_subset.each do |node|
         node_text_iterator = 0
         node_content = collapse(node[:content])
         node_content.split("") do
@@ -53,15 +142,16 @@ def find_needle_in_haystack(a_haystack, first_index, last_index)
           haystack_iterator += 1
           node_text_iterator += 1
         end
+      ensure
+        check_execution_time!
       end
 
       updates
     end
-    # rubocop:enable Metrics/MethodLength
 
-    # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-    def maybe_locate_occurence
+    def maybe_locate_occurrence
       start_node = text_nodes.find { |node| node[:node_uuid] == annotation.start_node }
+
       unless start_node
         return {
           start_node: nil,
@@ -85,15 +175,17 @@ def maybe_locate_occurence
           }
         end
 
-        haystack = text_nodes[start_node_index..end_node_index]
+        node_subset = text_nodes[start_node_index..end_node_index]
         first_index = annotation.start_char - 1
         last_index = first_index + needle.size - 1
 
-        find_needle_in_haystack(haystack, first_index, last_index)
+        find_needle_in_haystack(node_subset, first_index, last_index)
       end
     end
-    # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
 
+    # @param [Hash] nodes
+    # @option nodes [String] :start_node
+    # @option nodes [String] :end_node
     def place_subject_in_nodes(nodes)
       start = text_nodes.find { |node| node[:node_uuid] == nodes[:start_node] }
       final = text_nodes.find { |node| node[:node_uuid] == nodes[:end_node] }
@@ -114,116 +206,129 @@ def place_subject_in_nodes(nodes)
       }
     end
 
+    # @param [Hash] node
+    # @param [Boolean] end_node
+    # @see #find_substr_index
+    # @return [Integer]
     def place_in_node(node:, end_node: false)
       # split subject into chars ignoring whitespace
-      splits = annotation.subject.split(/\s*/)
+      # escape regex metachars so they match
+      splits = annotation.subject.split(/\s*/).map { |c| Regexp.escape(c) }
       splits.reverse! if end_node
 
       text_content = node[:content]
-      node_occurrences = text_content.scan(splits[0])
+
+      node_occurrences = text_content.scan(/#{splits[0]}/i)
 
       # if we can't find the first char, something went wrong; shouldn't happen
-      return nil if node_occurrences.size.zero?
+      return nil if node_occurrences.blank?
       # if the first char is unique in the node, return its index
-      return text_content.index(splits[0]) if node_occurrences.size == 1
+      return text_content.index(splits[0]) if node_occurrences.one?
 
       # if it occurs more than once, iteratively expand the search
       find_substr_index(splits: splits, text_content: text_content, count: 1, from_end: end_node)
     end
 
-    # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
     # iterate by character until we find the shortest unique substr at the start or end of the subject in the respective node
     # return the index of that substr in the node text content
-    def find_substr_index(splits:, text_content:, count:, from_end: false)
-      # escape regex metachars so they match
-      splits = splits.map { |c| metachars.include?(c) ? "\\#{c}" : c }
+    #
+    # @param [<String>] splits array of characters to search for
+    # @param [String] text_content node content to search for the substr within
+    # @param [Integer] count recursion count for finding the unique occurrence within
+    # @param [Boolean] from_end whether this should start looking from the end of the textual content / `splits`.
+    # @param [Regexp] prev_substr the pattern for the previous iteration (@see #substr_regex)
+    # @raise [Annotations::AdoptOrOrphan::TooManyCandidates]
+    # @return [Integer]
+    def find_substr_index(
+      splits:, text_content:,
+      count: 1, from_end: false,
+      prev_substr: substr_regex(splits: splits, count: count - 1, from_end: from_end)
+    )
+      max_count = splits.length + 2
+
+      raise TooManyCandidates, "max_count attempts exceeded: #{count}" if count > max_count
+
+      check_execution_time!
+
       substr = substr_regex(splits: splits, count: count, from_end: from_end)
-      node_occurrences = text_content.scan(substr)
 
-      prev_substr = substr_regex(splits: splits, count: count - 1, from_end: from_end)
+      node_occurrences = text_content.scan(substr)
 
       if from_end
         # if we don't find a match, we want the first occurrence of the previous search term
         # use the match data to find the length of the substr in the text node in case the client added or removed whitespace
-        return text_content.index(prev_substr) + prev_substr.match(text_content)[0].length - 1 if node_occurrences.size.zero?
-        return text_content.index(substr) + substr.match(text_content)[0].length - 1 if node_occurrences.size == 1
+        return text_content.index(prev_substr) + prev_substr.match(text_content)[0].length - 1 if node_occurrences.empty?
+        return text_content.index(substr) + substr.match(text_content)[0].length - 1 if node_occurrences.one?
       else
         # if we don't find a match, we want the last occurrence of the previous search term
-        return text_content.rindex(prev_substr) if node_occurrences.size.zero?
-        return text_content.index(substr) if node_occurrences.size == 1
+        return text_content.rindex(prev_substr) if node_occurrences.empty?
+        return text_content.index(substr) if node_occurrences.one?
       end
 
-      find_substr_index(splits: splits, text_content: text_content, count: count + 1, from_end: from_end)
+      # Recur with a higher count
+      find_substr_index(splits: splits, text_content: text_content, count: count + 1, from_end: from_end, prev_substr: substr)
     end
-    # rubocop:enable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
-
-    def substr_regex(splits:, count:, from_end:)
-      i = 0
-      substr = /#{splits[0]}/i
-
-      while i < count
-        substr = if from_end
-                   /#{splits[i + 1]}[[:space:]]*#{substr}/i
-                 else
-                   /#{substr}[[:space:]]*#{splits[i + 1]}/i
-                 end
-        i += 1
-      end
 
-      substr
-    end
+    # @param [<String>] splits array of characters to search for
+    # @param [Integer] count recursion count for finding the unique occurrence within
+    # @param [Boolean] from_end whether this should start looking from the end of the textual content / `splits`.
+    # @return [Regexp] pattern for use in {#find_substr_index}
+    def substr_regex(splits:, count:, from_end: false)
+      len = count.to_i.clamp(1..)
 
-    def metachars
-      %w[( ) [ ] { } . ? + *]
-    end
+      chars = splits.take(len)
 
-    def adoptable?
-      return false unless start_index.present? && final_index.present?
+      chars.reverse! if from_end
 
-      true
+      /#{chars.join("[[:space:]]*")}/i
     end
 
-    def all_present?(hash)
-      hash.values.all?
+    # @!group Utility Methods
+
+    # @raise [Annotations::AdoptOrOrphan::ExecutionTimeExceeded]
+    # @return [void]
+    def check_execution_time!
+      raise ExecutionTimeExceeded if elapsed_time > MAX_EXECUTION_TIME
     end
 
+    # @param [String] input
+    # @return [String]
     def collapse(input)
       input.gsub(/[[:space:]]+/, "")
     end
 
-    def needle
-      collapse(annotation.subject)
+    # @return [ActiveSupport::Duration]
+    def elapsed_time
+      started_at.present? ? Time.current - started_at : 0
     end
 
-    memoize def start_index
-      haystack.index(/#{needle}/i)
-    end
+    # @return [Annotation]
+    def orphan_annotation!
+      annotation.update! orphaned: true
 
-    memoize def final_index
-      start_index + needle.size - 1
+      return annotation
     end
 
-    memoize def occurrences
-      haystack.scan(/#{needle}/i)
+    # @param [<String>] prev
+    # @param [<String>] curr
+    def repeated_occurrences?(prev, curr)
+      prev.present? && curr.present? && prev == curr
     end
 
-    def multiple_occurrences?
-      occurrences.length > 1
-    end
+    # @!endgroup
 
-    memoize def haystack
-      collapse(text_nodes.map { |node| node[:content] }.join)
-    end
+    # @abstract
+    class HaltError < StandardError; end
 
-    def orphan_annotation
-      annotation.assign_attributes orphaned: true
-      annotation.save
-    end
-
-    def adopt_annotation(candidates)
-      annotation.assign_attributes candidates.merge(orphaned: false)
-      annotation.save
-    end
+    # A private error raised when this service has run for too long
+    # trying to match things. It will short-circuit and orphan the
+    # annotation in such an event.
+    class ExecutionTimeExceeded < HaltError; end
 
+    # A private error raised when trying to find the candidate
+    # to remap. For very short annotations, or very common phrases,
+    # this will bail out instead of infinitely looping.
+    class TooManyCandidates < HaltError; end
   end
+  # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
 end
diff --git a/api/app/services/annotations/adoption_assignment.rb b/api/app/services/annotations/adoption_assignment.rb
new file mode 100644
index 0000000000..8984c8478b
--- /dev/null
+++ b/api/app/services/annotations/adoption_assignment.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+module Annotations
+  # @api private
+  # @see Annotations::AdoptOrOrphan
+  class AdoptionAssignment < ::Types::FlexibleStruct
+    include ActiveModel::Validations
+
+    # A type for matching {#start_char} and {#end_char}
+    Index = ::Types::Coercible::Integer.constrained(gteq: 0).fallback { nil }
+
+    # A type for matching {#start_node} and {#end_node}
+    Node = ::Types::Coercible::String.constrained(filled: true).fallback { nil }
+
+    attribute? :start_node, Node.optional
+    attribute? :end_node, Node.optional
+
+    attribute? :start_char, Index.optional
+    attribute? :end_char, Index.optional
+
+    validates :start_node, :end_node, :start_char, :end_char, presence: true
+
+    # @param [Annotation] annotation
+    # @return [Annotation]
+    def adopt!(annotation)
+      annotation.update! to_adopt
+
+      return annotation
+    end
+
+    # @return [Hash]
+    def to_adopt
+      slice(:start_node, :end_node, :start_char, :end_char).merge(orphaned: false)
+    end
+  end
+end
diff --git a/api/config/sidekiq.yml b/api/config/sidekiq.yml
index 266b52c172..d9b9151543 100644
--- a/api/config/sidekiq.yml
+++ b/api/config/sidekiq.yml
@@ -4,3 +4,4 @@
   - low_priority
   - searchkick
   - ahoy
+  - annotations