From 6fbab3067b1e249cc90917cee37d625c8a030f5d Mon Sep 17 00:00:00 2001 From: Alexa Grey Date: Tue, 23 Jul 2024 16:46:30 -0700 Subject: [PATCH] [B] Refactor AdoptOrOrphan job * Add uniqueness to the job to prevent multiple adoptions for a single annotation * Add cyclical detections and execution time limits * Restrict an attempt to locate the character positions from iterating endlessly * Monitor execution time on various iterations / recursions and bail out if it is taking too long * Any bailout action will orphan the annotation Resolves RET-1728 --- .../annotation_jobs/adopt_or_orphan_job.rb | 10 +- .../services/annotations/adopt_or_orphan.rb | 277 ++++++++++++------ .../annotations/adoption_assignment.rb | 36 +++ api/config/sidekiq.yml | 1 + 4 files changed, 237 insertions(+), 87 deletions(-) create mode 100644 api/app/services/annotations/adoption_assignment.rb diff --git a/api/app/jobs/annotation_jobs/adopt_or_orphan_job.rb b/api/app/jobs/annotation_jobs/adopt_or_orphan_job.rb index 2fda8a5d6e..ea5f55ff33 100644 --- a/api/app/jobs/annotation_jobs/adopt_or_orphan_job.rb +++ b/api/app/jobs/annotation_jobs/adopt_or_orphan_job.rb @@ -1,8 +1,16 @@ +# frozen_string_literal: true + module AnnotationJobs + # @see Annotations::AdoptOrOrphan class AdoptOrOrphanJob < ApplicationJob + queue_as :annotations + + unique :until_executed, lock_ttl: 15.minutes, on_conflict: :log + # @param [Annotation] annotation + # @return [void] def perform(annotation) - Annotations::AdoptOrOrphan.run annotation: annotation + Annotations::AdoptOrOrphan.run! annotation: annotation end end end diff --git a/api/app/services/annotations/adopt_or_orphan.rb b/api/app/services/annotations/adopt_or_orphan.rb index 04ff952d6a..c1af4602c4 100644 --- a/api/app/services/annotations/adopt_or_orphan.rb +++ b/api/app/services/annotations/adopt_or_orphan.rb @@ -1,47 +1,136 @@ +# frozen_string_literal: true + module Annotations # This service attempts to reassign an annotation's start/end positions based on # subject content. To keep things sane, we only assume a node is correct # if a block of its content is is an exact match to a piece of the subject. + # rubocop:disable Metrics/AbcSize, Metrics/MethodLength class AdoptOrOrphan < ActiveInteraction::Base - extend Memoist + # The maximum amount of time (elapsed) this service can run, per annotation. + # Each iteration is checked. + MAX_EXECUTION_TIME = 60.seconds + record :annotation + delegate :text_section, to: :annotation - delegate :text_nodes, to: :text_section + # @return [Boolean] + attr_reader :adoptable + + alias adoptable? adoptable + + # @return [Annotations::AdoptionAssignment] + attr_reader :adoption_assignment + + # @return [Integer] + attr_reader :final_index + + # @return [String] + attr_reader :haystack + + # @return [Boolean] + attr_reader :multiple_occurrences + + # @return [String] + attr_reader :needle + + # @return [] + attr_reader :occurrences + + # @return [Integer] + attr_reader :start_index + + # @return [ActiveSupport::TimeWithZone] + attr_reader :started_at + + # @return [] + attr_reader :text_nodes + + alias multiple_occurrences? multiple_occurrences + + # @return [Annotation] def execute - return orphan_annotation unless adoptable? + prepare! + + derive_facts! + + return orphan_annotation! unless adoptable? + + derive_possible_adoption! + + return orphan_annotation! unless adoption_assignment.valid? + + if adoption_assignment.valid? + adoption_assignment.adopt! annotation + else + orphan_annotation! + end + rescue HaltError + # :nocov: + orphan_annotation! + # :nocov: + end + + private + + # @!group Steps + + # @return [void] + def prepare! + @text_nodes = text_section.text_nodes + @needle = collapse(annotation.subject) + + @haystack = collapse(text_nodes.pluck(:content).join) + pattern = /#{Regexp.escape(needle)}/i + + @start_index = haystack.index pattern + @final_index = start_index + needle.size - 1 if start_index + @occurrences = haystack.scan pattern + + @started_at = Time.current + end + + # @return [void] + def derive_facts! + @multiple_occurrences = @occurrences.many? + @adoptable = start_index.present? && final_index.present? + end + + # @return [void] + def derive_possible_adoption! # iterate through the haystack with all whitespace collapsed to assign start and end nodes # if the needle has more than one match, attempt to locate it by node uuids - node_updates = if multiple_occurrences? - maybe_locate_occurence - else - find_needle_in_haystack(text_nodes, start_index, final_index) - end + node_updates = + if multiple_occurrences? + maybe_locate_occurrence + else + find_needle_in_haystack(text_nodes, start_index, final_index) + end # iterate through the text content of the new nodes accounting for whitespace to find the correct start and end chars char_updates = place_subject_in_nodes(node_updates) - updates = node_updates.merge(char_updates) - - return orphan_annotation unless all_present? updates - - adopt_annotation(updates) + @adoption_assignment = Annotations::AdoptionAssignment.new(**node_updates, **char_updates) end - private + # @!endgroup - # rubocop:disable Metrics/MethodLength - def find_needle_in_haystack(a_haystack, first_index, last_index) + # @param [] node_subset + # @param [Integer] first_index + # @param [Integer] last_index + # @return [Hash] + def find_needle_in_haystack(node_subset, first_index, last_index) updates = { start_node: nil, end_node: nil, } haystack_iterator = 0 - a_haystack.each do |node| + + node_subset.each do |node| node_text_iterator = 0 node_content = collapse(node[:content]) node_content.split("") do @@ -53,15 +142,16 @@ def find_needle_in_haystack(a_haystack, first_index, last_index) haystack_iterator += 1 node_text_iterator += 1 end + ensure + check_execution_time! end updates end - # rubocop:enable Metrics/MethodLength - # rubocop:disable Metrics/AbcSize, Metrics/MethodLength - def maybe_locate_occurence + def maybe_locate_occurrence start_node = text_nodes.find { |node| node[:node_uuid] == annotation.start_node } + unless start_node return { start_node: nil, @@ -85,15 +175,17 @@ def maybe_locate_occurence } end - haystack = text_nodes[start_node_index..end_node_index] + node_subset = text_nodes[start_node_index..end_node_index] first_index = annotation.start_char - 1 last_index = first_index + needle.size - 1 - find_needle_in_haystack(haystack, first_index, last_index) + find_needle_in_haystack(node_subset, first_index, last_index) end end - # rubocop:enable Metrics/AbcSize, Metrics/MethodLength + # @param [Hash] nodes + # @option nodes [String] :start_node + # @option nodes [String] :end_node def place_subject_in_nodes(nodes) start = text_nodes.find { |node| node[:node_uuid] == nodes[:start_node] } final = text_nodes.find { |node| node[:node_uuid] == nodes[:end_node] } @@ -114,116 +206,129 @@ def place_subject_in_nodes(nodes) } end + # @param [Hash] node + # @param [Boolean] end_node + # @see #find_substr_index + # @return [Integer] def place_in_node(node:, end_node: false) # split subject into chars ignoring whitespace - splits = annotation.subject.split(/\s*/) + # escape regex metachars so they match + splits = annotation.subject.split(/\s*/).map { |c| Regexp.escape(c) } splits.reverse! if end_node text_content = node[:content] - node_occurrences = text_content.scan(splits[0]) + + node_occurrences = text_content.scan(/#{splits[0]}/i) # if we can't find the first char, something went wrong; shouldn't happen - return nil if node_occurrences.size.zero? + return nil if node_occurrences.blank? # if the first char is unique in the node, return its index - return text_content.index(splits[0]) if node_occurrences.size == 1 + return text_content.index(splits[0]) if node_occurrences.one? # if it occurs more than once, iteratively expand the search find_substr_index(splits: splits, text_content: text_content, count: 1, from_end: end_node) end - # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity # iterate by character until we find the shortest unique substr at the start or end of the subject in the respective node # return the index of that substr in the node text content - def find_substr_index(splits:, text_content:, count:, from_end: false) - # escape regex metachars so they match - splits = splits.map { |c| metachars.include?(c) ? "\\#{c}" : c } + # + # @param [] splits array of characters to search for + # @param [String] text_content node content to search for the substr within + # @param [Integer] count recursion count for finding the unique occurrence within + # @param [Boolean] from_end whether this should start looking from the end of the textual content / `splits`. + # @param [Regexp] prev_substr the pattern for the previous iteration (@see #substr_regex) + # @raise [Annotations::AdoptOrOrphan::TooManyCandidates] + # @return [Integer] + def find_substr_index( + splits:, text_content:, + count: 1, from_end: false, + prev_substr: substr_regex(splits: splits, count: count - 1, from_end: from_end) + ) + max_count = splits.length + 2 + + raise TooManyCandidates, "max_count attempts exceeded: #{count}" if count > max_count + + check_execution_time! + substr = substr_regex(splits: splits, count: count, from_end: from_end) - node_occurrences = text_content.scan(substr) - prev_substr = substr_regex(splits: splits, count: count - 1, from_end: from_end) + node_occurrences = text_content.scan(substr) if from_end # if we don't find a match, we want the first occurrence of the previous search term # use the match data to find the length of the substr in the text node in case the client added or removed whitespace - return text_content.index(prev_substr) + prev_substr.match(text_content)[0].length - 1 if node_occurrences.size.zero? - return text_content.index(substr) + substr.match(text_content)[0].length - 1 if node_occurrences.size == 1 + return text_content.index(prev_substr) + prev_substr.match(text_content)[0].length - 1 if node_occurrences.empty? + return text_content.index(substr) + substr.match(text_content)[0].length - 1 if node_occurrences.one? else # if we don't find a match, we want the last occurrence of the previous search term - return text_content.rindex(prev_substr) if node_occurrences.size.zero? - return text_content.index(substr) if node_occurrences.size == 1 + return text_content.rindex(prev_substr) if node_occurrences.empty? + return text_content.index(substr) if node_occurrences.one? end - find_substr_index(splits: splits, text_content: text_content, count: count + 1, from_end: from_end) + # Recur with a higher count + find_substr_index(splits: splits, text_content: text_content, count: count + 1, from_end: from_end, prev_substr: substr) end - # rubocop:enable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity - - def substr_regex(splits:, count:, from_end:) - i = 0 - substr = /#{splits[0]}/i - - while i < count - substr = if from_end - /#{splits[i + 1]}[[:space:]]*#{substr}/i - else - /#{substr}[[:space:]]*#{splits[i + 1]}/i - end - i += 1 - end - substr - end + # @param [] splits array of characters to search for + # @param [Integer] count recursion count for finding the unique occurrence within + # @param [Boolean] from_end whether this should start looking from the end of the textual content / `splits`. + # @return [Regexp] pattern for use in {#find_substr_index} + def substr_regex(splits:, count:, from_end: false) + len = count.to_i.clamp(1..) - def metachars - %w[( ) [ ] { } . ? + *] - end + chars = splits.take(len) - def adoptable? - return false unless start_index.present? && final_index.present? + chars.reverse! if from_end - true + /#{chars.join("[[:space:]]*")}/i end - def all_present?(hash) - hash.values.all? + # @!group Utility Methods + + # @raise [Annotations::AdoptOrOrphan::ExecutionTimeExceeded] + # @return [void] + def check_execution_time! + raise ExecutionTimeExceeded if elapsed_time > MAX_EXECUTION_TIME end + # @param [String] input + # @return [String] def collapse(input) input.gsub(/[[:space:]]+/, "") end - def needle - collapse(annotation.subject) + # @return [ActiveSupport::Duration] + def elapsed_time + started_at.present? ? Time.current - started_at : 0 end - memoize def start_index - haystack.index(/#{needle}/i) - end + # @return [Annotation] + def orphan_annotation! + annotation.update! orphaned: true - memoize def final_index - start_index + needle.size - 1 + return annotation end - memoize def occurrences - haystack.scan(/#{needle}/i) + # @param [] prev + # @param [] curr + def repeated_occurrences?(prev, curr) + prev.present? && curr.present? && prev == curr end - def multiple_occurrences? - occurrences.length > 1 - end + # @!endgroup - memoize def haystack - collapse(text_nodes.map { |node| node[:content] }.join) - end + # @abstract + class HaltError < StandardError; end - def orphan_annotation - annotation.assign_attributes orphaned: true - annotation.save - end - - def adopt_annotation(candidates) - annotation.assign_attributes candidates.merge(orphaned: false) - annotation.save - end + # A private error raised when this service has run for too long + # trying to match things. It will short-circuit and orphan the + # annotation in such an event. + class ExecutionTimeExceeded < HaltError; end + # A private error raised when trying to find the candidate + # to remap. For very short annotations, or very common phrases, + # this will bail out instead of infinitely looping. + class TooManyCandidates < HaltError; end end + # rubocop:enable Metrics/AbcSize, Metrics/MethodLength end diff --git a/api/app/services/annotations/adoption_assignment.rb b/api/app/services/annotations/adoption_assignment.rb new file mode 100644 index 0000000000..8984c8478b --- /dev/null +++ b/api/app/services/annotations/adoption_assignment.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +module Annotations + # @api private + # @see Annotations::AdoptOrOrphan + class AdoptionAssignment < ::Types::FlexibleStruct + include ActiveModel::Validations + + # A type for matching {#start_char} and {#end_char} + Index = ::Types::Coercible::Integer.constrained(gteq: 0).fallback { nil } + + # A type for matching {#start_node} and {#end_node} + Node = ::Types::Coercible::String.constrained(filled: true).fallback { nil } + + attribute? :start_node, Node.optional + attribute? :end_node, Node.optional + + attribute? :start_char, Index.optional + attribute? :end_char, Index.optional + + validates :start_node, :end_node, :start_char, :end_char, presence: true + + # @param [Annotation] annotation + # @return [Annotation] + def adopt!(annotation) + annotation.update! to_adopt + + return annotation + end + + # @return [Hash] + def to_adopt + slice(:start_node, :end_node, :start_char, :end_char).merge(orphaned: false) + end + end +end diff --git a/api/config/sidekiq.yml b/api/config/sidekiq.yml index 266b52c172..d9b9151543 100644 --- a/api/config/sidekiq.yml +++ b/api/config/sidekiq.yml @@ -4,3 +4,4 @@ - low_priority - searchkick - ahoy + - annotations