diff --git a/app/lib/describe_indexer.rb b/app/lib/describe_indexer.rb
index d95f442e..a9448303 100644
--- a/app/lib/describe_indexer.rb
+++ b/app/lib/describe_indexer.rb
@@ -39,20 +39,12 @@ def index
     end
   end

-  # Given a json document, return an XML string that contains
-  # the JSON blob as a CDATA element
+  # Converts the JSON payload to XML, which is what Traject expects
   # @param [String] json
   # @return [String]
   def prep_for_indexing(json)
     parsed = JSON.parse(json)
-    xml = parsed.to_xml
-    doc = Nokogiri::XML(xml)
-    collection_node = doc.at('group')
-    cdata = Nokogiri::XML::CDATA.new(doc, json)
-    collection_node.add_next_sibling("<pdc_describe_json/>")
-    pdc_describe_json_node = doc.at('pdc_describe_json')
-    pdc_describe_json_node.add_child(cdata)
-    doc.to_s
+    parsed.to_xml
   end

   def index_one(json)
@@ -96,24 +88,35 @@ def perform_indexing
     urls_to_retry = []
     rss_url_list.each do |url|
       process_url(url)
-    rescue
+    rescue => ex
+      Rails.logger.warn "Indexing: Error importing record from #{url}. Will retry. Exception: #{ex.message}"
       urls_to_retry << url
     end

     # retry an errored urls a second time and send error only if they don't work a second time
     urls_to_retry.each do |url|
+      Rails.logger.info "Indexing: Retrying record #{url}."
       process_url(url)
     rescue => ex
-      Rails.logger.warn "Error importing record from #{url}. Exception: #{ex.message}"
+      Rails.logger.error "Indexing: Error importing record from #{url}. Retry failed. Exception: #{ex.message}"
       Honeybadger.notify "Error importing record from #{url}. Exception: #{ex.message}"
     end
   end

   def process_url(url)
-    uri = URI.open(url, open_timeout: 30, read_timeout: 30)
+    # Bumping the timeout to 60 seconds because datasets with lots of files (e.g. more than 30K files)
+    # can take a while to be read (for example https://pdc-describe-prod.princeton.edu/describe/works/470.json)
+    start_read = Time.zone.now
+    uri = URI.open(url, open_timeout: 60, read_timeout: 60)
     resource_json = uri.read
+    elapsed_read = Time.zone.now - start_read
+
+    start_index = Time.zone.now
     resource_xml = prep_for_indexing(resource_json)
     traject_indexer.process(resource_xml)
-    Rails.logger.info "Successfully imported record from #{url}."
+    elapsed_index = Time.zone.now - start_index
+
+    timing_info = "(read: #{format('%.2f', elapsed_read)} s, index: #{format('%.2f', elapsed_index)} s)"
+    Rails.logger.info "Indexing: Successfully imported record from #{url}. #{timing_info}"
   end
 end
diff --git a/config/pdc_discovery.yml b/config/pdc_discovery.yml
index c2dc5a13..77c19140 100644
--- a/config/pdc_discovery.yml
+++ b/config/pdc_discovery.yml
@@ -25,5 +25,6 @@ production:

 staging:
   <<: *default
-  pdc_describe_rss: <%= ENV["PDC_DESCRIBE_RSS"] || "https://pdc-describe-staging.princeton.edu/describe/works.rss" %>
+  # Notice that we fetch production data for indexing since it is more realistic
+  pdc_describe_rss: <%= ENV["PDC_DESCRIBE_RSS"] || "https://pdc-describe-prod.princeton.edu/describe/works.rss" %>
   plausible_site_id: <%= "pdc-discovery-staging.princeton.edu" %>
diff --git a/config/schedule.rb b/config/schedule.rb
index 21d30e7e..44004928 100644
--- a/config/schedule.rb
+++ b/config/schedule.rb
@@ -28,7 +28,10 @@
 #   rake "index:research_data"
 # end

-# Rebuild index completely every 30 minutes while we're doing active data migration
-every 30.minutes, roles: [:reindex] do
+# Rebuild index completely every 60 minutes
+#
+# Bumped the schedule to 60 minutes since it's taking close to 30 minutes now that we are
+# indexing datasets with a very large number of files.
+every 60.minutes, roles: [:reindex] do
   rake "index:research_data"
 end
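# Illustrative sketch (not part of this diff): the simplified prep_for_indexing relies on
# ActiveSupport's Hash#to_xml wrapping the parsed JSON payload in a <hash> root element by
# default, which is the root all of the XPaths in the Traject config below read from.
# The sample JSON payload here is made up for illustration.
require "json"
require "active_support/core_ext/hash/conversions"

sample_json = '{"resource":{"doi":"10.1234/example","creators":[{"value":"Jane Doe"}]}}'
xml = JSON.parse(sample_json).to_xml
# => "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<hash>\n  <resource>\n    <doi>10.1234/example</doi>\n    ..."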
diff --git a/config/traject/pdc_describe_indexing_config.rb b/config/traject/pdc_describe_indexing_config.rb
index 86c5555a..3e99b979 100644
--- a/config/traject/pdc_describe_indexing_config.rb
+++ b/config/traject/pdc_describe_indexing_config.rb
@@ -12,10 +12,26 @@
   provide 'solr.url', Indexing::SolrCloudHelper.collection_writer_url
   provide 'reader_class_name', 'Traject::NokogiriReader'
   provide 'solr_writer.commit_on_close', 'true'
+
+  # There are some parameters in Traject that allow us to configure values related
+  # to the Solr connection, in particular `batch_size` and the `thread_pool`. However,
+  # given that we are calling Traject for each individual record (rather than for a
+  # batch of records) they might not apply to our scenario.
+  #
+  # The documentation is here in case we want to try them out:
+  # https://www.rubydoc.info/gems/traject/Traject/SolrJsonWriter
+
   provide 'repository', ENV['REPOSITORY_ID']
   provide 'logger', Logger.new($stderr, level: Logger::WARN)
 end

+# Converting the XML to JSON is a bit expensive, so we make that conversion
+# only once per record and save it to the context so that we can re-use it.
+each_record do |record, context|
+  xml = record.xpath("/hash").first.to_xml
+  context.clipboard[:record_json] = Hash.from_xml(xml)["hash"].to_json
+end
+
 # ==================
 # Main fields

@@ -25,10 +41,8 @@
   accumulator.concat [munged_doi]
 end

-# the <pdc_describe_json> element contains a CDATA node with a JSON blob in it
-to_field 'pdc_describe_json_ss' do |record, accumulator, _c|
-  datacite = record.xpath("/hash/pdc_describe_json/text()").first.content
-  accumulator.concat [datacite]
+to_field 'pdc_describe_json_ss' do |_record, accumulator, context|
+  accumulator.concat [context.clipboard[:record_json]]
 end

 # Track the source of this record
@@ -99,21 +113,21 @@
 end

 # Extract the author data from the pdc_describe_json and save it on its own field as JSON
-to_field 'authors_json_ss' do |record, accumulator, _c|
-  pdc_json = record.xpath("/hash/pdc_describe_json/text()").first.content
+to_field 'authors_json_ss' do |_record, accumulator, context|
+  pdc_json = context.clipboard[:record_json]
   authors = JSON.parse(pdc_json).dig("resource", "creators") || []
   accumulator.concat [authors.to_json]
 end

-to_field 'authors_orcid_ssim' do |record, accumulator, _c|
-  pdc_json = record.xpath("/hash/pdc_describe_json/text()").first.content
+to_field 'authors_orcid_ssim' do |_record, accumulator, context|
+  pdc_json = context.clipboard[:record_json]
   authors_json = JSON.parse(pdc_json).dig("resource", "creators") || []
   orcids = authors_json.map { |author| Author.new(author).orcid }
   accumulator.concat orcids.compact.uniq
 end

-to_field 'authors_affiliation_ssim' do |record, accumulator, _c|
-  pdc_json = record.xpath("/hash/pdc_describe_json/text()").first.content
+to_field 'authors_affiliation_ssim' do |_record, accumulator, context|
+  pdc_json = context.clipboard[:record_json]
   authors_json = JSON.parse(pdc_json).dig("resource", "creators") || []
   affiliations = authors_json.map { |author| Author.new(author).affiliation_name }
   accumulator.concat affiliations.compact.uniq
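# Illustrative sketch (not part of this diff, with made-up sample data): the conversion that
# each_record caches in context.clipboard[:record_json]. The record's XML is turned into a
# Hash once, re-serialized as JSON, and the to_field blocks above can JSON.parse that cached
# string instead of repeating the XML work for every field.
require "json"
require "nokogiri"
require "active_support/core_ext/hash/conversions"

xml = "<hash><resource><doi>10.1234/example</doi></resource></hash>"
record = Nokogiri::XML(xml)
record_json = Hash.from_xml(record.xpath("/hash").first.to_xml)["hash"].to_json
JSON.parse(record_json).dig("resource", "doi") # => "10.1234/example"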
@@ -223,6 +237,10 @@

 # ==================
 # Store files metadata as a single JSON string so that we can display detailed information for each of them.
+#
+# TODO: Note that this information is duplicated with what we save in `pdc_describe_json_ss`. For large
+# datasets (e.g. those with 60K files) this is less than ideal. We should look into optimizing
+# this when we take care of https://github.com/pulibrary/pdc_discovery/issues/738
 to_field 'files_ss' do |record, accumulator, _context|
   raw_doi = record.xpath("/hash/resource/doi/text()").to_s
   files = record.xpath("/hash/files/file").map do |file|