Skip to content

Commit

Permalink
Add preprocessor for Langmuir collection CSV files
Browse files Browse the repository at this point in the history
The preprocessor accepts existing Langmuir CSVs with one file per row and
returns a new CSV with each work and fileset in its own row.

* All of the rows for a single work are grouped together
* Each group begins with a row containing the work-level metadata
* All files associated with a fileset are listed in the same row
* Filesets are listed in sequence order and given appropriate labels
* Blank lines in the source CSV are ignored
* The preprocessor adds a deduplication_key field
  • Loading branch information
mark-dce committed Oct 9, 2019
1 parent b2df8c8 commit 478fcd8
Show file tree
Hide file tree
Showing 4 changed files with 234 additions and 0 deletions.
101 changes: 101 additions & 0 deletions app/importers/langmuir_preprocessor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# frozen_string_literal: true
require 'csv'

##
# Utility service that reshapes a Langmuir Pull List CSV (one file per row)
# into a format suitable for ingest by the curate CSV importer

class LangmuirPreprocessor
  # Path of the output CSV written by #merge.
  attr_accessor :processed_csv

  ##
  # Initialize a preprocessor instance by supplying
  # @param [String] csv the path to a CSV file containing the expected Pull List metadata
  def initialize(csv)
    @source_csv = CSV.read(csv, headers: true)
    directory = File.dirname(csv)
    extension = File.extname(csv)
    filename = File.basename(csv, extension)
    # Output lands next to the source file, e.g. manifest.csv -> manifest-processed.csv
    @processed_csv = File.join(directory, filename + "-processed.csv")
    @tree = {} # work id => { metadata:, filesets: }, built by #process_source_rows
  end

  # @return [Integer] the number of data rows in the source CSV (header excluded)
  def record_count
    @source_csv.count
  end

  # Columns prepended to the original source headers in the processed CSV.
  def additional_headers
    ['source_row', 'deduplication_key', 'type', 'fileset_label', 'preservation_master_file', 'intermediate_file']
  end

  # process_source_rows builds
  # and
  # output_work_tree writes
  # a hash of hashes:
  # { work_id => {
  #     :metadata => CSV::Row,
  #     :filesets => {
  #       index1 => CSV::Row,
  #       index2 => CSV::Row,
  #       etc. for remaining sides/pages
  #     }
  #   }
  # }
  def merge
    process_source_rows
    output_work_tree
  end

  # Fold every source row into @tree, keyed by the work's parent identifier.
  # Source row numbers are reported 1-indexed counting the header line,
  # hence the +2 offset.
  def process_source_rows
    @source_csv.each.with_index do |row, row_num|
      process_row(row, row_num + 2) if row['Digital Object - Parent Identifier'] # skip blank rows in the source csv
    end
  end

  # Write @tree to the processed CSV: for each work, one work-level metadata
  # row followed by its fileset rows in sequence order.
  def output_work_tree
    # Block form guarantees the file handle is closed even if a write raises.
    CSV.open(@processed_csv, 'w+', headers: true, write_headers: true) do |merge_csv|
      original_headers = @source_csv.headers
      merge_csv << additional_headers + original_headers
      @tree.each_value do |work|
        merge_csv << work[:metadata]
        two_sided = work[:filesets].count <= 2 # 1-2 filesets get Front/Back labels
        work[:filesets].keys.sort.each do |fileset_index|
          fileset = work[:filesets][fileset_index]
          fileset['fileset_label'] = make_label(fileset_index, two_sided)
          merge_csv << fileset
        end
      end
    end
  end

  # Merge one source row into @tree: capture work-level metadata when this is
  # the designated metadata row, and record the row's file under its fileset's
  # sequence number in either the preservation-master or intermediate column.
  def process_row(row, source_row)
    deduplication_key = row['Digital Object - Parent Identifier']
    sequence_number, target_file, metadata_row = extract_structure(row)
    @tree[deduplication_key] ||= { metadata: nil, filesets: {} } # create a placeholder if we don't have one for this key
    @tree[deduplication_key][:metadata] = extract_metadata(row, source_row) if metadata_row
    @tree[deduplication_key][:filesets][sequence_number] ||= CSV::Row.new(additional_headers, [source_row, deduplication_key, 'fileset'])
    @tree[deduplication_key][:filesets][sequence_number][target_file] = row['Filename']
  end

  # Parse structural information out of the row's Filename.
  # @return [Array(Integer, String, Boolean)] the fileset sequence number,
  #   the target file column name, and whether this row carries the
  #   work-level metadata (sequence 1's ARCH file).
  # NOTE(review): assumes Filename always matches /P0+\d+_(ARCH|PROD)/;
  # a non-matching filename raises NoMethodError here — confirm upstream validation.
  def extract_structure(row)
    filename = row['Filename']
    p_number = filename.scan(/P0+(\d+)_(ARCH|PROD)/)[0][0].to_i
    target_file = filename.include?('ARCH') ? 'preservation_master_file' : 'intermediate_file'
    metadata_row = p_number == 1 && target_file == 'preservation_master_file'
    [p_number, target_file, metadata_row]
  end

  # Build the work-level metadata row: the six additional columns followed by
  # every field from the original source row.
  def extract_metadata(row, source_row)
    deduplication_key = row['Digital Object - Parent Identifier']
    processed_row = CSV::Row.new(additional_headers, [source_row, deduplication_key, 'work'])
    processed_row << row.to_hash
  end

  # Label a fileset by its sequence position: Front/Back for works with at
  # most two sides, otherwise "Side N".
  def make_label(side, two_sided)
    if two_sided
      side == 1 ? 'Front' : 'Back'
    else
      "Side #{side}"
    end
  end
end
27 changes: 27 additions & 0 deletions lib/tasks/curate_langmuir.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# frozen_string_literal: true
namespace :curate do
  desc "Langmuir preprocessing"
  task langmuir: :environment do
    # Accept the source path as either CSV=... or csv=... on the command line.
    langmuir_csv = ENV['CSV'] || ENV['csv'] || ''
    # Require an extra argument beyond the task name and a .csv extension.
    valid_args = \
      ARGV.length > 1 &&
      File.extname(langmuir_csv) == '.csv'
    if valid_args
      preprocessor = LangmuirPreprocessor.new(langmuir_csv)
      preprocessor.merge
      puts 'Rows processed: ' + preprocessor.record_count.to_s
      puts 'Processed file: ' + File.basename(preprocessor.processed_csv)
    else
      puts <<~HEREDOC
        Langmuir preprocessor
        USAGE:
        rake curate:langmuir csv=manifest.csv
        RETURNS:
        manifest-processed.csv in the same folder as manifest.csv
      HEREDOC
    end
  end
end
Loading

0 comments on commit 478fcd8

Please sign in to comment.