Skip to content

Commit

Permalink
Add preprocessor for Langmuir collection CSV files
Browse files Browse the repository at this point in the history
The preprocessor accepts existing Langmuir CSVs with one file per row and
returns a new CSV with each work and fileset in its own row.

* All of the rows for a single work are grouped together
* Each group begins with a row containing the work-level metadata
* All files associated with a fileset are listed in the same row
* Filesets are listed in sequence order and given appropriate labels
* Blank lines in the source CSV are ignored
* The preprocessor adds a deduplication_key field
  • Loading branch information
mark-dce committed Oct 9, 2019
1 parent b2df8c8 commit 478fcd8
Show file tree
Hide file tree
Showing 4 changed files with 234 additions and 0 deletions.
101 changes: 101 additions & 0 deletions app/importers/langmuir_preprocessor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# frozen_string_literal: true
require 'csv'

##
# Utility service that reshapes a Langmuir Pull List CSV (one file per row)
# into a format suitable for ingest by the curate CSV importer

class LangmuirPreprocessor
  # Path of the output CSV written by #merge.
  attr_accessor :processed_csv

  ##
  # Initialize a preprocessor instance by supplying
  # @param [String] csv the path to a CSV file containing the expected Pull List metadata
  def initialize(csv)
    @source_csv = CSV.read(csv, headers: true)
    directory = File.dirname(csv)
    extension = File.extname(csv)
    filename = File.basename(csv, extension)
    # Output lands next to the source file, e.g. manifest.csv -> manifest-processed.csv
    @processed_csv = File.join(directory, filename + "-processed.csv")
    @tree = {} # work id => { metadata:, filesets: }, built by #process_source_rows
  end

  # @return [Integer] the number of data rows in the source CSV (header excluded)
  def record_count
    @source_csv.count
  end

  # Columns prepended to the original source headers in the processed CSV.
  def additional_headers
    ['source_row', 'deduplication_key', 'type', 'fileset_label', 'preservation_master_file', 'intermediate_file']
  end

  # process_source_rows builds
  # and
  # output_work_tree writes
  # a hash of hashes:
  # { work_id => {
  #     :metadata => CSV::Row,
  #     :filesets => {
  #       index1 => CSV::Row,
  #       index2 => CSV::Row,
  #       etc. for remaining sides/pages
  #     }
  #   }
  # }
  def merge
    process_source_rows
    output_work_tree
  end

  # Fold every source row into @tree, keyed by the work's parent identifier.
  # Source row numbers are reported 1-indexed counting the header line,
  # hence the +2 offset.
  def process_source_rows
    @source_csv.each.with_index do |row, row_num|
      process_row(row, row_num + 2) if row['Digital Object - Parent Identifier'] # skip blank rows in the source csv
    end
  end

  # Write @tree to the processed CSV: for each work, one work-level metadata
  # row followed by its fileset rows in sequence order.
  def output_work_tree
    # Block form guarantees the file handle is closed even if a write raises.
    CSV.open(@processed_csv, 'w+', headers: true, write_headers: true) do |merge_csv|
      original_headers = @source_csv.headers
      merge_csv << additional_headers + original_headers
      @tree.each_value do |work|
        merge_csv << work[:metadata]
        two_sided = work[:filesets].count <= 2 # 1-2 filesets get Front/Back labels
        work[:filesets].keys.sort.each do |fileset_index|
          fileset = work[:filesets][fileset_index]
          fileset['fileset_label'] = make_label(fileset_index, two_sided)
          merge_csv << fileset
        end
      end
    end
  end

  # Merge one source row into @tree: capture work-level metadata when this is
  # the designated metadata row, and record the row's file under its fileset's
  # sequence number in either the preservation-master or intermediate column.
  def process_row(row, source_row)
    deduplication_key = row['Digital Object - Parent Identifier']
    sequence_number, target_file, metadata_row = extract_structure(row)
    @tree[deduplication_key] ||= { metadata: nil, filesets: {} } # create a placeholder if we don't have one for this key
    @tree[deduplication_key][:metadata] = extract_metadata(row, source_row) if metadata_row
    @tree[deduplication_key][:filesets][sequence_number] ||= CSV::Row.new(additional_headers, [source_row, deduplication_key, 'fileset'])
    @tree[deduplication_key][:filesets][sequence_number][target_file] = row['Filename']
  end

  # Parse structural information out of the row's Filename.
  # @return [Array(Integer, String, Boolean)] the fileset sequence number,
  #   the target file column name, and whether this row carries the
  #   work-level metadata (sequence 1's ARCH file).
  # NOTE(review): assumes Filename always matches /P0+\d+_(ARCH|PROD)/;
  # a non-matching filename raises NoMethodError here — confirm upstream validation.
  def extract_structure(row)
    filename = row['Filename']
    p_number = filename.scan(/P0+(\d+)_(ARCH|PROD)/)[0][0].to_i
    target_file = filename.include?('ARCH') ? 'preservation_master_file' : 'intermediate_file'
    metadata_row = p_number == 1 && target_file == 'preservation_master_file'
    [p_number, target_file, metadata_row]
  end

  # Build the work-level metadata row: the six additional columns followed by
  # every field from the original source row.
  def extract_metadata(row, source_row)
    deduplication_key = row['Digital Object - Parent Identifier']
    processed_row = CSV::Row.new(additional_headers, [source_row, deduplication_key, 'work'])
    processed_row << row.to_hash
  end

  # Label a fileset by its sequence position: Front/Back for works with at
  # most two sides, otherwise "Side N".
  def make_label(side, two_sided)
    if two_sided
      side == 1 ? 'Front' : 'Back'
    else
      "Side #{side}"
    end
  end
end
27 changes: 27 additions & 0 deletions lib/tasks/curate_langmuir.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# frozen_string_literal: true
namespace :curate do
  desc "Langmuir preprocessing"
  task langmuir: :environment do
    # Accept the source path as either CSV=... or csv=... on the command line.
    langmuir_csv = ENV['CSV'] || ENV['csv'] || ''
    # Require an extra argument beyond the task name and a .csv extension.
    valid_args = \
      ARGV.length > 1 &&
      File.extname(langmuir_csv) == '.csv'
    if valid_args
      preprocessor = LangmuirPreprocessor.new(langmuir_csv)
      preprocessor.merge
      puts 'Rows processed: ' + preprocessor.record_count.to_s
      puts 'Processed file: ' + File.basename(preprocessor.processed_csv)
    else
      puts <<~HEREDOC
        Langmuir preprocessor
        USAGE:
        rake curate:langmuir csv=manifest.csv
        RETURNS:
        manifest-processed.csv in the same folder as manifest.csv
      HEREDOC
    end
  end
end
Loading

0 comments on commit 478fcd8

Please sign in to comment.