Add preprocessor for Langmuir collection CSV files #667

Merged · 1 commit · Oct 9, 2019
101 changes: 101 additions & 0 deletions app/importers/langmuir_preprocessor.rb
@@ -0,0 +1,101 @@
# frozen_string_literal: true
require 'csv'

##
# Utility service and methods that merge metadata from a CSV Pull List and MARCXML records
# into a format suitable for ingest by the curate CSV importer

class LangmuirPreprocessor
attr_accessor :processed_csv

##
# Initialize a preprocessor instance
# @param [String] csv the path to a CSV file containing the expected Pull List metadata
def initialize(csv)
@source_csv = CSV.read(csv, headers: true)
directory = File.dirname(csv)
extension = File.extname(csv)
filename = File.basename(csv, extension)
@processed_csv = File.join(directory, filename + "-processed.csv")
@tree = {}
end

def record_count
@source_csv.count
end

def additional_headers
['source_row', 'deduplication_key', 'type', 'fileset_label', 'preservation_master_file', 'intermediate_file']
end

# process_source_rows builds, and output_work_tree writes out, a hash of hashes:
# { work_id => {
#     :metadata => CSV::Row,
#     :filesets => {
#       index1 => CSV::Row,
#       index2 => CSV::Row,
#       etc. for remaining sides/pages
#     }
#   }
# }
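#
# A hypothetical example after processing one two-sided item (the key below is a placeholder):
#   @tree['item_001'][:metadata]    # CSV::Row with type 'work' plus the original pull list columns
#   @tree['item_001'][:filesets][1] # CSV::Row with type 'fileset' and the ARCH/PROD filenames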
def merge
process_source_rows
output_work_tree
end

def process_source_rows
@source_csv.each.with_index do |row, row_num|
process_row(row, row_num + 2) if row['Digital Object - Parent Identifier'] # skip blank rows in the source csv
end
end

def output_work_tree
merge_csv = CSV.open(@processed_csv, 'w+', headers: true, write_headers: true)
original_headers = @source_csv.headers
merge_csv << additional_headers + original_headers
@tree.each_value do |work|
merge_csv << work[:metadata]
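# works with one or two filesets are labeled Front/Back; longer works get 'Side N' labels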
two_sided = work[:filesets].count <= 2
work[:filesets].keys.sort.each do |fileset_index|
fileset = work[:filesets][fileset_index]
fileset['fileset_label'] = make_label(fileset_index, two_sided)
merge_csv << fileset
end
end
merge_csv.close
end

def process_row(row, source_row)
deduplication_key = row['Digital Object - Parent Identifier']
sequence_number, target_file, metadata_row = extract_structure(row)
@tree[deduplication_key] ||= { metadata: nil, filesets: {} } # create a placeholder if we don't have one for this key
@tree[deduplication_key][:metadata] = extract_metadata(row, source_row) if metadata_row
@tree[deduplication_key][:filesets][sequence_number] ||= CSV::Row.new(additional_headers, [source_row, deduplication_key, 'fileset'])
@tree[deduplication_key][:filesets][sequence_number][target_file] = row['Filename']
end

def extract_structure(row)
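# Derives [sequence_number, target_file_column, metadata_row?] from the Filename column.
# A hypothetical example, assuming a filename such as 'P0002_ARCH_scan.tif':
#   extract_structure(row) #=> [2, 'preservation_master_file', false]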
filename = row['Filename']
p_number = filename.scan(/P0+(\d+)_(ARCH|PROD)/)[0][0].to_i
target_file = filename.include?('ARCH') ? 'preservation_master_file' : 'intermediate_file'
metadata_row = p_number == 1 && target_file == 'preservation_master_file'
[p_number, target_file, metadata_row]
end

def extract_metadata(row, source_row)
deduplication_key = row['Digital Object - Parent Identifier']
processed_row = CSV::Row.new(additional_headers, [source_row, deduplication_key, 'work'])
processed_row << row.to_hash
end

def make_label(side, two_sided)
if two_sided
side == 1 ? 'Front' : 'Back'
else
"Side #{side}"
end
end
end
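
For reference, a minimal usage sketch of the class above (the pull list path is a placeholder, not taken from this PR):

# Assumes a pull list at tmp/pull_list.csv with the expected headers,
# including 'Digital Object - Parent Identifier' and 'Filename'.
preprocessor = LangmuirPreprocessor.new('tmp/pull_list.csv')
preprocessor.merge
puts preprocessor.record_count   # number of data rows read from the source CSV
puts preprocessor.processed_csv  # => "tmp/pull_list-processed.csv"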
27 changes: 27 additions & 0 deletions lib/tasks/curate_langmuir.rake
@@ -0,0 +1,27 @@
# frozen_string_literal: true
namespace :curate do
desc "Langmuir preprocessing"
task langmuir: :environment do
langmuir_csv = ENV['CSV'] || ENV['csv'] || ''
valid_args = \
ARGV.length > 1 &&
File.extname(langmuir_csv) == '.csv'
if valid_args
preprocessor = LangmuirPreprocessor.new(langmuir_csv)
preprocessor.merge
puts 'Rows processed: ' + preprocessor.record_count.to_s
puts 'Processed file: ' + File.basename(preprocessor.processed_csv)
else
puts <<~HEREDOC
Langmuir preprocessor

USAGE:
rake curate:langmuir csv=manifest.csv

RETURNS:
manifest-processed.csv in the same folder as manifest.csv

HEREDOC
end
end
end
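
A hypothetical spot-check of the task's output from a console, assuming the pull list was named manifest.csv as in the usage text above:

require 'csv'

processed = CSV.read('manifest-processed.csv', headers: true)
processed.first['type']        # => "work"    (metadata row written first for each work)
processed[1]['type']           # => "fileset" (that work's first attached file row)
processed[1]['fileset_label']  # => "Front" when the work has one or two filesets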