Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor for Rubocop and Allow export by modified FileSet #120

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 71 additions & 102 deletions lib/chronopolis/exporter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,149 +4,118 @@ def initialize
@logger = Logger.new('log/chronopolis.log')
end

# rubocop:disable Metrics/AbcSize
# rubocop:disable Metrics/MethodLength
def perform_export(pid, include_metadata = true)
@logger.info "PROCESSING PID : #{pid}"
obj = ActiveFedora::Base.find(pid)

steward = steward_from_object(obj)

collection = collection_from_object(obj)
obj_dir = create_object_directory(obj, steward, collection)

@logger.info "Collection for #{pid} is #{collection}"

# mkdir for object
process_file_sets(obj, steward, collection, obj_dir, include_metadata)
write_metadata(obj, steward, collection, obj_dir) if include_metadata
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

write_metadata_file(file_set, metadata_file) if include_metadata is called in process file set. this line saves the collections metadata. Is is not redundant, right?


obj_dir = obj.id + "_" + obj.title.first
obj_dir = obj_dir.truncate(255)
obj_dir = sanitize_filename(obj_dir)
rescue ActiveFedora::ObjectNotFoundError, Ldp::Gone
@logger.error "ERROR: Pid not found #{pid}"
end

FileUtils.mkdir_p File.join('/', 'tdr', 'chronopolis', steward, collection, obj_dir)
private

# mkdirs for filesets
def process_file_sets(obj, steward, collection, obj_dir, include_metadata)
obj.file_sets.each do |file_set|
target_filename = file_set.id + "_" + file_set.title.first
target_filename = target_filename.truncate(255)
target_filename = sanitize_filename(target_filename)
FileUtils.mkdir_p File.join('/', 'tdr', 'chronopolis', steward, collection, obj_dir, target_filename)
target_file = File.join('/', 'tdr', 'chronopolis', steward, collection, obj_dir, target_filename, target_filename)
metadata_file = File.join('/', 'tdr', 'chronopolis', steward, collection, obj_dir, target_filename, "technical_metadata.json")
process_file_set(file_set, steward, collection, obj_dir, include_metadata)
end
end

target_file = check_for_file_extension(target_file)
record = File.new target_file, 'wb'
def process_file_set(file_set, steward, collection, obj_dir, include_metadata)
target_file, metadata_file = prepare_file_paths(file_set, steward, collection, obj_dir)

@logger.info "Writing fileset to #{target_file}"
write_file(file_set, target_file)
write_metadata_file(file_set, metadata_file) if include_metadata
end

record.write file_set.original_file.content
record.flush
record.close
def write_file(file_set, target_file)
File.open(target_file, 'wb') do |file|
@logger.info "Writing fileset to #{target_file}"
file.write(file_set.original_file.content)
end
rescue StandardError => e
@logger.error "ERROR writing file: #{e.message}"
end

next unless include_metadata
json = JSON.parse(file_set.characterization_proxy.metadata.attributes.to_json)
json = JSON.pretty_generate(json)
metadata = File.new metadata_file, "w"
def write_metadata_file(file_set, metadata_file)
metadata = JSON.pretty_generate(file_set.characterization_proxy.metadata.attributes)

File.open(metadata_file, 'w') do |file|
@logger.info "Writing metadata to #{metadata_file}"

metadata.write json
metadata.flush
metadata.close
file.write(metadata)
end
rescue StandardError => e
@logger.error "ERROR writing metadata: #{e.message}"
end

# write out metadata
if include_metadata
json = JSON.parse(obj.to_json)
json = JSON.pretty_generate(json)
metadata_file = File.join('/', 'tdr', 'chronopolis', steward, collection, obj_dir, "metadata.json")
metadata = File.new metadata_file, "w"
def prepare_file_paths(file_set, steward, collection, obj_dir)
sanitized_name = sanitize_filename(file_set.id + "_" + file_set.title.first).truncate(255)
target_dir = File.join('/', 'tdr', 'chronopolis', steward, collection, obj_dir, sanitized_name)
FileUtils.mkdir_p(target_dir)

@logger.info "Writing metadata to #{metadata_file}"
mime_extension = file_set.mime_type&.split('/')&.last
filename_with_extension = [sanitized_name, mime_extension].compact.join('.')
target_file = validate_file_length(File.join(target_dir, filename_with_extension))
metadata_file = File.join(target_dir, "technical_metadata.json")

metadata.write json
metadata.flush
metadata.close
[target_file, metadata_file]
end

def validate_file_length(file_path)
if file_path.length > 254
random_string = SecureRandom.alphanumeric(10)
@logger.error "File path too long: #{file_path.length} characters. Mapping to random string."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably should log the random string incase there is any confusion about which file it is.

File.join(File.dirname(file_path), random_string)
else
file_path
end
rescue ActiveFedora::ObjectNotFoundError
@logger.error "ERROR Pid not found #{pid}"
rescue Ldp::Gone
@logger.error "ERROR Pid not found #{pid}"
end

private
def write_metadata(obj, steward, collection, obj_dir)
metadata_file = File.join('/', 'tdr', 'chronopolis', steward, collection, obj_dir, "metadata.json")
metadata = JSON.pretty_generate(obj.attributes)

def check_for_file_extension(target_file)
if File.extname(target_file) == ""
mime = file_set.mime_type
mime_string = if mime.nil? || mime == ""
""
else
"." + mime.split('/')[1]
end
target_file = File.join('/', 'tdr', 'chronopolis', steward, collection, obj_dir, target_filename, target_filename + mime_string)
File.open(metadata_file, 'w') do |file|
@logger.info "Writing metadata to #{metadata_file}"
file.write(metadata)
end
rescue StandardError => e
@logger.error "ERROR writing metadata: #{e.message}"
end

target_file
def create_object_directory(obj, steward, collection)
obj_dir = sanitize_filename("#{obj.id}_#{obj.title.first}").truncate(255)
obj_dir_path = File.join('/', 'tdr', 'chronopolis', steward, collection, obj_dir)
FileUtils.mkdir_p(obj_dir_path)
obj_dir
end

# rubocop:disable Metrics/CyclomaticComplexity
# rubocop:disable Metrics/PerceivedComplexity
def collection_from_object(obj)
# get collection for object
collections = obj.member_of_collections

return "uncollected" if collections.blank?

collection_titles = []
collection_ids = []

collections.each do |collection_inner|
collection_titles << collection_inner.title.first
collection_ids << collection_inner.id
end

index = collection_titles.index("Collection Descriptions")
saved_index = 0
if !index.nil? && collection_titles.length > 1
collection_titles.delete("Collection Descriptions")
saved_index = index.positive? ? 0 : 1
end
@logger.info "COL TITLES : #{collection_titles}"

index = collection_titles.index("Electronic Theses and Dissertations")
collection = if index.nil?
collections[saved_index].id + "_" + collections[saved_index].title.first
else
collection_ids[index] + "_" + "Electronic Theses and Dissertations"
end
prioritized_collection = prioritize_collection(collections)
sanitize_filename("#{prioritized_collection.id}_#{prioritized_collection.title.first}").truncate(255)
end

collection = collection.truncate(255)
sanitize_filename(collection)
def prioritize_collection(collections)
priority_titles = ["Electronic Theses and Dissertations", "Collection Descriptions"]
collections.find { |c| priority_titles.include?(c.title.first) } || collections.first
end

def steward_from_object(obj)
# get steward for top directory
steward = obj.steward
steward = steward.presence || "no_steward"

steward = obj.steward.presence || "no_steward"
@logger.info "Steward for #{obj.id} is #{steward}"

sanitize_filename(steward)
end

def sanitize_filename(filename)
# Split the name when finding a period which is preceded by some
# character, and is followed by some character other than a period,
# if there is no following period that is followed by something
# other than a period (yeah, confusing, I know)
fn = filename.split(/(?<=.)\.(?=[^.])(?!.*\.[^.])/m)

# We now have one or two parts (depending on whether we could find
# a suitable period). For each of these parts, replace any unwanted
# sequence of characters with an underscore
fn.map! { |s| s.gsub(/[^a-z0-9\-]+/i, '_') }

# Finally, join the parts with a period and return the result
fn.join '.'
filename.gsub(/[^a-z0-9\-_\.]/i, '_')
end
end
85 changes: 80 additions & 5 deletions lib/tasks/chronopolis.rake
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,100 @@ require 'rake'

desc "chronopolis"
task :chronopolis_export_by_id, [:pid] => [:environment] do |_t, args|
exporter = Chronopolis::Exporter.new
pid = args[:pid]
if pid.blank?
Rails.logger.error "PID is required for exporting."
return
end

exporter = Chronopolis::Exporter.new
exporter.perform_export(pid)
end

desc "chronopolis"
task chronopolis: :environment do
exporter = Chronopolis::Exporter.new
CSV.foreach("/usr/local/samvera/epigaea/chronopolis.txt", headers: false, header_converters: :symbol, encoding: "ISO8859-1:utf-8") do |row|
pid = row[0]
process_csv("/usr/local/samvera/epigaea/chronopolis.txt") do |pid|
exporter.perform_export(pid)
end
end

desc "chronopolis export of only binaries"
task chronopolis_binary_only: :environment do
exporter = Chronopolis::Exporter.new
CSV.foreach("/usr/local/samvera/epigaea/chronopolis.txt", headers: false, header_converters: :symbol, encoding: "ISO8859-1:utf-8") do |row|
pid = row[0]
process_csv("/usr/local/samvera/epigaea/chronopolis.txt") do |pid|
exporter.perform_export(pid, false)
end
end

desc 'Run an eDisMax query with parameters'
task :edismax_query, [:start_date] => :environment do |_t, args|
args.with_defaults(start_date: '2023-06-01T00:00:00Z')
exporter = Chronopolis::Exporter.new

solr_url = ActiveFedora::SolrService.instance.conn.uri.to_s
solr = RSolr.connect(url: solr_url)

query_params = construct_main_query(args[:start_date])
response = solr.get('select', params: query_params)
Rails.logger.info "Query executed successfully!"

ids = extract_ids_from_response(response)
Rails.logger.info "Found #{ids.size} IDs to process."

ids.each do |id|
process_member_ids(solr, id, exporter)
end
end

# Helper Methods

def process_csv(file_path)
unless File.exist?(file_path)
Rails.logger.error "File not found: #{file_path}"
return
end

CSV.foreach(file_path, headers: false, header_converters: :symbol, encoding: "ISO8859-1:utf-8") do |row|
pid = row[0]
yield(pid) if block_given?
end
end

def construct_main_query(start_date)
{
'q.alt': "{!term f=has_model_ssim}FileSet",
fq: "system_modified_dtsi:[#{start_date} TO *]",
rows: 1_000_000,
fl: 'id',
defType: 'edismax',
wt: 'json'
}
end

def extract_ids_from_response(response)
response.dig('response', 'docs')&.map { |doc| doc['id'] } || []
end

# rubocop:disable Metrics/MethodLength
def process_member_ids(solr, id, exporter)
member_query_params = {
q: '*:*',
fq: "member_ids_ssim:#{id}",
fl: 'id',
rows: 10,
wt: 'json'
}

member_response = solr.get('select', params: member_query_params)
member_ids = extract_ids_from_response(member_response)

pid = member_ids.first
if pid.present?
Rails.logger.info "Exporting PID: #{pid}"
exporter.perform_export(pid)
else
Rails.logger.warn "No members found for ID: #{id}"
end
end
# rubocop:enable Metrics/MethodLength
Loading