diff --git a/requirements/base.in b/requirements/base.in index 320a8a5e0..52d353169 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -14,7 +14,7 @@ gunicorn==19.9.0 jsonfield==2.0.1 logutils==0.3.4.1 lxml==3.7.3 -metsrw==0.3.1 +metsrw==0.3.8 ndg-httpsclient==0.4.2 python-gnupg==0.4.0 python-keystoneclient==3.10.0 diff --git a/requirements/base.txt b/requirements/base.txt index 45ce6b28a..5460646af 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -40,7 +40,7 @@ keystoneauth1==3.14.0 # via python-keystoneclient logutils==0.3.4.1 git+https://github.com/seatme/django-longer-username.git@seatme#egg=longerusername lxml==3.7.3 -metsrw==0.3.1 +metsrw==0.3.8 monotonic==1.5 # via oslo.utils msgpack==0.6.1 # via oslo.serialization mysqlclient==1.4.2.post1 # via agentarchives diff --git a/requirements/local.txt b/requirements/local.txt index e6743cd32..5bdaf010c 100644 --- a/requirements/local.txt +++ b/requirements/local.txt @@ -45,7 +45,7 @@ logutils==0.3.4.1 git+https://github.com/seatme/django-longer-username.git@seatme#egg=longerusername lxml==3.7.3 markupsafe==1.1.1 # via jinja2 -metsrw==0.3.1 +metsrw==0.3.8 monotonic==1.5 msgpack==0.6.1 mysqlclient==1.4.2.post1 diff --git a/requirements/production.txt b/requirements/production.txt index 99db6a4c9..df2c6fb6c 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -41,7 +41,7 @@ keystoneauth1==3.14.0 logutils==0.3.4.1 git+https://github.com/seatme/django-longer-username.git@seatme#egg=longerusername lxml==3.7.3 -metsrw==0.3.1 +metsrw==0.3.8 monotonic==1.5 msgpack==0.6.1 mysqlclient==1.4.2.post1 diff --git a/requirements/test.txt b/requirements/test.txt index c55bb0f19..df77c241b 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -52,7 +52,7 @@ keystoneauth1==3.14.0 logutils==0.3.4.1 git+https://github.com/seatme/django-longer-username.git@seatme#egg=longerusername lxml==3.7.3 -metsrw==0.3.1 +metsrw==0.3.8 mock==3.0.5 # via pytest-mock, vcrpy monotonic==1.5 more-itertools==5.0.0 # via pytest diff --git a/storage_service/common/management/commands/import_aip.py b/storage_service/common/management/commands/import_aip.py index dec6fbd46..026e397fb 100644 --- a/storage_service/common/management/commands/import_aip.py +++ b/storage_service/common/management/commands/import_aip.py @@ -63,7 +63,7 @@ from django.utils.six.moves import input from administration.models import Settings -from common import utils +from common import premis, utils from locations import models @@ -360,7 +360,7 @@ def compress(aip_model_inst, compression_algorithm): """Use the Package model's compress_package method to compress the AIP being imported, update the Package model's ``size`` attribute, retrieve PREMIS agents and event for the compression (using the package model's - ``get_premis_aip_compression_event`` method) and return a 3-tuple: + ``create_premis_aip_compression_event`` method) and return a 3-tuple: (aip_model_inst, compression_event, compression_agents). """ if not compression_algorithm: @@ -379,8 +379,8 @@ def compress(aip_model_inst, compression_algorithm): aip_model_inst.current_path = new_current_path shutil.rmtree(compressed_aip_parent_path) aip_model_inst.size = utils.recalculate_size(new_full_path) - compression_agents = utils.get_ss_premis_agents() - compression_event = aip_model_inst.get_premis_aip_compression_event( + compression_agents = [premis.SS_AGENT] + compression_event = premis.create_premis_aip_compression_event( details["event_detail"], details["event_outcome_detail_note"], agents=compression_agents, diff --git a/storage_service/common/premis.py b/storage_service/common/premis.py new file mode 100644 index 000000000..9e81b7a83 --- /dev/null +++ b/storage_service/common/premis.py @@ -0,0 +1,349 @@ +""" +PREMIS metadata generation. +""" +from __future__ import absolute_import, unicode_literals + +import uuid + +import metsrw +from django.utils import timezone + +from storage_service import __version__ as ss_version + + +PREMIS_META = metsrw.plugins.premisrw.PREMIS_3_0_META +SS_AGENT = metsrw.plugins.premisrw.PREMISAgent( + data=( + "agent", + PREMIS_META, + ( + "agent_identifier", + ("agent_identifier_type", "preservation system"), + ( + "agent_identifier_value", + "Archivematica-Storage-Service-{}".format(ss_version), + ), + ), + ("agent_name", "Archivematica Storage Service"), + ("agent_type", "software"), + ) +) + + +def timestamp(): + return timezone.now().strftime("%Y-%m-%dT%H:%M:%S") + + +def add_agents_to_event_as_list(event, agents): + """Add agents in ``agents`` to the list ``event`` which represents a + PREMIS:EVENT. + :param list event: a PREMIS:EVENT represented as a list + :param iterable agents: an iterable of premisrw.PREMISAgent instances. + """ + for agent in agents: + event.append( + ( + "linking_agent_identifier", + ("linking_agent_identifier_type", agent.identifier_type), + ("linking_agent_identifier_value", agent.identifier_value), + ) + ) + return event + + +def create_replication_event( + original_package_uuid, replica_package_uuid, event_uuid=None, agents=None +): + """Return a PREMISEvent for replication of an AIP. + """ + outcome_detail_note = ( + "Replicated Archival Information Package (AIP) {} by creating" + " replica {}.".format(original_package_uuid, replica_package_uuid) + ) + if not agents: + agents = [SS_AGENT] + if not event_uuid: + event_uuid = str(uuid.uuid4()) + event = [ + "event", + PREMIS_META, + ( + "event_identifier", + ("event_identifier_type", "UUID"), + ("event_identifier_value", event_uuid), + ), + ("event_type", "replication"), + ("event_date_time", timestamp()), + ( + "event_detail_information", + ("event_detail", "Replication of an Archival Information Package"), + ), + ( + "event_outcome_information", + ("event_outcome", "success"), + ( + "event_outcome_detail", + ("event_outcome_detail_note", outcome_detail_note), + ), + ), + ] + event = tuple(add_agents_to_event_as_list(event, agents)) + + return metsrw.plugins.premisrw.PREMISEvent(data=event) + + +def create_premis_aip_creation_event( + package_uuid, master_aip_uuid=None, agents=None, inst=True +): + """Return a PREMISEvent for creation of an AIP.""" + if master_aip_uuid: + outcome_detail_note = ( + "Created Archival Information Package (AIP) {} by replicating" + " previously created AIP {}".format(package_uuid, master_aip_uuid) + ) + else: + outcome_detail_note = "Created Archival Information Package (AIP) {}".format( + package_uuid + ) + if not agents: + agents = [SS_AGENT] + event = [ + "event", + PREMIS_META, + ( + "event_identifier", + ("event_identifier_type", "UUID"), + ("event_identifier_value", str(uuid.uuid4())), + ), + # Question: use the more specific 'information package creation' + # PREMIS event? + ("event_type", "creation"), + ("event_date_time", timestamp()), + ( + "event_detail_information", + ("event_detail", "Creation of an Archival Information Package"), + ), + ( + "event_outcome_information", + ("event_outcome", "success"), + ( + "event_outcome_detail", + ("event_outcome_detail_note", outcome_detail_note), + ), + ), + ] + event = tuple(add_agents_to_event_as_list(event, agents)) + + return metsrw.plugins.premisrw.PREMISEvent(data=event) + + +def create_premis_aip_compression_event( + event_detail, event_outcome_detail_note, agents=None +): + """Return a PREMISEvent describing the compression of an AIP.""" + if not agents: + agents = [SS_AGENT] + event = [ + "event", + PREMIS_META, + ( + "event_identifier", + ("event_identifier_type", "UUID"), + ("event_identifier_value", str(uuid.uuid4())), + ), + ("event_type", "compression"), + ("event_date_time", timestamp()), + ("event_detail_information", ("event_detail", event_detail)), + ( + "event_outcome_information", + ("event_outcome", "success"), + ( + "event_outcome_detail", + ("event_outcome_detail_note", event_outcome_detail_note), + ), + ), + ] + event = tuple(add_agents_to_event_as_list(event, agents)) + + return metsrw.plugins.premisrw.PREMISEvent(data=event) + + +def create_replication_validation_event( + replica_package_uuid, + checksum_report, + master_aip_uuid, + fixity_report=None, + agents=None, +): + """Return a PREMISEvent for validation of AIP replication. + """ + success = checksum_report["success"] + if fixity_report: + success = fixity_report["success"] and success + outcome = success and "success" or "failure" + detail = ( + "Validated the replication of Archival Information Package (AIP)" + " {master_aip_uuid} to replica AIP {replica_aip_uuid}".format( + master_aip_uuid=master_aip_uuid, replica_aip_uuid=replica_package_uuid + ) + ) + if fixity_report: + detail += " by performing a BagIt fixity check and by comparing" " checksums" + outcome_detail_note = "{}\n{}".format( + fixity_report["message"], checksum_report["message"] + ) + else: + detail += " by comparing checksums" + outcome_detail_note = checksum_report["message"] + if not agents: + agents = [SS_AGENT] + event = [ + "event", + PREMIS_META, + ( + "event_identifier", + ("event_identifier_type", "UUID"), + ("event_identifier_value", str(uuid.uuid4())), + ), + ("event_type", "validation"), + ("event_date_time", timestamp()), + ("event_detail_information", ("event_detail", detail)), + ( + "event_outcome_information", + ("event_outcome", outcome), + ( + "event_outcome_detail", + ("event_outcome_detail_note", outcome_detail_note), + ), + ), + ] + event = tuple(add_agents_to_event_as_list(event, agents)) + + return metsrw.plugins.premisrw.PREMISEvent(data=event) + + +def create_replication_derivation_relationship( + related_aip_uuid, replication_event_uuid, premis_version=None +): + """Return a PREMIS relationship of type derivation relating an implicit + PREMIS object (an AIP) to some to related AIP (with UUID + ``related_aip_uuid``) via a replication event with UUID + ``replication_event_uuid``. Note the complication wherein PREMIS v. 2.2 + uses 'Identification' where PREMIS v. 3.0 uses 'Identifier'. + """ + if not premis_version: + premis_version = PREMIS_META["version"] + related_object_identifier = {"2.2": "related_object_identification"}.get( + premis_version, "related_object_identifier" + ) + related_event_identifier = {"2.2": "related_event_identification"}.get( + premis_version, "related_event_identifier" + ) + return ( + "relationship", + ("relationship_type", "derivation"), + ("relationship_sub_type", ""), + ( + related_object_identifier, + ("related_object_identifier_type", "UUID"), + ("related_object_identifier_value", related_aip_uuid), + ), + ( + related_event_identifier, + ("related_event_identifier_type", "UUID"), + ("related_event_identifier_value", replication_event_uuid), + ), + ) + + +def create_aip_premis_object( + package_uuid, + package_size, + package_extension, + message_digest_algorithm, + message_digest, + archive_tool, + compression_program_version, + composition_level=1, + premis_relationships=None, +): + """Return a element for this package's (AIP's) pointer + file. + :param str package_uuid: unique identifier for the PREMIS object + :param str package_size: size of object in bytes + :param str package_extension: object file extension, e.g. .7z + :param str message_digest_algorithm: name of the algorithm used to generate + ``message_digest``. + :param str message_digest: hex string checksum for the + packaged/compressed AIP. + :param str archive_tool: name of the tool (program) used to compress + the AIP, e.g., '7-Zip'. + :param str compression_program_version: version of ``archive_tool`` + used. + :keyword int composition_level: PREMIS composition level (e.g. 2) + :returns: as a tuple. + """ + # PRONOM ID and PRONOM name for each file extension + pronom_conversion = { + ".7z": {"puid": "fmt/484", "name": "7Zip format"}, + ".bz2": {"puid": "x-fmt/268", "name": "BZIP2 Compressed Archive"}, + } + premis_relationships = premis_relationships or [] + kwargs = dict( + xsi_type="premis:file", + identifier_value=package_uuid, + message_digest_algorithm=message_digest_algorithm, + message_digest=message_digest, + size=str(package_size), + creating_application_name=archive_tool, + creating_application_version=compression_program_version, + date_created_by_application=timestamp(), + relationship=premis_relationships, + premis_version=PREMIS_META["version"], + composition_level=str(composition_level), + ) + try: + kwargs.update( + { + "format_name": pronom_conversion[package_extension]["name"], + "format_registry_name": "PRONOM", + "format_registry_key": pronom_conversion[package_extension]["puid"], + } + ) + except KeyError: + pass + + return metsrw.plugins.premisrw.PREMISObject(**kwargs) + + +def create_encryption_event(encr_result, key_fingerprint, gpg_version): + """Return a PREMIS:EVENT for the encryption event.""" + detail = "program=GPG; version={}; key={}".format(gpg_version, key_fingerprint) + outcome_detail_note = 'Status="{}"; Standard Error="{}"'.format( + encr_result.status.replace('"', r"\""), + encr_result.stderr.replace('"', r"\"").strip(), + ) + agents = [SS_AGENT] + event = [ + "event", + PREMIS_META, + ( + "event_identifier", + ("event_identifier_type", "UUID"), + ("event_identifier_value", str(uuid.uuid4())), + ), + ("event_type", "encryption"), + ("event_date_time", timestamp()), + ("event_detail_information", ("event_detail", detail)), + ( + "event_outcome_information", + ("event_outcome", "success"), + ( + "event_outcome_detail", + ("event_outcome_detail_note", outcome_detail_note), + ), + ), + ] + event = tuple(add_agents_to_event_as_list(event, agents)) + + return metsrw.plugins.premisrw.PREMISEvent(data=event) diff --git a/storage_service/common/tests/test_premis.py b/storage_service/common/tests/test_premis.py new file mode 100644 index 000000000..53d51c8e1 --- /dev/null +++ b/storage_service/common/tests/test_premis.py @@ -0,0 +1,35 @@ +from collections import namedtuple + +from common import premis +from storage_service import __version__ as ss_version + + +FakeGPGRet = namedtuple("FakeGPGRet", "ok status stderr") +GPG_VERSION = "1.4.16" +SUCCESS_STATUS = "good times" +SOME_FINGERPRINT = "B9C518917A958DD0B1F5E1B80C3D34DDA5958532" + + +def test_create_encryption_event(): + stderr = 'me contain " quote' + encr_result = FakeGPGRet(ok=True, status=SUCCESS_STATUS, stderr=stderr) + event = premis.create_encryption_event( + encr_result, SOME_FINGERPRINT, GPG_VERSION + ).data + assert event[0] == "event" + event = event[2:] + assert [x for x in event if x[0] == "event_type"][0][1] == "encryption" + assert [x for x in event if x[0] == "event_detail_information"][0][1][1] == ( + "program=GPG; version={}; key={}".format(GPG_VERSION, SOME_FINGERPRINT) + ) + eoi = [x for x in event if x[0] == "event_outcome_information"][0] + assert [x for x in eoi if x[0] == "event_outcome"][0][1] == "success" + assert [x for x in eoi if x[0] == "event_outcome_detail"][0][1][1] == ( + 'Status="{}"; Standard Error="{}"'.format( + SUCCESS_STATUS, stderr.replace('"', r"\"") + ) + ) + lai = [x for x in event if x[0] == "linking_agent_identifier"][0] + assert [x for x in lai if x[0] == "linking_agent_identifier_value"][0][1] == ( + "Archivematica-Storage-Service-{}".format(ss_version) + ) diff --git a/storage_service/common/utils.py b/storage_service/common/utils.py index 22fb7f837..198974387 100644 --- a/storage_service/common/utils.py +++ b/storage_service/common/utils.py @@ -10,8 +10,6 @@ import shutil import uuid -from metsrw.plugins import premisrw - from django.core.exceptions import ObjectDoesNotExist from django import http from django.utils.translation import ugettext as _ @@ -351,63 +349,6 @@ def coerce_str(string): return string -def add_agents_to_event_as_list(event, agents): - """Add agents in ``agents`` to the list ``event`` which represents a - PREMIS:EVENT. - :param list event: a PREMIS:EVENT represented as a list - :param iterable agents: an iterable of premisrw.PREMISAgent instances. - """ - for agent in agents: - event.append( - ( - "linking_agent_identifier", - ("linking_agent_identifier_type", agent.identifier_type), - ("linking_agent_identifier_value", agent.identifier_value), - ) - ) - return event - - -def mets_file_now(): - return datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") - - -def get_ss_premis_agents(inst=True): - """Return PREMIS agents for preservation events performed by the - Storage Service. - Note: Archivematica returns a 'repository code'-type agent while we - are currently just returning a 'preservation system' one. What is - the desired behaviour here? AM's agents used for compression from - db:: - - +----+-----------------------+----------------------+------------------------------------------------------+--------------------+ - | pk | agentIdentifierType | agentIdentifierValue | agentName | agentType | - +----+-----------------------+----------------------+------------------------------------------------------+--------------------+ - | 1 | preservation system | Archivematica-1.7 | Archivematica | software | - | 2 | repository code | test | test | organization | - +----+-----------------------+----------------------+------------------------------------------------------+--------------------+ - """ - agents = [ - ( - "agent", - premisrw.PREMIS_META, - ( - "agent_identifier", - ("agent_identifier_type", "preservation system"), - ( - "agent_identifier_value", - "Archivematica-Storage-Service-{}".format(ss_version), - ), - ), - ("agent_name", "Archivematica Storage Service"), - ("agent_type", "software"), - ) - ] - if inst: - return [premisrw.PREMISAgent(data=data) for data in agents] - return agents - - StorageEffects = namedtuple( "StorageEffects", ["events", "composition_level_updater", "inhibitors"] ) diff --git a/storage_service/locations/models/gpg.py b/storage_service/locations/models/gpg.py index f54a0d2f0..d6aad835c 100644 --- a/storage_service/locations/models/gpg.py +++ b/storage_service/locations/models/gpg.py @@ -7,9 +7,6 @@ import shutil import subprocess import tarfile -from uuid import uuid4 - -from metsrw.plugins import premisrw # Core Django, alphabetical from django.conf import settings @@ -20,8 +17,7 @@ # Third party dependencies, alphabetical # This project, alphabetical -from common import utils -from common import gpgutils +from common import gpgutils, premis, utils # This module, alphabetical from .location import Location @@ -184,7 +180,9 @@ def composition_level_updater(existing_composition_level): ("inhibitor_type", "GPG"), ("inhibitor_target", "All content"), ) - encryption_event = create_encryption_event(encr_result, key_fingerprint) + encryption_event = premis.create_encryption_event( + encr_result, key_fingerprint, _get_gpg_version() + ) return utils.StorageEffects( events=[encryption_event], composition_level_updater=composition_level_updater, @@ -293,16 +291,6 @@ def _encr_path2key_fingerprint(encr_path): raise GPGException(fail_msg) -# This replaces non-unicode characters with a replacement character, -# and is primarily used for arbitrary strings (e.g. filenames, paths) -# that might not be valid unicode to begin with. -# NOTE: non-DRY from archivematicaCommon/archivematicaFunctions.py -def escape(string): - if isinstance(string, str): - string = string.decode("utf-8", errors="replace") - return string - - def _abort_create_tar(path, tarpath): fail_msg = _( "Failed to create a tarfile at %(tarpath)s for dir at %(path)s" @@ -379,41 +367,6 @@ def _get_gpg_version(): return _parse_gpg_version(gpgutils.gpg().version) -def create_encryption_event(encr_result, key_fingerprint): - """Return a PREMIS:EVENT for the encryption event.""" - gpg_version = _get_gpg_version() - detail = escape( - "program=GPG; version={}; key={}".format(gpg_version, key_fingerprint) - ) - outcome_detail_note = 'Status="{}"; Standard Error="{}"'.format( - encr_result.status.replace('"', r"\""), - encr_result.stderr.replace('"', r"\"").strip(), - ) - agents = utils.get_ss_premis_agents() - event = [ - "event", - premisrw.PREMIS_META, - ( - "event_identifier", - ("event_identifier_type", "UUID"), - ("event_identifier_value", str(uuid4())), - ), - ("event_type", "encryption"), - ("event_date_time", utils.mets_file_now()), - ("event_detail", detail), - ( - "event_outcome_information", - ("event_outcome", "success"), - ( - "event_outcome_detail", - ("event_outcome_detail_note", outcome_detail_note), - ), - ), - ] - event = tuple(utils.add_agents_to_event_as_list(event, agents)) - return premisrw.PREMISEvent(data=event) - - def _gpg_decrypt(path): """Use GnuPG to decrypt the file at ``path`` and then delete the encrypted file. diff --git a/storage_service/locations/models/package.py b/storage_service/locations/models/package.py index 9de92d06d..55927d7c6 100644 --- a/storage_service/locations/models/package.py +++ b/storage_service/locations/models/package.py @@ -18,7 +18,6 @@ from django.conf import settings from django.db import models from django.utils.translation import ugettext_lazy as _ -from django.utils import timezone # Third party dependencies, alphabetical import bagit @@ -29,7 +28,7 @@ import requests # This project, alphabetical -from common import utils +from common import premis, utils from locations import signals # This module, alphabetical @@ -619,8 +618,10 @@ def replicate(self, replicator_location_uuid): replica_package.uuid, master_checksum_algorithm, ) - replication_validation_event = replica_package.get_replication_validation_event( - checksum_report=checksum_report, master_aip_uuid=self.uuid + replication_validation_event = premis.create_replication_validation_event( + replica_package.uuid, + checksum_report=checksum_report, + master_aip_uuid=self.uuid, ) # Create and write to disk the pointer file for the replica, which @@ -1010,8 +1011,15 @@ def _create_pointer_file_write_to_disk( __, compression_program_version, archive_tool = _get_compression_details_from_premis_events( premis_events, self.uuid ) - premis_object = self._create_aip_premis_object( - checksum_algorithm, checksum, archive_tool, compression_program_version + __, extension = os.path.splitext(self.current_path) + premis_object = premis.create_aip_premis_object( + self.uuid, + self.size, + extension, + checksum_algorithm, + checksum, + archive_tool, + compression_program_version, ) pointer_file = self.create_pointer_file( premis_object, @@ -1089,7 +1097,8 @@ def create_replica_pointer_file( >>> __, fixity_report = ( replica_package.get_fixity_check_report_send_signals()) >>> replication_validation_event = ( - replica_package.get_replication_validation_event( + create_replication_validation_event( + replica_package.uuid, checksum_report=checksum_report, master_aip_uuid=self.uuid, fixity_report=fixity_report, @@ -1135,24 +1144,30 @@ def create_replica_pointer_file( master_premis_agents = master_ptr_aip_fsentry.get_premis_agents() # 3. Construct the pointer file and return it - replica_premis_creation_agents = utils.get_ss_premis_agents() + replica_premis_creation_agents = [premis.SS_AGENT] __, compression_program_version, archive_tool = ( master_compression_event.compression_details ) replica_premis_relationships = [ - _get_replication_derivation_relationship( + premis.create_replication_derivation_relationship( master_aip_uuid, replication_event_uuid ) ] - replica_premis_object = replica_package._create_aip_premis_object( + __, extension = os.path.splitext(replica_package.current_path) + replica_premis_object = premis.create_aip_premis_object( + replica_package.uuid, + replica_package.size, + extension, master_checksum_algorithm, master_checksum, archive_tool, compression_program_version, premis_relationships=replica_premis_relationships, ) - replica_premis_creation_event = replica_package.get_premis_aip_creation_event( - master_aip_uuid=master_aip_uuid, agents=replica_premis_creation_agents + replica_premis_creation_event = premis.create_premis_aip_creation_event( + replica_package.uuid, + master_aip_uuid=master_aip_uuid, + agents=replica_premis_creation_agents, ) replica_premis_agents = list( set(master_premis_agents + replica_premis_creation_agents) @@ -1188,12 +1203,15 @@ def create_new_pointer_file_with_replication( old_premis_object = old_fsentry.get_premis_objects()[0] old_premis_events = old_fsentry.get_premis_events() old_premis_agents = old_fsentry.get_premis_agents() - ss_agents = utils.get_ss_premis_agents() - replication_event = self.create_replication_event( - replica_package, event_uuid=replication_event_uuid, agents=ss_agents + ss_agents = [premis.SS_AGENT] + replication_event = premis.create_replication_event( + self.uuid, + replica_package.uuid, + event_uuid=replication_event_uuid, + agents=ss_agents, ) old_premis_events.append(replication_event) - replication_relationship = _get_replication_derivation_relationship( + replication_relationship = premis.create_replication_derivation_relationship( replica_package.uuid, replication_event_uuid ) new_relationships = old_premis_object.findall("relationship") @@ -1228,7 +1246,7 @@ def create_new_pointer_file_given_storage_effects( old_premis_events = old_fsentry.get_premis_events() old_premis_agents = old_fsentry.get_premis_agents() new_premis_events = list(set(old_premis_events + storage_effects.events)) - new_premis_agents = list(set(old_premis_agents + utils.get_ss_premis_agents())) + new_premis_agents = list(set(old_premis_agents + [premis.SS_AGENT])) new_composition_level = old_composition_level if storage_effects.composition_level_updater: new_composition_level = storage_effects.composition_level_updater( @@ -1258,189 +1276,6 @@ def create_new_pointer_file_given_storage_effects( package_subtype=package_subtype, ) - def create_replication_event( - self, replica_package, event_uuid=None, agents=None, inst=True - ): - """Return a PREMIS:EVENT for replication of an AIP, as a - premisrw.PREMISEvent or, if ``inst`` is ``False``, as a python - Python tuple. - """ - outcome_detail_note = ( - "Replicated Archival Information Package (AIP) {} by creating" - " replica {}.".format(self.uuid, replica_package.uuid) - ) - if not agents: - agents = utils.get_ss_premis_agents() - if not event_uuid: - event_uuid = str(uuid4()) - event = [ - "event", - premisrw.PREMIS_META, - ( - "event_identifier", - ("event_identifier_type", "UUID"), - ("event_identifier_value", event_uuid), - ), - ("event_type", "replication"), - ("event_date_time", utils.mets_file_now()), - ("event_detail", "Replication of an Archival Information Package"), - ( - "event_outcome_information", - ("event_outcome", "success"), - ( - "event_outcome_detail", - ("event_outcome_detail_note", outcome_detail_note), - ), - ), - ] - event = tuple(utils.add_agents_to_event_as_list(event, agents)) - if inst: - return premisrw.PREMISEvent(data=event) - return event - - def get_premis_aip_creation_event( - self, master_aip_uuid=None, agents=None, inst=True - ): - """Return a PREMIS:EVENT for creation of an AIP as a Python tuple.""" - if master_aip_uuid: - outcome_detail_note = ( - "Created Archival Information Package (AIP) {} by replicating" - " previously created AIP {}".format(self.uuid, master_aip_uuid) - ) - else: - outcome_detail_note = "Created Archival Information Package (AIP) {}".format( - self.uuid - ) - if not agents: - agents = utils.get_ss_premis_agents() - event = [ - "event", - premisrw.PREMIS_META, - ( - "event_identifier", - ("event_identifier_type", "UUID"), - ("event_identifier_value", str(uuid4())), - ), - # Question: use the more specific 'information package creation' - # PREMIS event? - ("event_type", "creation"), - ("event_date_time", utils.mets_file_now()), - ("event_detail", "Creation of an Archival Information Package"), - ( - "event_outcome_information", - ("event_outcome", "success"), - ( - "event_outcome_detail", - ("event_outcome_detail_note", outcome_detail_note), - ), - ), - ] - event = tuple(utils.add_agents_to_event_as_list(event, agents)) - if inst: - return premisrw.PREMISEvent(data=event) - return event - - @staticmethod - def get_compression_event_detail(compression_algorithm): - """Return an eventDetail for the event of compressing an AIP using the - supplied algorithm. - """ - # TODO: the program should be supplied by the caller - program = { - utils.COMPRESSION_TAR: "tar", - utils.COMPRESSION_TAR_BZIP2: "tar", - }.get(compression_algorithm, "7z") - return 'program={}; algorithm="{}"'.format(program, compression_algorithm) - - def get_premis_aip_compression_event( - self, event_detail, event_outcome_detail_note, agents=None, inst=True - ): - """Return a PREMIS:EVENT describing the compression of an AIP.""" - if not agents: - agents = utils.get_ss_premis_agents() - event = [ - "event", - premisrw.PREMIS_META, - ( - "event_identifier", - ("event_identifier_type", "UUID"), - ("event_identifier_value", str(uuid4())), - ), - ("event_type", "compression"), - ("event_date_time", utils.mets_file_now()), - ("event_detail", event_detail), - ( - "event_outcome_information", - ("event_outcome", "success"), - ( - "event_outcome_detail", - ("event_outcome_detail_note", event_outcome_detail_note), - ), - ), - ] - event = tuple(utils.add_agents_to_event_as_list(event, agents)) - if inst: - return premisrw.PREMISEvent(data=event) - return event - - def get_replication_validation_event( - self, - checksum_report, - master_aip_uuid, - fixity_report=None, - agents=None, - inst=True, - ): - """Return a PREMIS:EVENT (as a tuple) for validation of AIP - replication. - """ - success = checksum_report["success"] - if fixity_report: - success = fixity_report["success"] and success - outcome = success and "success" or "failure" - detail = ( - "Validated the replication of Archival Information Package (AIP)" - " {master_aip_uuid} to replica AIP {replica_aip_uuid}".format( - master_aip_uuid=master_aip_uuid, replica_aip_uuid=self.uuid - ) - ) - if fixity_report: - detail += ( - " by performing a BagIt fixity check and by comparing" " checksums" - ) - outcome_detail_note = "{}\n{}".format( - fixity_report["message"], checksum_report["message"] - ) - else: - detail += " by comparing checksums" - outcome_detail_note = checksum_report["message"] - if not agents: - agents = utils.get_ss_premis_agents() - event = [ - "event", - premisrw.PREMIS_META, - ( - "event_identifier", - ("event_identifier_type", "UUID"), - ("event_identifier_value", str(uuid4())), - ), - ("event_type", "validation"), - ("event_date_time", utils.mets_file_now()), - ("event_detail", detail), - ( - "event_outcome_information", - ("event_outcome", outcome), - ( - "event_outcome_detail", - ("event_outcome_detail_note", outcome_detail_note), - ), - ), - ] - event = tuple(utils.add_agents_to_event_as_list(event, agents)) - if inst: - return premisrw.PREMISEvent(data=event) - return event - def create_pointer_file( self, premis_object, @@ -1551,56 +1386,6 @@ def create_pointer_file( LOGGER.info("Returning pointer file for: %s", self.uuid) return pointer_file - def _create_aip_premis_object( - self, - message_digest_algorithm, - message_digest, - archive_tool, - compression_program_version, - premis_relationships=None, - ): - """Return a element for this package's (AIP's) pointer - file. - :param str message_digest_algorithm: name of the algorithm used to generate - ``message_digest``. - :param str message_digest: hex string checksum for the - packaged/compressed AIP. - :param str archive_tool: name of the tool (program) used to compress - the AIP, e.g., '7-Zip'. - :param str compression_program_version: version of ``archive_tool`` - used. - :returns: as a tuple. - """ - # PRONOM ID and PRONOM name for each file extension - pronom_conversion = { - ".7z": {"puid": "fmt/484", "name": "7Zip format"}, - ".bz2": {"puid": "x-fmt/268", "name": "BZIP2 Compressed Archive"}, - } - __, extension = os.path.splitext(self.current_path) - now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S") # YYYY-MM-DDTHH:MM:SS - premis_relationships = premis_relationships or [] - kwargs = dict( - xsi_type="premis:file", - identifier_value=self.uuid, - message_digest_algorithm=message_digest_algorithm, - message_digest=message_digest, - size=str(self.size), - creating_application_name=archive_tool, - creating_application_version=compression_program_version, - date_created_by_application=now, - relationship=premis_relationships, - ) - try: - kwargs.update( - { - "format_name": pronom_conversion[extension]["name"], - "format_registry_key": pronom_conversion[extension]["puid"], - } - ) - except KeyError: - pass - return premisrw.PREMISObject(**kwargs) - def create_replicas(self): """Create replicas of this AIP in any replicator locations. @@ -2855,9 +2640,9 @@ def _process_pointer_file_for_reingest( """ if to_be_compressed: # Update pointer file - root = etree.parse(self.full_pointer_file_path) + mets = metsrw.METSDocument.fromfile(self.full_pointer_file_path) + aip = mets.get_file(type="Archival Information Package") # Add compression event (if compressed) - amdsec = root.find("mets:amdSec", namespaces=utils.NSMAP) if compression in (utils.COMPRESSION_7Z_BZIP, utils.COMPRESSION_7Z_LZMA): try: version = [ @@ -2882,13 +2667,11 @@ def _process_pointer_file_for_reingest( " update pointer file" ) event_detail = _("Unknown compression") - utils.mets_add_event( - amdsec, - "compression", - event_detail=event_detail, - event_outcome_detail_note="", + compression_event = premis.create_premis_aip_compression_event( + event_detail, "" ) - self._update_pointer_file(compression, root=root, path=updated_aip_path) + aip.add_premis_event(compression_event) + self._update_pointer_file(compression, mets, path=updated_aip_path) elif was_compressed: # AIP used to be compressed, but is no longer so delete pointer file os.remove(self.full_pointer_file_path) @@ -2899,162 +2682,108 @@ def _process_pointer_file_for_reingest( # END Private methods for ``finish_reingest`` # ========================================================================== - def _update_pointer_file(self, compression, root=None, path=None): + def _update_pointer_file(self, compression, mets, path=None): """Update the AIP's pointer file at the end of re-ingest.""" LOGGER.debug("Updating pointer file at %s", self.full_pointer_file_path) - if not root: - root = etree.parse(self.full_pointer_file_path) if not path: path = self.fetch_local_path() - # Update FLocat to full path - file_ = root.find( - './/mets:fileGrp[@USE="Archival Information Package"]/mets:file', - namespaces=utils.NSMAP, - ) - flocat = file_.find( - 'mets:FLocat[@OTHERLOCTYPE="SYSTEM"][@LOCTYPE="OTHER"]', - namespaces=utils.NSMAP, - ) - flocat.set(utils.PREFIX_NS["xlink"] + "href", self.full_path) + aip = mets.get_file(type="Archival Information Package") + premis_obj = aip.get_premis_objects()[0] + + premis3_nsmap = utils.NSMAP.copy() + premis3_nsmap.update(premisrw.PREMIS_3_0_NAMESPACES) - # Update fixity checksum - fixity_elem = root.find(".//premis:fixity", namespaces=utils.NSMAP) - algorithm = fixity_elem.findtext( - "premis:messageDigestAlgorithm", namespaces=utils.NSMAP - ) try: - checksum = utils.generate_checksum(path, algorithm) + checksum = utils.generate_checksum( + path, premis_obj.message_digest_algorithm + ) except ValueError: # If incorrectly parsed algorithm, default to sha512, since that is # what AM uses checksum = utils.generate_checksum(path, "sha512") - fixity_elem.find( - "premis:messageDigest", namespaces=utils.NSMAP - ).text = checksum.hexdigest() - # Update size - root.find(".//premis:size", namespaces=utils.NSMAP).text = str( - os.path.getsize(path) - ) + aip = mets.get_file(type="Archival Information Package") + aip.path = self.full_path - # Set compression-related data - transform_order = 1 - decr_transform_file = file_.find( - './/mets:transformFile[@TRANSFORMTYPE="decryption"]', namespaces=utils.NSMAP + transform_types = set( + [transform.get("type") for transform in aip.transform_files] ) - if decr_transform_file is not None: + if "decryption" in transform_types: transform_order = 2 # encryption is a prior transformation + else: + transform_order = 1 - transform_file = [] if compression in (utils.COMPRESSION_7Z_BZIP, utils.COMPRESSION_7Z_LZMA): if compression == utils.COMPRESSION_7Z_BZIP: algo = "bzip2" elif compression == utils.COMPRESSION_7Z_LZMA: algo = "lzma" - transform_file.append( - etree.Element( - utils.PREFIX_NS["mets"] + "transformFile", - TRANSFORMORDER=str(transform_order), - TRANSFORMTYPE="decompression", - TRANSFORMALGORITHM=algo, - ) + aip.transform_files.append( + { + "algorithm": algo, + "order": str(transform_order), + "type": "decompression", + } ) version = [ x for x in subprocess.check_output("7z").splitlines() if "Version" in x ][0] - format_info = { - "name": "7Zip format", - "registry_name": "PRONOM", - "registry_key": "fmt/484", - "program_name": "7-Zip", - "program_version": version, - } + extension = ".7z" + program_name = "7-Zip" elif compression in (utils.COMPRESSION_TAR_BZIP2, utils.COMPRESSION_TAR): if compression == utils.COMPRESSION_TAR_BZIP2: - transform_file.append( - etree.Element( - utils.PREFIX_NS["mets"] + "transformFile", - TRANSFORMORDER=str(transform_order), - TRANSFORMTYPE="decompression", - TRANSFORMALGORITHM="bzip2", - ) + aip.transform_files.append( + { + "algorithm": "bzip2", + "order": str(transform_order), + "type": "decompression", + } ) transform_order += 1 - transform_file.append( - etree.Element( - utils.PREFIX_NS["mets"] + "transformFile", - TRANSFORMORDER=str(transform_order), - TRANSFORMTYPE="decompression", - TRANSFORMALGORITHM="tar", - ) + aip.transform_files.append( + { + "algorithm": "tar", + "order": str(transform_order), + "type": "decompression", + } ) version = subprocess.check_output(["tar", "--version"]).splitlines()[0] - format_info = { - "name": "BZIP2 Compressed Archive", - "registry_name": "PRONOM", - "registry_key": "x-fmt/268", - "program_name": "tar", - "program_version": version, - } + extension = ".bz2" + program_name = "tar" + else: + raise ValueError("Unknown compression algorithm") - # Set new format info - fmt = root.find(".//premis:format", namespaces=utils.NSMAP) - fmt.clear() - fd = etree.SubElement(fmt, utils.PREFIX_NS["premis"] + "formatDesignation") - etree.SubElement( - fd, utils.PREFIX_NS["premis"] + "formatName" - ).text = format_info.get("name") - etree.SubElement( - fd, utils.PREFIX_NS["premis"] + "formatVersion" - ).text = format_info.get("version") - fr = etree.SubElement(fmt, utils.PREFIX_NS["premis"] + "formatRegistry") - etree.SubElement( - fr, utils.PREFIX_NS["premis"] + "formatRegistryName" - ).text = format_info.get("registry_name") - etree.SubElement( - fr, utils.PREFIX_NS["premis"] + "formatRegistryKey" - ).text = format_info.get("registry_key") - - # Creating application info - now = utils.mets_file_now() - app = root.find(".//premis:creatingApplication", namespaces=utils.NSMAP) - app.clear() - etree.SubElement( - app, utils.PREFIX_NS["premis"] + "creatingApplicationName" - ).text = format_info.get("program_name") - etree.SubElement( - app, utils.PREFIX_NS["premis"] + "creatingApplicationVersion" - ).text = format_info.get("program_version") - etree.SubElement( - app, utils.PREFIX_NS["premis"] + "dateCreatedByApplication" - ).text = str(now) - - # Remove existing decompression transformFiles - to_delete = file_.findall( - './/mets:transformFile[@TRANSFORMTYPE="decompression"]', - namespaces=utils.NSMAP, - ) - for elem in to_delete: - file_.remove(elem) - # Add new ones - for elem in transform_file: - file_.append(elem) - - # Update compositionLevel - root.find(".//premis:compositionLevel", namespaces=utils.NSMAP).text = str( - len(file_.findall("mets:transformFile", namespaces=utils.NSMAP)) + reingest_premis_obj = premis.create_aip_premis_object( + premis_obj.identifier_value, + str(os.path.getsize(path)), + extension, + premis_obj.message_digest_algorithm, + checksum.hexdigest(), + program_name, + version, + composition_level=len(aip.transform_files), ) + new_techmd = aip.add_premis_object(reingest_premis_obj) + + # Mark the old techMD as superseded + current_techmd = None + for subsection in aip.amdsecs[0].subsections: + if subsection.subsection == "techMD" and subsection.status == "current": + current_techmd = subsection + break + + if current_techmd is None: + current_techmd = aip.amdsecs[0].subsections[0] + + current_techmd.replace_with(new_techmd) + # Write out pointer file again with open(self.full_pointer_file_path, "w") as f: - f.write( - etree.tostring( - root, pretty_print=True, xml_declaration=True, encoding="utf-8" - ) - ) + f.write(mets.tostring()) # SWORD-related methods def has_been_submitted_for_processing(self): @@ -3292,40 +3021,6 @@ def _get_checksum_report( return {"success": success, "message": message} -def _get_replication_derivation_relationship( - related_aip_uuid, replication_event_uuid, premis_version=None -): - """Return a PREMIS relationship of type derivation relating an implicit - PREMIS object (an AIP) to some to related AIP (with UUID - ``related_aip_uuid``) via a replication event with UUID - ``replication_event_uuid``. Note the complication wherein PREMIS v. 2.2 - uses 'Identification' where PREMIS v. 3.0 uses 'Identifier'. - """ - if not premis_version: - premis_version = premisrw.PREMIS_META["version"] - related_object_identifier = {"2.2": "related_object_identification"}.get( - premis_version, "related_object_identifier" - ) - related_event_identifier = {"2.2": "related_event_identification"}.get( - premis_version, "related_event_identifier" - ) - return ( - "relationship", - ("relationship_type", "derivation"), - ("relationship_sub_type", ""), - ( - related_object_identifier, - ("related_object_identifier_type", "UUID"), - ("related_object_identifier_value", related_aip_uuid), - ), - ( - related_event_identifier, - ("related_event_identifier_type", "UUID"), - ("related_event_identifier_value", replication_event_uuid), - ), - ) - - def write_pointer_file(pointer_file, pointer_file_path): """Write the pointer file to disk. creating intermediate directories as necessary. diff --git a/storage_service/locations/tests/test_gpg.py b/storage_service/locations/tests/test_gpg.py index c57fd1a95..eb493803e 100644 --- a/storage_service/locations/tests/test_gpg.py +++ b/storage_service/locations/tests/test_gpg.py @@ -7,13 +7,12 @@ import shutil import subprocess import tarfile -import unicodedata from django.test import TestCase from metsrw.plugins import premisrw import pytest -from common import utils, gpgutils +from common import gpgutils from locations.models import gpg, Package, space @@ -208,6 +207,7 @@ def test_move_to_storage_service( def test_move_from_storage_service( mocker, src_path, dst_path, package, encrypt_ret, expect ): + mocker.patch("locations.models.gpg._get_gpg_version", return_value=GPG_VERSION) orig_pkg_key = package and package.encryption_key_fingerprint if isinstance(encrypt_ret, Exception): mocker.patch.object(gpg, "_gpg_encrypt", side_effect=encrypt_ret) @@ -217,7 +217,10 @@ def test_move_from_storage_service( mocker.patch.object(gpg_space.space, "create_local_directory") mocker.patch.object(gpg_space.space, "move_rsync") encryption_event = 42 - mocker.patch.object(gpg, "create_encryption_event", return_value=encryption_event) + mocker.patch( + "locations.models.gpg.premis.create_encryption_event", + return_value=encryption_event, + ) if expect == "success": ret = gpg_space.move_from_storage_service(src_path, dst_path, package=package) if package.should_have_pointer_file(): @@ -364,22 +367,6 @@ def mock_isfile(path): assert gpg._get_encrypted_path("/a/b") is None -@pytest.mark.parametrize( - "inp, outp", - [ - ("abc", "abc"), - (u"change\u0301", u"change\u0301"), - (u"change\u0301".encode("utf8"), u"change\u0301"), - ( - unicodedata.normalize("NFC", u"change\u0301").encode("latin1"), - u"chang\uFFFD", - ), - ], -) -def test_escape(inp, outp): - assert outp == gpg.escape(inp) - - @pytest.mark.parametrize( "path, isfile, will_create_decrypt_file, decrypt_ret, expected", [ @@ -455,32 +442,6 @@ def test__parse_gpg_version(): assert GPG_VERSION == gpg._parse_gpg_version(RAW_GPG_VERSION) -def test_create_encryption_event(mocker): - mocker.patch.object(gpg, "_get_gpg_version", return_value=GPG_VERSION) - mocker.patch.object(utils, "get_ss_premis_agents", return_value=TEST_AGENTS) - stderr = 'me contain " quote' - encr_result = FakeGPGRet(ok=True, status=SUCCESS_STATUS, stderr=stderr) - event = gpg.create_encryption_event(encr_result, SOME_FINGERPRINT).data - assert event[0] == "event" - event = event[2:] - assert [x for x in event if x[0] == "event_type"][0][1] == "encryption" - assert [x for x in event if x[0] == "event_detail"][0][1] == ( - "program=GPG; version={}; key={}".format(GPG_VERSION, SOME_FINGERPRINT) - ) - eoi = [x for x in event if x[0] == "event_outcome_information"][0] - assert [x for x in eoi if x[0] == "event_outcome"][0][1] == "success" - assert [x for x in eoi if x[0] == "event_outcome_detail"][0][1][1] == ( - 'Status="{}"; Standard Error="{}"'.format( - SUCCESS_STATUS, stderr.replace('"', r"\"") - ) - ) - lai = [x for x in event if x[0] == "linking_agent_identifier"][0] - assert [x for x in lai if x[0] == "linking_agent_identifier_value"][0][1] == ( - "Archivematica-Storage-Service-{}".format(SS_VERSION) - ) - utils.get_ss_premis_agents.assert_called_once() - - @pytest.mark.parametrize( "path, will_be_dir, sp_raises, expected", [