From e8455532013278b53f434f3b85905e9c8b59a921 Mon Sep 17 00:00:00 2001 From: Misty De Meo Date: Sat, 11 Mar 2017 16:03:29 +1100 Subject: [PATCH] prepare: handle fully-qualified resource URLs Prior to PRONOM 89, resource URLs were missing the scheme; there is now a mixture of fully-qualified URLs and URLs without schemes. Unconditionally prepending "http://" turned the fully-qualified ones into malformed "http://http://..." URLs, which caused the fetching in prepare to fail. Only prepend the scheme when the URL does not already have one. --- fido/prepare.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fido/prepare.py b/fido/prepare.py index 1ea00914..86d10b37 100644 --- a/fido/prepare.py +++ b/fido/prepare.py @@ -14,6 +14,7 @@ from six.moves import cStringIO from six.moves.urllib.request import urlopen +from six.moves.urllib.parse import urlparse from .pronomutils import get_local_pronom_versions @@ -272,7 +273,11 @@ def parse_pronom_xml(self, source, puid_filter=None): for id in x.findall(TNA('ReferenceFileIdentifier')): type = get_text_tna(id, 'IdentifierType') if type == 'URL': - url = "http://" + get_text_tna(id, 'Identifier') + # Starting with PRONOM 89, some URLs contain http:// + # and others do not. + url = get_text_tna(id, 'Identifier') + if not urlparse(url).scheme: + url = "http://" + url ET.SubElement(rf, 'dc:identifier').text = url # And calculate the checksum of this resource: m = hashlib.md5()