Skip to content

Commit

Permalink
Merge pull request #328 from kba/loctype
Browse files Browse the repository at this point in the history
set LOCTYPE and OTHERLOCTYPE as necessary, fix #310
  • Loading branch information
kba authored Oct 24, 2019
2 parents 80c1f3a + 7dadcdd commit 949fe4b
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 5 deletions.
45 changes: 44 additions & 1 deletion ocrd_models/ocrd_models/ocrd_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class OcrdFile():
# def create(mimetype, ID, url, local_filename):
# el_fileGrp.SubElement('file')

def __init__(self, el, mimetype=None, instance=None, local_filename=None, mets=None, url=None):
def __init__(self, el, mimetype=None, loctype='OTHER', instance=None, local_filename=None, mets=None, url=None):
"""
Args:
el (LxmlElement):
Expand All @@ -33,6 +33,7 @@ def __init__(self, el, mimetype=None, instance=None, local_filename=None, mets=N
self.local_filename = local_filename
self._instance = instance
self.mets = mets
self.loctype = loctype

if url:
self.url = url
Expand Down Expand Up @@ -117,6 +118,48 @@ def pageId(self, pageId):
self.mets.set_physical_page_for_file(pageId, self)


@property
def loctype(self):
    """
    Get the ``LOCTYPE`` of this file's ``mets:FLocat`` child.

    Returns:
        An empty string when the file element has no ``mets:FLocat``
        child; otherwise whatever ``get('LOCTYPE')`` yields on that child.
    """
    flocat = self._el.find('mets:FLocat', NS)
    if flocat is None:
        return ''
    return flocat.get('LOCTYPE')

@loctype.setter
def loctype(self, loctype):
    """
    Set the ``LOCTYPE`` of this file's ``mets:FLocat`` child.

    Args:
        loctype (str): New location type; it is upper-cased before being
            stored. ``None`` leaves the element untouched.

    The ``mets:FLocat`` child is created if it does not exist yet. The
    ``OTHERLOCTYPE`` is kept in sync: it becomes ``FILE`` when the new
    location type is ``OTHER`` and is cleared for every other value.
    """
    if loctype is None:
        return
    normalized = loctype.upper()
    flocat = self._el.find('mets:FLocat', NS)
    if flocat is None:
        flocat = ET.SubElement(self._el, TAG_METS_FLOCAT)
    flocat.set('LOCTYPE', normalized)
    # Delegate to the otherloctype setter so both attributes stay consistent.
    self.otherloctype = 'FILE' if normalized == 'OTHER' else None

@property
def otherloctype(self):
    """
    Get the ``OTHERLOCTYPE`` of this file's ``mets:FLocat`` child.

    Returns:
        An empty string when the file element has no ``mets:FLocat``
        child; otherwise whatever ``get('OTHERLOCTYPE')`` yields on it.
    """
    flocat = self._el.find('mets:FLocat', NS)
    if flocat is None:
        return ''
    return flocat.get('OTHERLOCTYPE')

@otherloctype.setter
def otherloctype(self, otherloctype):
    """
    Set or clear the ``OTHERLOCTYPE`` of this file's ``mets:FLocat`` child.

    Args:
        otherloctype (str): New value. A falsy value (``None`` or ``''``)
            removes the ``OTHERLOCTYPE`` attribute instead of setting it.

    Setting a value also forces ``LOCTYPE`` to ``OTHER`` and creates the
    ``mets:FLocat`` child if it is missing. Clearing never creates the
    child: an attribute-less ``mets:FLocat`` would lack ``LOCTYPE``.
    """
    el_FLocat = self._el.find('mets:FLocat', NS)
    if not otherloctype:
        # Nothing to clear when there is no FLocat; do not create an empty one.
        if el_FLocat is not None and 'OTHERLOCTYPE' in el_FLocat.attrib:
            del el_FLocat.attrib['OTHERLOCTYPE']
        return
    if el_FLocat is None:
        el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT)
    # OTHERLOCTYPE is only set together with LOCTYPE="OTHER".
    el_FLocat.set('LOCTYPE', 'OTHER')
    el_FLocat.set('OTHERLOCTYPE', otherloctype)

@property
def mimetype(self):
"""
Expand Down
2 changes: 1 addition & 1 deletion ocrd_validators/ocrd_validators/workspace_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def _validate_mets_files(self):
self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID)
if not f.pageId:
self.report.add_error("File '%s' does not manifest any physical page." % f.ID)
if 'url' not in self.skip and ':/' in f.url:
if 'url' not in self.skip and f.url and ':/' in f.url:
if re.match(r'^file:/[^/]', f.url):
self.report.add_warning("File '%s' has an invalid (Java-specific) file URL '%s'" % (f.ID, f.url))
scheme = f.url[0:f.url.index(':')]
Expand Down
19 changes: 17 additions & 2 deletions tests/cli/test_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,13 +185,17 @@ def test_remove_file_group(self):
def test_copy_vs_clone(self):
src_dir = assets.path_to('kant_aufklaerung_1784/data')
with TemporaryDirectory() as tempdir:
# cloned without download
shallowcloneddir = join(tempdir, 'cloned-shallow')
# cloned with download
fullcloneddir = join(tempdir, 'cloned-all')
# copied
copieddir = join(tempdir, 'copied')

Path(fullcloneddir).mkdir()
Path(shallowcloneddir).mkdir()


result = self.runner.invoke(workspace_cli, ['clone', join(src_dir, 'mets.xml'), shallowcloneddir])
self.assertEqual(result.exit_code, 0)

Expand All @@ -200,9 +204,20 @@ def test_copy_vs_clone(self):

with copy_of_directory(src_dir, copieddir):
shallow_vs_copied = dircmp(shallowcloneddir, copieddir)
full_vs_copied = dircmp(fullcloneddir, copieddir)
self.assertEqual(set(shallow_vs_copied.right_only), set(['OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG']))
self.assertEqual(full_vs_copied.diff_files, [])

full_vs_copied = dircmp(fullcloneddir, copieddir)
# print(full_vs_copied)
# from ocrd_utils import pushd_popd
# with pushd_popd(tempdir):
# import os
# os.system("diff %s/mets.xml %s/mets.xml" % (fullcloneddir, copieddir))
# XXX mets.xml will not have the exact same content because
# URLs that are actually files will be marked up as such with
# @LOCTYPE/@OTHERLOCTYPE
# self.assertEqual(full_vs_copied.diff_files, [])
self.assertEqual(full_vs_copied.left_only, [])
self.assertEqual(full_vs_copied.right_only, [])

if __name__ == '__main__':
main()
13 changes: 13 additions & 0 deletions tests/model/test_ocrd_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from lxml import etree as ET
from tests.base import TestCase, main
from ocrd_models import OcrdFile

Expand All @@ -10,6 +11,18 @@ def test_no_pageid_without_mets(self):
with self.assertRaisesRegex(Exception, ".*has no member 'mets' pointing.*"):
f.pageId = 'foo'

def test_loctype(self):
    """Check that ``loctype`` and ``otherloctype`` stay in sync."""
    ocrd_file = OcrdFile(None)

    # A freshly created file reports OTHER/FILE.
    self.assertEqual(ocrd_file.loctype, 'OTHER')
    self.assertEqual(ocrd_file.otherloctype, 'FILE')

    # OTHERLOCTYPE can be overridden on its own.
    ocrd_file.otherloctype = 'foo'
    self.assertEqual(ocrd_file.otherloctype, 'foo')

    # Switching to a LOCTYPE other than OTHER clears OTHERLOCTYPE.
    ocrd_file.loctype = 'URN'
    self.assertEqual(ocrd_file.loctype, 'URN')
    self.assertIsNone(ocrd_file.otherloctype)

    # Setting OTHERLOCTYPE again forces LOCTYPE back to OTHER.
    ocrd_file.otherloctype = 'foo'
    self.assertEqual(ocrd_file.loctype, 'OTHER')

def test_set_url(self):
f = OcrdFile(None)
f.url = None
Expand Down
2 changes: 1 addition & 1 deletion tests/test_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_workspace_add_file_basename_no_content(self):
ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
ws1.add_file('GRP', ID='ID1', mimetype='image/tiff')
f = ws1.mets.find_files()[0]
self.assertEqual(f.url, '')
self.assertEqual(f.url, None)

def test_workspace_add_file_binary_content(self):
with TemporaryDirectory() as tempdir:
Expand Down

0 comments on commit 949fe4b

Please sign in to comment.