Skip to content

Commit

Permalink
Merge pull request #23 from alix-tz/vpadding
Browse files Browse the repository at this point in the history
Ajout de l'option vpadding
  • Loading branch information
alix-tz authored Mar 22, 2021
2 parents a41c521 + cb032bc commit 0df4c05
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 56 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[![MIT License](https://img.shields.io/static/v1?style=plastic&label=license&message=MIT&color=brightgreen)](LICENSE) [![Version](https://img.shields.io/static/v1?style=plastic&label=version&message=0.3&color=blue)]()
[![MIT License](https://img.shields.io/static/v1?style=plastic&label=license&message=MIT&color=brightgreen)](LICENSE) [![Version](https://img.shields.io/static/v1?style=plastic&label=version&message=0.3.2&color=blue)]()

# ASPYRE GT

Expand Down Expand Up @@ -34,8 +34,13 @@ Process essential information to run Aspyre
:param source: path to source file (string)
[opt] :param destination: path to output (string)
[opt] :param talkative: activate a few print commands (bool)
[opt] :param vpadding: value to add to VPOS attr. in String nodes (int)
```

> supported values for `scenario`: "tkb", "pdfalto"
>
> `vpadding` is only used in the PDFALTO scenario
##### Transkribus to eScriptorium scenario with `aspyre.TkbToEs()`
:warning: This is really not the best way to [transfer data between these two pieces of software](https://lectaurep.hypotheses.org/documentation/de-transkribus-a-escriptorium).

Expand Down
76 changes: 43 additions & 33 deletions aspyre/aspyrelib/aspyre.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def proceed(self):
"""return True if self.execution_status is 'Running'"""
return self.execution_status == "Running"

def __init__(self, scenario=None, source=None, destination=None, talkative=False, test_type=False):
def __init__(self, scenario=None, source=None, destination=None, talkative=False, test_type=False, vpadding=0):
"""Process essentiel information to run Aspyre
:param scenario: keyword describing the scenario
Expand All @@ -41,6 +41,8 @@ def __init__(self, scenario=None, source=None, destination=None, talkative=False
:type talkative: bool
:param test_type: simple initiation for test purpose
:type test_type: bool
:param vpadding: value to add to VPOS attributes in String nodes (PDFALTO scenario)
:type vpadding: int
"""
if test_type == True:
self.execution_status = "Debug"
Expand All @@ -51,7 +53,7 @@ def __init__(self, scenario=None, source=None, destination=None, talkative=False
# parsing talkative
self.talkative = talkative
if self.talkative:
utils.report("Talkative mode activated.", "H")
utils.report("Talkative mode activated.\n---", "H")

# parsing source
self.source = source
Expand All @@ -74,7 +76,7 @@ def __init__(self, scenario=None, source=None, destination=None, talkative=False
self.add_log(f"Output destination is now {self.destination}")
if talkative:
utils.report(f"'{destination}' is not a valid path!", "W")
utils.report(f"Output destination is now: {self.destination}.", "W")
utils.report(f"Output destination is now: {self.destination}.\n---", "W")
else:
self.destination = None

Expand All @@ -89,13 +91,33 @@ def __init__(self, scenario=None, source=None, destination=None, talkative=False
else:
self.scenario = None

if self.proceed():
    # Parsing vpadding: the option only applies to the PDFALTO scenario,
    # so any other scenario forces the value back to 0.
    if self.scenario != 'pdfalto':
        self.vpadding = 0
    else:
        self.vpadding = vpadding

    # `padding` flags whether a non-zero vertical offset has to be applied.
    self.padding = self.vpadding != 0

    # Only the PDFALTO scenario reports on padding. The second branch used
    # to compare against the misspelled 'pdflato' and therefore never ran.
    if self.talkative and self.scenario == 'pdfalto':
        if self.padding:
            utils.report(f'Will add padding to y-axis coords in string nodes: {self.vpadding}\n---',
                         "H")
        else:
            utils.report("No modification made to y-axis coords in string nodes\n---", "H")


class TkbToEs():
def show_warning(self):
"""Display a message."""
utils.report("Transferring data from Transkribus to eScriptorium using ALTO files and Aspyre", "W")
utils.report("===/!\===\nTransferring data from Transkribus to eScriptorium using ALTO files and Aspyre", "W")
utils.report("is not recommended: Trankribus' ALTO is too poor to guarantee the validity of the", "W")
utils.report("resulting segments. Instead, use PAGE XML directly!", "W")
utils.report("resulting segments. Instead, use PAGE XML directly!\n===/!\===", "W")

def __init__(self, args):
"""Handle a Transkribus to eScriptorium transformation scenario
Expand All @@ -112,17 +134,17 @@ def __init__(self, args):
self.unzipped_source = None
if self.args.source.split(".")[-1] in ARCHIVE_EXTENSIONS:
if self.args.talkative:
utils.report("Source is an archive, running unzipping scenario.", "H")
utils.report("Source is an archive, running unzipping scenario.\n---", "H")
self.unzipped_source = zip.unzip_scenario(self.args.source, self.args.scenario)
if self.unzipped_source is False:
self.args.execution_status = "Failed"
self.add_log("Something went wrong while unpacking the source.")
utils.report("Failing at unpacking the archive, Apsyre can't proceed.", "E")
utils.report("Failing at unpacking the archive, Apsyre can't proceed.\n---", "E")
else:
self.args.add_log("Successfully unzipped source.")
else:
if self.args.talkative:
utils.report("Source is not an archive.", "H")
utils.report("Source is not an archive.\n---", "H")

if self.args.proceed():
# 2. collecting data
Expand All @@ -136,13 +158,13 @@ def __init__(self, args):
self.args.execution_status = 'Failed'
if self.args.talkative:
utils.report("Aspyre can't pair unreferenced images with the ALTO XML files", "E")
utils.report("Interrupting execution", "E")
utils.report("Interrupting execution!", "E")
elif self.alto_files is False:
self.args.add_log("Couldn't find any ALTO XML file.")
utils.report("Aspyre can't run Transkribus scenario without ALTO XML files.", "E")
utils.report("Aspyre can't run Transkribus scenario without ALTO XML files.\n---", "E")
self.args.execution_status = "Failed"
else:
self.args.add_log("Successfully collected data.")
self.args.add_log("Successfully collected data.\n---")

if self.args.proceed():
# 3. transforming files
Expand All @@ -156,7 +178,7 @@ def __init__(self, args):
manage_tkbtoes.handle_a_file(file, self)
except Exception as e:
if self.args.talkative:
utils.report(f"Error while processing {file} :", "E")
utils.report(f"===[!]===\nError while processing {file} :", "E")
print(e)
self.args.add_log(f"Failed to process {file}.")
else:
Expand All @@ -182,7 +204,7 @@ def __init__(self, args):
self.args.add_log('Aspyre ran Transkribus scenario successufully!')
else:
self.args = None
utils.report("Failed to run TkbToEs: args must be an AspyreArgs object!", "E")
utils.report("===[!]===\nFailed to run TkbToEs: args must be an AspyreArgs object!", "E")


class PdfaltoToEs():
Expand Down Expand Up @@ -216,39 +238,25 @@ def __init__(self, args):
self.unzipped_source = None
if self.args.source.split(".")[-1] in ARCHIVE_EXTENSIONS:
if self.args.talkative:
utils.report("Source is an archive, running unzipping scenario.", "H")
utils.report("Source is an archive, running unzipping scenario.\n---", "H")
self.unzipped_source = zip.unzip_scenario(self.args.source, self.args.scenario)
if self.unzipped_source is False:
self.args.execution_status = "Failed"
self.add_log("Something went wrong while unpacking the source.")
utils.report("Failing at unpacking the archive, Apsyre can't proceed.", "E")
utils.report("Failing at unpacking the archive, Apsyre can't proceed.\n---", "E")
else:
self.args.add_log("Successfully unzipped source.")
self.args.add_log("Successfully unzipped source.\n---")
else:
if self.args.talkative:
utils.report("Source is not an archive.", "H")
utils.report("Source is not an archive.\n---", "H")

if self.args.proceed():
# 2. collecting data
package = utils.list_directory(self.unzipped_source)

# TODO gérer la partie interaction avec les images

# self.image_files = manage_tkbtoes.extract_mets(package, self.unzipped_source)
self.alto_files, self.image_files = manage_pdfaltotoes.locate_alto_and_image_files(package)
"""
if len(self.image_files) == 0:
self.add_log("There is no reference to images in the METS XML file you provided.")
self.add_log("Make sure to check the \"Export Image\" option in Transkribus.")
self.args.execution_status = 'Failed'
if self.args.talkative:
utils.report("Aspyre can't pair unreferenced images with the ALTO XML files", "E")
utils.report("Interrupting execution", "E")
elif self.alto_files is False:
"""
if self.alto_files is False:
self.args.add_log("Couldn't find any XML file or any image file.")
utils.report("Aspyre can't run without either of these.", "E")
utils.report("Aspyre can't run without either of these.\n---", "E")
self.args.execution_status = "Failed"
else:
self.args.add_log("Successfully collected data.")
Expand Down Expand Up @@ -287,9 +295,11 @@ def __init__(self, args):
self.args.execution_status = "Failed"
self.args.add_log('Failed to zip output.')
else:
utils.report("Task completed ✓", "S")
self.args.execution_status = 'Finished'
self.args.add_log('Aspyre ran PDFALTO scenario successufully!')

else:
self.args = None
utils.report("Failed to run PdfaltoToEs: args must be an AspyreArgs object!", "E")
utils.report("Failed to run PdfaltoToEs: args must be an AspyreArgs object!\n===[!]===", "E")

61 changes: 50 additions & 11 deletions aspyre/aspyrelib/manage/manage_pdfaltotoes.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def clean_filename(xml_tree):
# we only keep the ideal image filename
filename_elem = xml_tree.find_all("fileName")[0]
filename_elem.string = filename_elem.string.split("||")[-1]
return xml_tree


## COORDINATES/RATIO RESOLUTION
Expand Down Expand Up @@ -216,6 +217,28 @@ def apply_ratio_to_coordinates(xml_tree):
return xml_tree


def apply_padding(xml_tree, vpadding):
    """Shift the VPOS attribute of every String node by ``vpadding``.

    Only ``String`` nodes are modified; ``TextLine`` and ``SP`` nodes are
    left untouched. Nodes without a ``VPOS`` attribute are skipped.

    :param xml_tree: parsed XML file
    :type xml_tree: BeautifulSoup
    :param vpadding: value to add to VPOS attributes
    :type vpadding: int
    :return: the same tree, modified in place
    :rtype: BeautifulSoup
    :raises ValueError: if a VPOS attribute cannot be read as a number
    """
    for string_node in xml_tree.find_all('String'):
        if 'VPOS' in string_node.attrs:
            # The attribute may still be a string (as read from the parsed
            # document) or already a number; go through float() so that both
            # cases work instead of failing on `str + int`.
            shifted = float(string_node.attrs['VPOS']) + vpadding
            # Stored truncated to an integer, as before.
            string_node.attrs['VPOS'] = int(shifted)
    return xml_tree



## COLLECTING INFORMATION && I/O
def locate_alto_and_image_files(package):
Expand All @@ -242,13 +265,14 @@ def locate_alto_and_image_files(package):
# if debug: see what it is ignored...
pass
if len(alto_files) == 0:
utils.report("Found no eligible XML file.")
utils.report("Found no eligible XML file.\n---")
return False, False
if len(image_files) == 0:
utils.report("Found no eligible image file.")
utils.report("Found no eligible image file.\n---")
return False, False
if len(image_files) != len(alto_files):
utils.report(f"Didn't find as many image ({len(image_files)}) as xml files ({len(alto_files)}).", "W")
utils.report(f"Didn't find as many images ({len(image_files)}) as xml files ({len(alto_files)}).", "W")
utils.report(f"It's not necessarily an issue.\n---", "W")
return alto_files, image_files


Expand Down Expand Up @@ -285,38 +309,53 @@ def handle_a_file(file, pdfalto_to_es_obj):
:type dest: str
:return: None
"""
if pdfalto_to_es_obj.args.padding:
length = 8
else:
length = 7

xml_tree = utils.read_file(file, 'xml')
pbar = tqdm(total=7, desc="Processing...", unit=" step")
pbar = tqdm(total=length, desc="Processing...", unit=" step")
pbar.update(1) # getting schema version
schemas = get_schema_spec(xml_tree)

if schemas:
if pdfalto_to_es_obj.args.talkative:
utils.report(f"Schema Specs: {schemas}", "H")
utils.report(f"Found the following schema specs declaration(s): {schemas}\n---", "H")
pbar.update(1) # controlling schema version
alto_version = control_schema_version(schemas)
if pdfalto_to_es_obj.args.talkative:
if alto_version:
utils.report(f"Detected ALTO version: v{alto_version}", "H")
utils.report(f"Detected ALTO version: v{alto_version}\n---", "H")

if alto_version == 3 or alto_version == 4: # even if the schema spec is ALTO 4, there may be other issues...
# and we still need to switch to SCRIPTA ALTO specs anyways...
if pdfalto_to_es_obj.args.talkative:
utils.report("Buckle up, we're fixing the schema declaration!", "H")
utils.report("Buckle up, we're fixing the schema declaration!\n---", "H")
pbar.update(1) # changing schema declaration to ALTO 4 (SCRIPTA flavored)
switch_to_v4(xml_tree)

if pdfalto_to_es_obj.args.talkative:
utils.report("I'm adding a <sourceImageInformation> element to point toward the image file", "H")
utils.report("I'm adding a <sourceImageInformation> element to point towards the image file\n---", "H")
pbar.update(1) # adding file name in source image information
add_sourceimageinformation(xml_tree, file, pdfalto_to_es_obj.image_files)
# modifier les coordonnées
if pdfalto_to_es_obj.args.talkative:
utils.report("Fixing the ratio (coordinates)", "H")
utils.report("Fixing the ratio (coordinates)\n---", "H")
pbar.update(1) # fixing baseline declarations
xml_tree = apply_ratio_to_coordinates(xml_tree)

if pdfalto_to_es_obj.args.padding:
if pdfalto_to_es_obj.args.talkative:
utils.report("Adjusting y-axis coords in textline and strings nodes\n---", "H")
pbar.update(1)
xml_tree = apply_padding(xml_tree, pdfalto_to_es_obj.args.vpadding)

if pdfalto_to_es_obj.args.talkative:
utils.report("Wrapping up", "H")
utils.report("Wrapping up\n---", "H")
pbar.update(1) # fixing baseline declarations
clean_filename(xml_tree)
xml_tree = clean_filename(xml_tree)

# TODO @alix: improve the saving process, obviously!
pbar.update(1) # saving file
save_processed_file(file.split(os.sep)[-1], xml_tree, pdfalto_to_es_obj.args.destination)
Expand Down
2 changes: 1 addition & 1 deletion aspyre/aspyrelib/manage/manage_tkbtoes.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def handle_a_file(file, tkb_to_es_obj):
pbar.update(1) # changing schema declaration to ALTO 4 (SCRIPTA flavored)
switch_to_v4(xml_tree)
if tkb_to_es_obj.args.talkative:
utils.report("I'm adding a <sourceImageInformation> element to point toward the image file", "H")
utils.report("I'm adding a <sourceImageInformation> element to point towards the image file", "H")
pbar.update(1) # adding file name in source image information
add_sourceimageinformation(xml_tree, file, tkb_to_es_obj.image_files)
if tkb_to_es_obj.args.talkative:
Expand Down
Loading

0 comments on commit 0df4c05

Please sign in to comment.