Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ajout de l'option vpadding #23

Merged
merged 6 commits into from
Mar 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[![MIT License](https://img.shields.io/static/v1?style=plastic&label=license&message=MIT&color=brightgreen)](LICENSE) [![Version](https://img.shields.io/static/v1?style=plastic&label=version&message=0.3&color=blue)]()
[![MIT License](https://img.shields.io/static/v1?style=plastic&label=license&message=MIT&color=brightgreen)](LICENSE) [![Version](https://img.shields.io/static/v1?style=plastic&label=version&message=0.3.2&color=blue)]()

# ASPYRE GT

Expand Down Expand Up @@ -34,8 +34,13 @@ Process essential information to run Aspyre
:param source: path to source file (string)
[opt] :param destination: path to output (string)
[opt] :param talkative: activate a few print commands (bool)
[opt] :param vpadding: value to add to VPOS attr. in String nodes (int)
```

> supported values for `scenario`: "tkb", "pdfalto"

> `vpadding` is only used in the PDFALTO scenario (`scenario="pdfalto"`); it is ignored otherwise

##### Transkribus to eScriptorium scenario with `aspyre.TkbToEs()`
:warning: really not the best way to [transfer data between these two softwares](https://lectaurep.hypotheses.org/documentation/de-transkribus-a-escriptorium).

Expand Down
76 changes: 43 additions & 33 deletions aspyre/aspyrelib/aspyre.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def proceed(self):
"""return True if self.execution_status is 'Running'"""
return self.execution_status == "Running"

def __init__(self, scenario=None, source=None, destination=None, talkative=False, test_type=False):
def __init__(self, scenario=None, source=None, destination=None, talkative=False, test_type=False, vpadding=0):
"""Process essential information to run Aspyre

:param scenario: keyword describing the scenario
Expand All @@ -41,6 +41,8 @@ def __init__(self, scenario=None, source=None, destination=None, talkative=False
:type talkative: bool
:param test_type: simple initiation for test purpose
:type test_type: bool
:param vpadding: value to add to VPOS attributes in String nodes (PDFALTO scenario)
:type vpadding: int
"""
if test_type == True:
self.execution_status = "Debug"
Expand All @@ -51,7 +53,7 @@ def __init__(self, scenario=None, source=None, destination=None, talkative=False
# parsing talkative
self.talkative = talkative
if self.talkative:
utils.report("Talkative mode activated.", "H")
utils.report("Talkative mode activated.\n---", "H")

# parsing source
self.source = source
Expand All @@ -74,7 +76,7 @@ def __init__(self, scenario=None, source=None, destination=None, talkative=False
self.add_log(f"Output destination is now {self.destination}")
if talkative:
utils.report(f"'{destination}' is not a valid path!", "W")
utils.report(f"Output destination is now: {self.destination}.", "W")
utils.report(f"Output destination is now: {self.destination}.\n---", "W")
else:
self.destination = None

Expand All @@ -89,13 +91,33 @@ def __init__(self, scenario=None, source=None, destination=None, talkative=False
else:
self.scenario = None

if self.proceed():
# parsing vpadding
# vpadding is only meaningful in the PDFALTO scenario: any other scenario forces it to 0
self.vpadding = vpadding if self.scenario == 'pdfalto' else 0

# padding flags whether a non-zero vertical offset has to be applied later on
self.padding = self.vpadding != 0

if self.talkative:
    if self.padding and self.scenario == 'pdfalto':
        utils.report(f'Will add padding to y-axis coords in string nodes: {self.vpadding}\n---',
                     "H")
    # the scenario keyword is 'pdfalto'; the former 'pdflato' spelling made this branch unreachable
    elif self.scenario == 'pdfalto' and not self.padding:
        utils.report("No modification made to y-axis coords in string nodes\n---", "H")


class TkbToEs():
def show_warning(self):
"""Display a message."""
utils.report("Transferring data from Transkribus to eScriptorium using ALTO files and Aspyre", "W")
utils.report("===/!\===\nTransferring data from Transkribus to eScriptorium using ALTO files and Aspyre", "W")
utils.report("is not recommended: Trankribus' ALTO is too poor to guarantee the validity of the", "W")
utils.report("resulting segments. Instead, use PAGE XML directly!", "W")
utils.report("resulting segments. Instead, use PAGE XML directly!\n===/!\===", "W")

def __init__(self, args):
"""Handle a Transkribus to eScriptorium transformation scenario
Expand All @@ -112,17 +134,17 @@ def __init__(self, args):
self.unzipped_source = None
if self.args.source.split(".")[-1] in ARCHIVE_EXTENSIONS:
if self.args.talkative:
utils.report("Source is an archive, running unzipping scenario.", "H")
utils.report("Source is an archive, running unzipping scenario.\n---", "H")
self.unzipped_source = zip.unzip_scenario(self.args.source, self.args.scenario)
if self.unzipped_source is False:
self.args.execution_status = "Failed"
self.add_log("Something went wrong while unpacking the source.")
utils.report("Failing at unpacking the archive, Apsyre can't proceed.", "E")
utils.report("Failing at unpacking the archive, Apsyre can't proceed.\n---", "E")
else:
self.args.add_log("Successfully unzipped source.")
else:
if self.args.talkative:
utils.report("Source is not an archive.", "H")
utils.report("Source is not an archive.\n---", "H")

if self.args.proceed():
# 2. collecting data
Expand All @@ -136,13 +158,13 @@ def __init__(self, args):
self.args.execution_status = 'Failed'
if self.args.talkative:
utils.report("Aspyre can't pair unreferenced images with the ALTO XML files", "E")
utils.report("Interrupting execution", "E")
utils.report("Interrupting execution!", "E")
elif self.alto_files is False:
self.args.add_log("Couldn't find any ALTO XML file.")
utils.report("Aspyre can't run Transkribus scenario without ALTO XML files.", "E")
utils.report("Aspyre can't run Transkribus scenario without ALTO XML files.\n---", "E")
self.args.execution_status = "Failed"
else:
self.args.add_log("Successfully collected data.")
self.args.add_log("Successfully collected data.\n---")

if self.args.proceed():
# 3. transforming files
Expand All @@ -156,7 +178,7 @@ def __init__(self, args):
manage_tkbtoes.handle_a_file(file, self)
except Exception as e:
if self.args.talkative:
utils.report(f"Error while processing {file} :", "E")
utils.report(f"===[!]===\nError while processing {file} :", "E")
print(e)
self.args.add_log(f"Failed to process {file}.")
else:
Expand All @@ -182,7 +204,7 @@ def __init__(self, args):
self.args.add_log('Aspyre ran Transkribus scenario successufully!')
else:
self.args = None
utils.report("Failed to run TkbToEs: args must be an AspyreArgs object!", "E")
utils.report("===[!]===\nFailed to run TkbToEs: args must be an AspyreArgs object!", "E")


class PdfaltoToEs():
Expand Down Expand Up @@ -216,39 +238,25 @@ def __init__(self, args):
self.unzipped_source = None
if self.args.source.split(".")[-1] in ARCHIVE_EXTENSIONS:
if self.args.talkative:
utils.report("Source is an archive, running unzipping scenario.", "H")
utils.report("Source is an archive, running unzipping scenario.\n---", "H")
self.unzipped_source = zip.unzip_scenario(self.args.source, self.args.scenario)
if self.unzipped_source is False:
self.args.execution_status = "Failed"
self.add_log("Something went wrong while unpacking the source.")
utils.report("Failing at unpacking the archive, Apsyre can't proceed.", "E")
utils.report("Failing at unpacking the archive, Apsyre can't proceed.\n---", "E")
else:
self.args.add_log("Successfully unzipped source.")
self.args.add_log("Successfully unzipped source.\n---")
else:
if self.args.talkative:
utils.report("Source is not an archive.", "H")
utils.report("Source is not an archive.\n---", "H")

if self.args.proceed():
# 2. collecting data
package = utils.list_directory(self.unzipped_source)

# TODO gérer la partie interaction avec les images

# self.image_files = manage_tkbtoes.extract_mets(package, self.unzipped_source)
self.alto_files, self.image_files = manage_pdfaltotoes.locate_alto_and_image_files(package)
"""
if len(self.image_files) == 0:
self.add_log("There is no reference to images in the METS XML file you provided.")
self.add_log("Make sure to check the \"Export Image\" option in Transkribus.")
self.args.execution_status = 'Failed'
if self.args.talkative:
utils.report("Aspyre can't pair unreferenced images with the ALTO XML files", "E")
utils.report("Interrupting execution", "E")
elif self.alto_files is False:
"""
if self.alto_files is False:
self.args.add_log("Couldn't find any XML file or any image file.")
utils.report("Aspyre can't run without either of these.", "E")
utils.report("Aspyre can't run without either of these.\n---", "E")
self.args.execution_status = "Failed"
else:
self.args.add_log("Successfully collected data.")
Expand Down Expand Up @@ -287,9 +295,11 @@ def __init__(self, args):
self.args.execution_status = "Failed"
self.args.add_log('Failed to zip output.')
else:
utils.report("Task completed ✓", "S")
self.args.execution_status = 'Finished'
self.args.add_log('Aspyre ran PDFALTO scenario successufully!')

else:
self.args = None
utils.report("Failed to run PdfaltoToEs: args must be an AspyreArgs object!", "E")
utils.report("Failed to run PdfaltoToEs: args must be an AspyreArgs object!\n===[!]===", "E")

61 changes: 50 additions & 11 deletions aspyre/aspyrelib/manage/manage_pdfaltotoes.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def clean_filename(xml_tree):
# we only keep the ideal image filename
filename_elem = xml_tree.find_all("fileName")[0]
filename_elem.string = filename_elem.string.split("||")[-1]
return xml_tree


## COORDINATES/RATIO RESOLUTION
Expand Down Expand Up @@ -216,6 +217,28 @@ def apply_ratio_to_coordinates(xml_tree):
return xml_tree


def apply_padding(xml_tree, vpadding):
    """Shift the VPOS attribute of every String node by a fixed offset.

    Only String nodes are modified; TextLine and SP nodes are deliberately
    left untouched (shifting them was judged unnecessary).

    :param xml_tree: parsed XML file
    :type xml_tree: BeautifulSoup
    :param vpadding: value to add to VPOS attributes
    :type vpadding: int
    :return: the same tree, modified in place
    :rtype: BeautifulSoup
    """
    for st in xml_tree.find_all('String'):
        if 'VPOS' in st.attrs:
            # Attribute values may still be strings (as read from the parsed
            # file): convert to a number before adding, otherwise
            # str + int raises a TypeError. The result is stored as an int.
            st.attrs['VPOS'] = int(float(st.attrs['VPOS']) + vpadding)
    return xml_tree



## COLLECTING INFORMATION && I/O
def locate_alto_and_image_files(package):
Expand All @@ -242,13 +265,14 @@ def locate_alto_and_image_files(package):
# if debug: see what it is ignored...
pass
if len(alto_files) == 0:
utils.report("Found no eligible XML file.")
utils.report("Found no eligible XML file.\n---")
return False, False
if len(image_files) == 0:
utils.report("Found no eligible image file.")
utils.report("Found no eligible image file.\n---")
return False, False
if len(image_files) != len(alto_files):
utils.report(f"Didn't find as many image ({len(image_files)}) as xml files ({len(alto_files)}).", "W")
utils.report(f"Didn't find as many images ({len(image_files)}) as xml files ({len(alto_files)}).", "W")
utils.report(f"It's not necessarily an issue.\n---", "W")
return alto_files, image_files


Expand Down Expand Up @@ -285,38 +309,53 @@ def handle_a_file(file, pdfalto_to_es_obj):
:type dest: str
:return: None
"""
if pdfalto_to_es_obj.args.padding:
length = 8
else:
length = 7

xml_tree = utils.read_file(file, 'xml')
pbar = tqdm(total=7, desc="Processing...", unit=" step")
pbar = tqdm(total=length, desc="Processing...", unit=" step")
pbar.update(1) # getting schema version
schemas = get_schema_spec(xml_tree)

if schemas:
if pdfalto_to_es_obj.args.talkative:
utils.report(f"Schema Specs: {schemas}", "H")
utils.report(f"Found the following schema specs declaration(s): {schemas}\n---", "H")
pbar.update(1) # controlling schema version
alto_version = control_schema_version(schemas)
if pdfalto_to_es_obj.args.talkative:
if alto_version:
utils.report(f"Detected ALTO version: v{alto_version}", "H")
utils.report(f"Detected ALTO version: v{alto_version}\n---", "H")

if alto_version == 3 or alto_version == 4: # even if the schema spec is ALTO 4, there may be other issues...
# and we still need to switch to SCRIPTA ALTO specs anyways...
if pdfalto_to_es_obj.args.talkative:
utils.report("Buckle up, we're fixing the schema declaration!", "H")
utils.report("Buckle up, we're fixing the schema declaration!\n---", "H")
pbar.update(1) # changing schema declaration to ALTO 4 (SCRIPTA flavored)
switch_to_v4(xml_tree)

if pdfalto_to_es_obj.args.talkative:
utils.report("I'm adding a <sourceImageInformation> element to point toward the image file", "H")
utils.report("I'm adding a <sourceImageInformation> element to point towards the image file\n---", "H")
pbar.update(1) # adding file name in source image information
add_sourceimageinformation(xml_tree, file, pdfalto_to_es_obj.image_files)
# modifier les coordonnées
if pdfalto_to_es_obj.args.talkative:
utils.report("Fixing the ratio (coordinates)", "H")
utils.report("Fixing the ratio (coordinates)\n---", "H")
pbar.update(1) # fixing baseline declarations
xml_tree = apply_ratio_to_coordinates(xml_tree)

if pdfalto_to_es_obj.args.padding:
if pdfalto_to_es_obj.args.talkative:
utils.report("Adjusting y-axis coords in textline and strings nodes\n---", "H")
pbar.update(1)
xml_tree = apply_padding(xml_tree, pdfalto_to_es_obj.args.vpadding)

if pdfalto_to_es_obj.args.talkative:
utils.report("Wrapping up", "H")
utils.report("Wrapping up\n---", "H")
pbar.update(1) # fixing baseline declarations
clean_filename(xml_tree)
xml_tree = clean_filename(xml_tree)

# TODO @alix: improve the saving process, obviously!
pbar.update(1) # saving file
save_processed_file(file.split(os.sep)[-1], xml_tree, pdfalto_to_es_obj.args.destination)
Expand Down
2 changes: 1 addition & 1 deletion aspyre/aspyrelib/manage/manage_tkbtoes.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def handle_a_file(file, tkb_to_es_obj):
pbar.update(1) # changing schema declaration to ALTO 4 (SCRIPTA flavored)
switch_to_v4(xml_tree)
if tkb_to_es_obj.args.talkative:
utils.report("I'm adding a <sourceImageInformation> element to point toward the image file", "H")
utils.report("I'm adding a <sourceImageInformation> element to point towards the image file", "H")
pbar.update(1) # adding file name in source image information
add_sourceimageinformation(xml_tree, file, tkb_to_es_obj.image_files)
if tkb_to_es_obj.args.talkative:
Expand Down
Loading