diff --git a/ocrd_page_to_alto/cli.py b/ocrd_page_to_alto/cli.py index ea30800..5e43621 100644 --- a/ocrd_page_to_alto/cli.py +++ b/ocrd_page_to_alto/cli.py @@ -16,7 +16,9 @@ @click.option('--dummy-textline/--no-dummy-textline', default=True, help='Whether to create a TextLine for regions that have TextEquiv/Unicode but no TextLine') @click.option('--dummy-word/--no-dummy-word', default=True, help='Whether to create a Word for TextLine that have TextEquiv/Unicode but no Word') @click.option('--textequiv-index', default=0, help='If multiple textequiv, use the n-th TextEquiv by @index') -@click.option('--textequiv-fallback-strategy', default='last', type=click.Choice(['raise', 'first', 'last']), help="What to do if nth textequiv isn't available. 'raise' will lead to a runtime error, 'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element") +@click.option('--textequiv-fallback-strategy', default='first', type=click.Choice(['raise', 'first', 'last']), + help="What to do if selected TextEquiv @index is not available: 'raise' will lead to a runtime error, " + "'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element") @click.option('--region-order', default='document', help="Order in which to iterate over the regions", type=click.Choice(['document', 'reading-order', 'reading-order-only'])) @click.option('--textline-order', default='document', help="Order in which to iterate over the textlines", type=click.Choice(['document', 'index', 'textline-order'])) @click.option('-O', '--output-file', default='-', help='Output filename (or "-" for standard output, the default)', diff --git a/ocrd_page_to_alto/utils.py b/ocrd_page_to_alto/utils.py index cb07d46..777e10f 100644 --- a/ocrd_page_to_alto/utils.py +++ b/ocrd_page_to_alto/utils.py @@ -38,16 +38,16 @@ def get_nth_textequiv(reg_page, textequiv_index, textequiv_fallback_strategy): if textequiv_fallback_strategy == 'raise': raise ValueError("PAGE element '%s' has no TextEquivs and fallback strategy is to raise" % reg_page.id) return '' - if len(textequivs) < textequiv_index + 1: - if textequiv_fallback_strategy == 'raise': - raise ValueError("PAGE element '%s' has only %d TextEquiv elements so cannot choose the %s%s and fallback strategy is to raise" % ( - reg_page.id, len(textequivs), textequiv_index + 1, 'st' if textequiv_index == 0 else 'nd')) - elif textequiv_fallback_strategy == 'first': - return textequivs[0].Unicode - else: - return textequivs[-1].Unicode + for textequiv in textequivs: + if textequiv.get_index() == textequiv_index: + return textequiv.Unicode + if textequiv_fallback_strategy == 'raise': + raise ValueError("PAGE element '%s' has no TextEquiv index %d" % ( + reg_page.id, textequiv_index)) + elif textequiv_fallback_strategy == 'first': + return textequivs[0].Unicode else: - return textequivs[textequiv_index].Unicode + return textequivs[-1].Unicode def contains(el, bbox): minx1, miny1, maxx1, maxy1 = bbox