Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

page to text: rewrite #151

Merged
merged 1 commit into from
Oct 23, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 94 additions & 37 deletions xslt/page__text.xsl
Original file line number Diff line number Diff line change
@@ -1,44 +1,101 @@
<?xml version="1.1" encoding="UTF-8"?>
<!--
Author: Philipp Zumstein
License: MIT
-->
<xsl:stylesheet version="2.0"
xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

<xsl:output method="text" encoding="utf-8" indent="no" />
<xsl:strip-space elements="*"/>

<xsl:template match="/">
<xsl:apply-templates/>
</xsl:template>

<xsl:template match="*:Unicode">
<xsl:value-of select="current()"/>
</xsl:template>

<xsl:template match="node()">
<!-- Try to get the Unicode text as early as possible and then stop descending the tree; otherwise the same text would be emitted multiple times. -->
<xsl:stylesheet
version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15">
<!-- Emit plain text instead of serialized XML. The encoding is stated
     explicitly because the default for the text output method is
     system-dependent; XML-only attributes (standalone,
     omit-xml-declaration) have no effect here and are left out. -->
<xsl:output method="text" encoding="UTF-8"/>
<!-- A single line feed, written as a character reference so that
     reformatting the stylesheet cannot change its value. -->
<xsl:variable name="newline"><xsl:text>&#10;</xsl:text></xsl:variable>
<!-- paragraph break: written after every text region (default: two line feeds) -->
<xsl:param name="pb" select="concat($newline,$newline)"/>
<!-- line break: written between consecutive text lines of a region (default: one line feed) -->
<xsl:param name="lb" select="$newline"/>
<!-- text order (reading-order|document): any value starting with
     'reading-order' follows the explicit ReadingOrder when the page has
     one; every other value uses element (document) order -->
<xsl:param name="order" select="'reading-order'"/>
<!-- hierarchy level to extract the text annotation from
     (region|line|word|glyph|highest); 'highest' takes the topmost level
     that carries a TextEquiv/Unicode -->
<xsl:param name="level" select="'highest'"/>
<!-- Look up text regions by their id attribute. A key is used for the
     IDREFs instead of id(), because the PAGE format is defined by an XSD
     and id() relies on DTD-declared ID attributes. -->
<xsl:key name="textRegion" match="pc:TextRegion" use="@id"/>
<!--
  Page: entry point that writes the text of all text regions.

  When $order starts with 'reading-order' and the page's ReadingOrder
  contains at least one element with a regionRef (or regionRefIndexed)
  attribute, the regions are visited through the named template
  "getrefs". Otherwise every pc:TextRegion is visited in document order.
  In the document-order branch each region is followed by the paragraph
  break $pb.

  Only XPath 1.0 constructs are used: the stylesheet declares
  version="1.0", where the namespace wildcard "*:name" is a syntax error,
  so elements are addressed through the pc: prefix.
-->
<xsl:template match="pc:PcGts/pc:Page">
  <xsl:variable name="regions" select="//pc:TextRegion"/>
  <xsl:choose>
    <xsl:when test="starts-with($order, 'reading-order') and pc:ReadingOrder//*[@regionRef|@regionRefIndexed]">
      <!-- explicit reading order: start at the top-level group(s) -->
      <xsl:call-template name="getrefs">
        <xsl:with-param name="group" select="pc:ReadingOrder/*"/>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <!-- no usable reading order (or not requested): document order -->
      <xsl:for-each select="$regions">
        <xsl:call-template name="getlines">
          <xsl:with-param name="region" select="."/>
        </xsl:call-template>
        <xsl:value-of select="$pb"/>
      </xsl:for-each>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!--
  getlines: write the text of one text region.

  Parameter:
    region  the pc:TextRegion to print.

  Reads the global parameters $level and $lb. At each hierarchy level
  (region, line, word) the first TextEquiv/Unicode of that level is
  written when $level names that level, or when $level is 'highest' and
  the element carries a TextEquiv/Unicode. Otherwise the template
  descends one level. Glyphs are the last level and are always written.

  Consecutive lines are separated by $lb and consecutive words by a
  single space; nothing is written after the last line or word.

  XPath gives "and" higher precedence than "or"; the tests are
  parenthesized to make that grouping explicit.
-->
<xsl:template name="getlines">
  <xsl:param name="region"/>
  <xsl:choose>
    <!-- region level? -->
    <xsl:when test="$level='region' or ($level='highest' and $region/pc:TextEquiv/pc:Unicode)">
      <xsl:value-of select="$region/pc:TextEquiv[1]/pc:Unicode" disable-output-escaping="yes"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:for-each select="$region/pc:TextLine">
        <xsl:if test="position()>1">
          <xsl:value-of select="$lb"/>
        </xsl:if>
        <xsl:choose>
          <!-- line level? -->
          <xsl:when test="$level='line' or ($level='highest' and pc:TextEquiv/pc:Unicode)">
            <xsl:value-of select="pc:TextEquiv[1]/pc:Unicode" disable-output-escaping="yes"/>
          </xsl:when>
          <xsl:otherwise>
            <xsl:for-each select="pc:Word">
              <xsl:if test="position()>1">
                <xsl:text> </xsl:text>
              </xsl:if>
              <xsl:choose>
                <!-- word level? -->
                <xsl:when test="$level='word' or ($level='highest' and pc:TextEquiv/pc:Unicode)">
                  <xsl:value-of select="pc:TextEquiv[1]/pc:Unicode" disable-output-escaping="yes"/>
                </xsl:when>
                <xsl:otherwise>
                  <!-- glyph level: concatenate the glyphs without separator -->
                  <xsl:for-each select="pc:Glyph">
                    <xsl:value-of select="pc:TextEquiv[1]/pc:Unicode" disable-output-escaping="yes"/>
                  </xsl:for-each>
                </xsl:otherwise>
              </xsl:choose>
            </xsl:for-each>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:for-each>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!--
  getrefs: walk the children of a reading-order group.

  Parameter:
    group  the group element(s) whose child elements are visited.

  The children are visited sorted numerically by their index attribute.
  A child whose reference resolves to a text region (through the
  textRegion key) gets that region printed via "getlines", followed by
  the paragraph break $pb. A child whose local name contains 'Group' is
  then walked recursively. The two checks are independent: an element
  may satisfy both, in which case its own region is printed before its
  members.
-->
<xsl:template name="getrefs">
  <xsl:param name="group"/>
  <xsl:for-each select="$group/*">
    <xsl:sort data-type="number" select="@index"/>
    <!-- resolved through the key; id() is not usable for these references -->
    <xsl:variable name="target" select="key('textRegion', @regionRef|@regionRefIndexed)"/>
    <xsl:if test="$target">
      <xsl:call-template name="getlines">
        <xsl:with-param name="region" select="$target"/>
      </xsl:call-template>
      <xsl:value-of select="$pb"/>
    </xsl:if>
    <!-- nested ordered/unordered groups (indexed or not): descend -->
    <xsl:if test="contains(local-name(), 'Group')">
      <xsl:call-template name="getrefs">
        <xsl:with-param name="group" select="."/>
      </xsl:call-template>
    </xsl:if>
  </xsl:for-each>
</xsl:template>
</xsl:stylesheet>
<!-- Override the built-in template rule for text nodes, which would copy
     their content to the output: any text reached by the default
     traversal is dropped, so only text written explicitly by the
     templates of this stylesheet appears in the result. -->
<xsl:template match="text()"/>
</xsl:stylesheet>