-
Notifications
You must be signed in to change notification settings - Fork 2
/
get_sample_and_template_from_raw.xslt
115 lines (105 loc) · 5.54 KB
/
get_sample_and_template_from_raw.xslt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
exclude-result-prefixes="#all"
version="2.0">
<!-- get_sample_and_template_from_raw.xslt -->
<!-- Read in a TEI P5 file (or any other XML file, really) that has both content -->
<!-- and comments; write out 2 extractions of the input file, 1 with everything -->
<!-- except the comments, the other with everything except the content. Write -->
<!-- those out to the particular directories we use in TAPAS. -->
<!-- Written 2017-03-16 by Syd Bauman -->
<!-- Updated 2017-06-08 by Syd Bauman: keep only the first N of any given sequence of a particular -->
<!-- element type in output template. N is specified in a processing instruction like: -->
<!-- <?tapas keepFirst=3 ?> -->
<!-- A value of '0' (the default) means keep 'em all. Output sample is not affected. This -->
<!-- allows us to generate samples with a dozen sample entries of some sort, but a template -->
<!-- that has only 2 or 3. (See e-mail _Re: TAPAS sample & template feature_ of 201-02-16 [sic; the year looks truncated, presumably 2017].) -->
<!-- Unnamed output definition. The principal result is the plain-text run report -->
<!-- produced by the match="/" template. The two xsl:result-document instructions -->
<!-- in that template name no format of their own and set only method="xml", so -->
<!-- per XSLT 2.0 they fall back on this definition for everything else; that is -->
<!-- how indent="yes" reaches the two XML files. -->
<xsl:output indent="yes"
            method="text"/>

<!-- Caller-settable boolean switch; not referenced anywhere in the visible stylesheet. -->
<xsl:param name="debug"
           as="xs:boolean"
           select="false()"/>

<!-- URI of the document being transformed -->
<xsl:variable name="inpath"
              select="document-uri(/)"/>

<!-- Output locations: the input URI with every 'raw_files/' replaced by the -->
<!-- sample or template directory name -->
<xsl:variable name="sample"
              select="replace( $inpath,'raw_files/','sample_files/')"/>
<xsl:variable name="template"
              select="replace( $inpath,'raw_files/','template_files/')"/>
<!-- keepFirst: how many same-named sibling elements to keep in the template output; -->
<!-- 0 (the default) means keep them all. Unless the caller supplies a value, it is -->
<!-- read from a top-level processing instruction such as <?tapas keepFirst=3 ?>. -->
<!-- A single character other than a digit (e.g. a quotation mark) is tolerated -->
<!-- between the '=' and the number. -->
<xsl:param name="keepFirst" as="xs:integer">
  <!-- Consider only tapas PIs that carry a well-formed setting, and only the first of -->
  <!-- those, so that several top-level tapas PIs can break neither the test below nor -->
  <!-- the conversion to xs:integer. -->
  <xsl:variable name="keepFirstPI" as="processing-instruction()?"
    select="( /processing-instruction('tapas')[ matches( normalize-space(.), 'keepFirst=[^0-9]?[0-9]+') ] )[1]"/>
  <xsl:choose>
    <xsl:when test="exists( $keepFirstPI )">
      <!-- [^0-9]? rather than .? : the optional separator must never swallow the first -->
      <!-- digit of an unquoted multi-digit value (keepFirst=12 has to give 12, not 2) -->
      <xsl:value-of select="replace( normalize-space( $keepFirstPI ), '.*keepFirst=[^0-9]?([0-9]+).*', '$1')"/>
    </xsl:when>
    <xsl:when test="exists( /processing-instruction('tapas')[ contains( ., 'keepFirst') ] )">
      <!-- keepFirst is mentioned but no number can be read; this used to die with an -->
      <!-- opaque integer-conversion error, so still stop, but say why -->
      <xsl:message terminate="yes">ERROR: a tapas processing instruction mentions keepFirst, but no non-negative integer follows 'keepFirst=' (expected something like keepFirst=3)</xsl:message>
    </xsl:when>
    <xsl:otherwise>0</xsl:otherwise>
  </xsl:choose>
</xsl:param>
<!-- Entry point. Writes a plain-text run report to the principal output, then, -->
<!-- provided the output locations are usable, writes the sample file (children of -->
<!-- the document node processed in mode "sample") and the template file (same -->
<!-- nodes processed in mode "template"). -->
<xsl:template match="/">
  <!-- the report: input URI, both output URIs, the keepFirst setting, and a timestamp -->
  <xsl:variable name="metaInfo">
    <xsl:text>Run information — &#10;input: </xsl:text>
    <xsl:value-of select="$inpath"/>
    <xsl:text>&#10;out 1: </xsl:text>
    <xsl:value-of select="$sample"/>
    <xsl:text>&#10;out 2: </xsl:text>
    <xsl:value-of select="$template"/>
    <xsl:text>&#10;keeping </xsl:text>
    <xsl:value-of select="if ($keepFirst eq 0) then 'all' else concat('first ', $keepFirst )"/>
    <xsl:text> of each sequence of siblings of same type for the template&#10;timestamp: </xsl:text>
    <xsl:value-of select="current-dateTime()"/>
    <xsl:text>&#10;</xsl:text>
  </xsl:variable>
  <xsl:value-of select="$metaInfo"/>
  <xsl:choose>
    <!-- document-uri() returns nothing for an input that has no URI; replace() then -->
    <!-- gives '' for both output locations, and the general comparison in the next -->
    <!-- branch is false for an empty $inpath, so this case must be caught on its own -->
    <xsl:when test="string( $inpath ) eq ''">
      <xsl:message terminate="yes">ERROR: the URI of the input document is not known, so the output locations cannot be derived from it</xsl:message>
    </xsl:when>
    <!-- replace() changed nothing for at least one output, i.e. 'raw_files/' does not -->
    <!-- occur in the input URI: writing would target the input file itself -->
    <xsl:when test="$inpath = ( $sample, $template )">
      <xsl:message terminate="yes">ERROR: input and one of the outputs are the same file (is input in raw_files/ directory?)</xsl:message>
    </xsl:when>
    <xsl:otherwise>
      <xsl:result-document href="{$sample}" method="xml">
        <xsl:apply-templates select="node()" mode="sample"/>
      </xsl:result-document>
      <xsl:result-document href="{$template}" method="xml">
        <xsl:apply-templates select="node()" mode="template"/>
      </xsl:result-document>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!-- Fallback rule for every mode: shallow-copy the node, then process its -->
<!-- attributes and children in whichever mode is current. -->
<xsl:template mode="#all" match="node()">
  <!-- a node that has no element ancestor is preceded by a newline in the output -->
  <xsl:if test="empty( ancestor::* )">
    <xsl:text>&#10;</xsl:text>
  </xsl:if>
  <xsl:copy>
    <xsl:apply-templates mode="#current" select="@* | node()"/>
  </xsl:copy>
</xsl:template>
<!-- Attributes are carried over unchanged, whatever the mode. -->
<xsl:template mode="#all" match="@*">
  <xsl:copy-of select="."/>
</xsl:template>
<!-- ***** mode="template" ***** -->
<!-- for templates, drop an element once $keepFirst same-named siblings already precede it -->
<!-- NOTE(review): the count covers ALL earlier siblings of that name, not only an unbroken -->
<!-- run of them, whereas the surrounding comments speak of elements "in a row"; confirm -->
<!-- which is intended. -->
<!-- (When $keepFirst is 0 this test holds for every element; the priority="4" rule for -->
<!-- $keepFirst eq 0 takes precedence in that case.) -->
<xsl:template priority="3" mode="template"
  match="*[ $keepFirst le count( preceding-sibling::*[ name() eq name( current() ) ] ) ]">
  <!-- deliberately empty: the element and everything inside it are left out -->
</xsl:template>
<!-- and ignore whitespace-only text whose nearest preceding element sibling is one of -->
<!-- the elements dropped from the template; all other whitespace-only text is kept -->
<xsl:template match="text()[normalize-space(.) eq '']" mode="template" priority="3">
  <!-- nearest preceding element sibling (empty when there is none) -->
  <xsl:variable name="elementIfollow" select="preceding-sibling::*[1]"/>
  <xsl:choose>
    <!-- The "$keepFirst gt 0" guard matters: with $keepFirst at 0 no element is dropped, -->
    <!-- yet "count(...) ge 0" is always true, which used to discard every whitespace-only -->
    <!-- text node in the document. -->
    <xsl:when test="$keepFirst gt 0
                    and count( $elementIfollow/preceding-sibling::*[ name(.) eq name( $elementIfollow )] ) ge $keepFirst"/>
    <xsl:otherwise>
      <xsl:value-of select="."/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!-- $keepFirst of 0 means nothing is trimmed: this rule outranks the priority="3" -->
<!-- element rule, so every element is shallow-copied and its attributes and children -->
<!-- are processed in the current mode. -->
<xsl:template priority="4" mode="template" match="*[ 0 eq $keepFirst ]">
  <xsl:copy>
    <xsl:apply-templates mode="#current" select="attribute::* | child::node()"/>
  </xsl:copy>
</xsl:template>
<!-- templates carry no content: discard every text node that holds more than whitespace -->
<xsl:template priority="3" mode="template" match="text()[ normalize-space(.) ne '' ]">
  <!-- deliberately empty -->
</xsl:template>
<!-- ***** mode="sample" ***** -->
<!-- samples carry no comments: drop each comment here; the rule that follows also drops -->
<!-- the whitespace in front of a comment when whitespace comes after that comment too -->
<xsl:template priority="3" mode="sample" match="comment()">
  <!-- deliberately empty -->
</xsl:template>
<!-- In samples, drop a whitespace-only text node when the node right after it is a -->
<!-- comment and the node after that comment is whitespace-only text as well. -->
<xsl:template priority="3" mode="sample"
  match="text()[ normalize-space(.) eq ''
                 and following-sibling::node()[1]/self::comment()
                 and following-sibling::node()[2]/self::text()[ normalize-space(.) eq '' ] ]">
  <!-- deliberately empty -->
</xsl:template>
</xsl:stylesheet>