#!/usr/bin/env python
"""meTypeset: text parsing library to convert word documents to the JATS XML format
Usage:
meTypeset.py doc <input> <output_folder> [options]
meTypeset.py docx <input> <output_folder> [options]
meTypeset.py docxextracted <input> <output_folder> [options]
meTypeset.py odt <input> <output_folder> [options]
meTypeset.py other <input> <output_folder> [options]
meTypeset.py tei <input> <output_folder> [options]
meTypeset.py bibscan <input> [options]
Options:
-a, --aggression <aggression_level> Parser aggression level 0-10 [default: 10]
--chain <xslt> Specify a subsequent XSL transform to pass the NLM to
-c, --clean Produce final XML, not intermediate markup with additional metadata
-d, --debug Enable debug output
-i, --identifiers Generate unique identifiers for all supported NLM elements
--includedeleted Keep deleted text (track changes)
--interactive Enable step-by-step interactive mode
-h, --help Show this screen.
-m, --metadata <metadata_file> Metadata file
--nogit Disable git debug filesystem (only of use with --debug)
--noimageprocessing Disable unoconv image processing
--nolink Do not run reference linker
--nometa Do not merge front matter
--purenlm Die after performing NLM XSLT step
--puretei Die after performing TEI XSLT step
--prettytei Indent and format intermediary TEI
-p, --proprietary Enables proprietary math parsing. Requires omml2mml.xsl
-s, --settings <settings_file> Settings file
-v, --version Show version.
-z, --zotero Enable Zotero integration for references.
"""
__author__ = "Martin Paul Eve, Dulip Withnage"
__email__ = "martin@martineve.com"
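
# Example invocation (illustrative only; file names are hypothetical):
#   python meTypeset.py docx article.docx ./output -m metadata.xml --debug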
import os

from docxtotei import *
from teitonlm import *
from sizeclassifier import *
from frontmatterparser import *
from docopt import docopt
from teimanipulate import TeiManipulate
from globals import *
from debug import Debuggable
from bibliographyaddins import BibliographyAddins
from bibliographydatabase import BibliographyDatabase
from bibliographyclassifier import BibliographyClassifier
from listclassifier import ListClassifier
from metadata import Metadata
from referencelinker import ReferenceLinker
from xslchainer import XslChain
from settingsconfiguration import Settings
from idgenerator import IdGenerator
from captionclassifier import CaptionClassifier
from complianceenforcer import ComplianceEnforcer
from interactive import Interactive
from unoconvtodocx import UnoconvToDocx
# check whether lxml is installed
try:
    # noinspection PyUnresolvedReferences
    from lxml import etree
except ImportError:
    print("Failed to import lxml")


class MeTypeset(Debuggable):
    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'Main')

        if self.args['--debug']:
            self.debug.enable_debug(self.args['--nogit'])

        # read settings file
        self.settings_file_path = 'default'
        self.tei_file_path = None

        self.settings_file_path = Settings.setup_settings_file(self.args)
        self.settings = Settings(Settings.get_settings_file(self, self.settings_file_path), self.args)
        self.gv = GV(self.settings, self.debug)

        self.debug.enable_prompt(Interactive(self.gv))
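
    # Note: docopt derives the argument dictionary directly from the module
    # docstring above, so the Usage/Options block is the single source of
    # truth for the command-line interface.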
    @staticmethod
    def read_command_line():
        return docopt(__doc__, version='meTypeset 0.1')
    def set_metadata_file(self):
        metadata_file_arg = self.settings.args['--metadata']
        if metadata_file_arg:
            metadata_file = self.gv.settings.clean_path(self.gv.settings.concat_path(self.settings.script_dir,
                                                                                     metadata_file_arg[0]))
        else:
            metadata_file = \
                self.gv.settings.clean_path(
                    self.gv.settings.concat_path(self.settings.script_dir,
                                                 self.gv.settings.get_setting('default-metadata-file-path',
                                                                              self)))
            self.debug.print_debug(self, u'Metadata file wasn\'t specified. '
                                         'Falling back to {0}'.format(metadata_file))

        return metadata_file
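
    # Pipeline overview (summarising the steps below): the input is converted to
    # docx (via unoconv where needed), then to TEI, then to NLM/JATS, after which
    # classifier, reference-linking and compliance passes refine the output.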
    def run_modules(self):
        ag = int(self.gv.settings.args['--aggression'])
        self.debug.print_debug(self,
                               u'Running at aggression level {0} {1}'.format(ag,
                                                                             "[grrr!]" if ag == 10 else ""))
        if ag > 10:
            self.debug.print_debug(self, "WARNING: safety bail-out features are disabled at aggression level 11")

        if self.args['bibscan']:
            BibliographyDatabase(self.gv).scan()
        else:
            # check for stylesheets
            self.gv.check_file_exists(self.gv.docx_style_sheet_dir)

            # metadata file
            self.gv.metadata_file = self.set_metadata_file()
            self.gv.mk_dir(self.gv.output_folder_path)

            if self.args['doc']:
                # run doc to docx conversion
                # then run docx to tei
                UnoconvToDocx(self.gv).run('doc')
                DocxToTei(self.gv).run(True, self.args['--proprietary'])
            elif self.args['odt']:
                # run odt to docx conversion
                # then run docx to tei
                UnoconvToDocx(self.gv).run('odt')
                DocxToTei(self.gv).run(True, self.args['--proprietary'])
            elif self.args['other']:
                # run other unoconv-supported format to docx conversion
                # then run docx to tei
                UnoconvToDocx(self.gv).run('unoconv')
                DocxToTei(self.gv).run(True, self.args['--proprietary'])
            elif self.args['docx']:
                # run docx to tei conversion
                # includes hooks for proprietary transforms if enabled
                DocxToTei(self.gv).run(True, self.args['--proprietary'])
            elif self.args['docxextracted']:
                self.debug.print_debug(self, u'Skipping docx extraction')
                DocxToTei(self.gv).run(False, self.args['--proprietary'])
            elif self.args['tei']:
                self.debug.print_debug(self, u'Skipping docx extraction; processing TEI file')
                DocxToTei(self.gv).run(False, self.args['--proprietary'], tei=True)

            if self.args['--puretei']:
                self.debug.print_debug(self, u'Exiting as TEI transform complete')
                return
            metadata = Metadata(self.gv)
            metadata.pre_clean()

            # run size classifier
            # aggression 5
            SizeClassifier(self.gv).run()

            # run bibliographic addins handler
            # aggression 4
            found_bibliography = BibliographyAddins(self.gv).run()

            # run list classifier
            # aggression 4
            ListClassifier(self.gv).run()

            bibliography_classifier = BibliographyClassifier(self.gv)

            if not found_bibliography:
                # run bibliographic classifier
                # aggression 4
                bibliography_classifier.run()

            # tei
            # aggression 3
            TeiManipulate(self.gv).run()

            # run tei to nlm conversion
            TeiToNlm(self.gv).run(not found_bibliography)

            if self.gv.settings.args['--purenlm']:
                self.debug.print_debug(self, u'Exiting as NLM transform complete')
                return
            manipulate = NlmManipulate(self.gv)

            if not self.gv.used_list_method:
                manipulate.fuse_references()

            # run reference linker
            if not (self.args['--nolink']):
                rl = ReferenceLinker(self.gv)
                rl.run(self.args['--interactive'])
                rl.cleanup()

            # run table classifier
            cc = CaptionClassifier(self.gv)
            if int(self.args['--aggression']) > int(self.gv.settings.get_setting('tablecaptions',
                                                                                 self, domain='aggression')):
                cc.run_tables()

            if int(self.args['--aggression']) > int(self.gv.settings.get_setting('graphiccaptions',
                                                                                 self, domain='aggression')):
                cc.run_graphics()

            cc.run_ext_link_compliance()

            manipulate.double_p_compliance()

            # run metadata merge
            if not (self.args['--nometa']):
                metadata.run()

            if self.args['--interactive']:
                bibliography_classifier.run_prompt(True)

            # process any bibliography entries that are possible
            BibliographyDatabase(self.gv).run()

            # remove stranded titles and cleanup
            manipulate.final_clean()

            if self.args['--identifiers']:
                IdGenerator(self.gv).run()

            if self.args['--chain']:
                # construct and run an XSLT chainer
                XslChain(self.gv).run()

            if self.args['--clean']:
                ComplianceEnforcer(self.gv).run()
    def run(self):
        self.run_modules()

        if not self.debug:
            os.remove(self.gv.nlm_temp_file_path)

def main():
    me_typeset_instance = MeTypeset()
    me_typeset_instance.run()


if __name__ == '__main__':
    main()