-
Notifications
You must be signed in to change notification settings - Fork 33
/
docxtotei.py
executable file
·186 lines (149 loc) · 7.71 KB
/
docxtotei.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python
import distutils
import os
import errno
import shutil
import zipfile
import subprocess
import re
import globals as gv
__author__ = "Dulip Withanage"
__email__ = "dulip.withanage@gmail.com"
from debug import Debuggable
from teimanipulate import TeiManipulate
from lxml import etree
class DocxToTei(Debuggable):
    """
    Converts a DOCX document to TEI. The class prepares the working folders, optionally unzips the input,
    converts metafile images to png and runs the Saxon transforms that produce the TEI file.
    """

    # XML namespace of the Office math (OMML) elements found in the Word document XML
    OMML_NAMESPACE = 'http://schemas.openxmlformats.org/officeDocument/2006/math'
    # XML namespace given to formulae once they have been renamed to MathML
    MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'
    # handle_wmf refuses to convert anything when the media folder holds more entries than this
    MAX_IMAGES = 30
    # file extensions that handle_wmf passes to unoconv
    METAFILE_EXTENSIONS = ('.wmf', '.emf')

    def __init__(self, gv):
        """
        @param gv: the shared globals object; its paths, settings and debug handler are read by every method
        """
        self.gv = gv
        self.debug = gv.debug
        Debuggable.__init__(self, 'DOCX to TEI')

    def _saxon_command(self, output_path, source_path, stylesheet_path):
        """
        Builds the Saxon invocation shared by both transforms
        @param output_path: the file Saxon writes to (-o)
        @param source_path: the XML document to transform
        @param stylesheet_path: the stylesheet to apply
        @return: a string to run on the command line
        """
        # NOTE: the arguments are joined with plain spaces and run() executes the result with shell=True,
        # so a path containing spaces or shell metacharacters will not survive as a single argument
        cmd = ["java", "-classpath", self.gv.java_class_path,
               "-Dxml.catalog.files=" + self.gv.runtime_catalog_path,
               "net.sf.saxon.Transform",
               "-x", "org.apache.xml.resolver.tools.ResolvingXMLReader",
               "-y", "org.apache.xml.resolver.tools.ResolvingXMLReader",
               "-r", "org.apache.xml.resolver.tools.CatalogResolver",
               "-o", output_path,
               source_path,
               stylesheet_path]
        return ' '.join(cmd)

    def saxon_doc_to_tei(self):
        """
        Creates the appropriate java command to run Saxon for the DOCX to TEI transform
        @return: a string to run on the command line
        """
        return self._saxon_command(self.gv.settings.clean_path(self.gv.tei_file_path),
                                   self.gv.word_document_xml,
                                   self.gv.docx_to_tei_stylesheet)

    def saxon_omml_to_mml(self):
        """
        Creates the appropriate java command to run Saxon for the proprietary math transform. The Word
        document XML is both the input and the output, so it is overwritten in place.
        @return: a string to run on the command line
        """
        return self._saxon_command(self.gv.word_document_xml,
                                   self.gv.word_document_xml,
                                   self.gv.proprietary_style_sheet)

    def handle_wmf(self):
        """
        Calls unoconv to convert wmf and emf images into png format. This method has a hard limit of 30 images.
        @return: False if fails (more than 30 images), True otherwise
        """
        image_filenames = os.listdir(self.gv.output_media_path)

        if len(image_filenames) > self.MAX_IMAGES:
            self.debug.print_debug(self, u'Abandoning image conversion as there are over thirty images (DoS mitigation)')
            return False

        for image in image_filenames:
            # splitext only looks at the final extension, so names such as "pic.wmf.png" are not converted
            image_name, extension = os.path.splitext(image)

            if not image_name or extension not in self.METAFILE_EXTENSIONS:
                continue

            # build the argument list directly so that paths containing spaces stay single arguments
            unoconv_command = [self.gv.settings.get_setting('unoconv', self),
                               '-d', 'graphics',
                               '-f', 'png',
                               '-o', '{0}/{1}.png'.format(self.gv.output_media_path, image_name),
                               '{0}/{1}'.format(self.gv.output_media_path, image)]

            self.debug.print_debug(self, u'Calling: {0}'.format(' '.join(unoconv_command)))
            subprocess.call(unoconv_command)

        return True

    def clean_proprietary(self):
        """
        Tidies the Word document XML after the proprietary math transform: every m:oMath element is renamed
        to a MathML math element, m:oMathPara wrappers are stripped (their content is kept), m:oMathParaPr
        elements are removed and the document is written back over the original file.
        """
        namespaces = {'m': self.OMML_NAMESPACE}

        parser = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
        tree = etree.parse(self.gv.word_document_xml, parser)

        for formula in tree.xpath('//m:oMath', namespaces=namespaces):
            formula.tag = '{' + self.MATHML_NAMESPACE + '}math'

        etree.strip_tags(tree, '{' + self.OMML_NAMESPACE + '}oMathPara')

        for paragraph_properties in tree.xpath('//m:oMathParaPr', namespaces=namespaces):
            paragraph_properties.getparent().remove(paragraph_properties)

        tree.write(self.gv.word_document_xml)

    def _fix_core_properties(self):
        """
        Rewrites docProps/core.xml in place, turning every ampersand that is directly followed by whitespace
        into the &amp; entity (fix for dud LibreOffice conversion)
        """
        core_path = os.path.join(self.gv.docx_temp_folder_path, 'docProps', 'core.xml')

        # the context manager closes the file even if reading or writing fails
        with open(core_path, 'r+') as doc_prop:
            contents = doc_prop.read()
            # raw strings are required here: in a normal string literal '\1' is the control character
            # chr(1) rather than a reference to the captured whitespace
            contents = re.sub(r'&(\s)', r'&amp;\1', contents)
            doc_prop.seek(0)
            doc_prop.write(contents)
            doc_prop.truncate()

    def run(self, extract, run_proprietary, tei=False):
        """
        This method converts from docx to TEI. It creates the necessary output folders, optionally extracts the file and
        runs the Saxon process necessary to conduct the transform
        @param extract: whether or not to extract a docx file. True to extract, False to work on a pre-extracted folder
        @param run_proprietary: whether or not to run proprietary math transforms
        @param tei: only consulted when extract is False. False copies the input as a pre-extracted folder, True
        copies the input file to the TEI output path instead
        """
        # make output folders
        self.gv.mk_dir(self.gv.docx_temp_folder_path)
        self.gv.mk_dir(self.gv.common2_temp_folder_path)
        self.gv.mk_dir(self.gv.tei_folder_path)

        # copy folders
        self.gv.copy_folder(self.gv.common2_lib_path,
                            self.gv.common2_temp_folder_path, False, None)
        self.gv.copy_folder(self.gv.docx_folder_path,
                            self.gv.docx_temp_folder_path, False, None)

        if extract:
            # decompress the docx
            self.debug.print_debug(self, u'Unzipping {0} to {1}'.format(self.gv.input_file_path,
                                                                       self.gv.docx_temp_folder_path))

            with zipfile.ZipFile(self.gv.input_file_path, "r") as z:
                z.extractall(self.gv.docx_temp_folder_path)
        elif not tei:
            self.gv.copy_folder(self.gv.input_file_path, self.gv.docx_temp_folder_path)
        else:
            shutil.copy2(self.gv.input_file_path, self.gv.tei_file_path)

        self.debug.print_debug(self, u'Looking for presence of media directory {0}'.format(self.gv.docx_media_path))

        if os.path.isdir(self.gv.docx_media_path):
            self.debug.print_debug(self, u'Ripping out media directory')

            self.gv.mk_dir(self.gv.output_media_path)
            self.gv.copy_folder(self.gv.docx_media_path, self.gv.output_media_path, False, None)

            if not self.gv.settings.args['--noimageprocessing']:
                self.handle_wmf()

        # copy input file into the docx subfolder
        if extract:
            shutil.copy(self.gv.input_file_path, self.gv.docx_temp_folder_path)

        # NOTE(review): the clean-up and both transforms are assumed to apply only to DOCX input, because a
        # TEI input has already been copied to the TEI output path above -- confirm against the intended flow
        if not tei:
            # fix dud LibreOffice conversion
            self._fix_core_properties()

            if run_proprietary:
                # run a transform on the copied docx to generate a new version of the Word XML that includes MML
                java_command = self.saxon_omml_to_mml()
                self.debug.print_debug(self, u'Running saxon transform (DOCX->MML DOCX) [proprietary]')
                subprocess.call(java_command, stdin=None, shell=True)

                self.clean_proprietary()

            # saxon converter
            java_command = self.saxon_doc_to_tei()
            self.debug.print_debug(self, u'Running saxon transform (DOCX->TEI)')
            subprocess.call(java_command, stdin=None, shell=True)

        # delete temp folders (they are kept when debugging is enabled)
        # NOTE(review): the unoconv folder is assumed to be cleaned up under the same debug guard -- confirm
        if not self.gv.debug.debug:
            shutil.rmtree(self.gv.docx_temp_folder_path)
            shutil.rmtree(self.gv.common2_temp_folder_path)

            if os.path.exists(self.gv.unoconv_folder_path):
                shutil.rmtree(self.gv.unoconv_folder_path)

        # update path to TEI from normalized saxon output
        self.gv.tei_file_path = self.gv.settings.clean_path(self.gv.tei_file_path)