-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuber_maker.py
186 lines (139 loc) · 9.03 KB
/
uber_maker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
'''
This script uses the lxml library to parse a directory of folders containing .xml files with similar names and content.
This is accomplished in the following steps:
1. Determining which of the files, if any, have publicationstatus="full" attribute, hereby referred to as the "primary" file
2. Saving the primary file as the base for a new master file that will contain all the disparate subelements, hereby referred to as the "uberfile"
3. To iterate through the rest of the files and find the necessary subelements, and then writing them to the uberfile at the proper hierarchy position*
*step 3 is done with the help of the index_insert function found in index_insert.py
To decrease runtime, the program uses exclusion tags. These are the tags of missing elements that appeared during the preliminary runs of this program when it ran every element in the source_file against every element in the dest_file. Now only the elements with the exclusion tags will be compared. In order to improve the exclusion list, 1 in every n tags is let in randomly to see if it is a missing tag. If the element is indeed missing from the dest_file it is added to the exclusion list as a new tag.
'''
from lxml import etree as et
import os
from random import randint
import datetime as dt
from index_inserter import index_inserter
from multiprocessing import Pool
# all paths that don't end in a file must end with '/'!
# uber_maker function accepts 3 arguments: a source directory s_dir, and destination directory "d_dir", a log file directory 'l_dir', and a exclusion file destination "e_file"
def uber_maker(s_dir,d_dir,l_dir,e_file):
# make destination file if it doesn't already exist
os.mkdir(d_dir) if os.path.isdir(d_dir) != True else print('Uber destination: '+d_dir)
# exclusions list populated with elements in the exclusion file. len(line)-1 is used to avoid copying the '\n' escape character at the end of each tag
e_list = [line[:len(line)-1] for line in open(e_file,'r')]
# a file of completed ubers is kept to avoid duplicates during reruns. opened as an appenable file
c_log = d_dir+'completed.txt'
open(c_log,'a')
# a list of completed files is created to compare below
c_list = [line[:len(line)-1] for line in open(c_log, 'r')]
# a log file is created in the destination directory and named using datetime library for unique filenames
# log is encoded in utf-8, because certain characters (e.g. the section symbol) will throw a >128 ordinal range error if they are not encoded
os.mkdir(l_dir+'uber_maker_output_logs') if os.path.isdir(l_dir+'uber_maker_output_logs') != True else print('Log destination: '+l_dir+'uber_maker_output_logs')
log=open(l_dir+'uber_maker_output_logs/'+str(dt.datetime.now())+'output.txt',"a",encoding='utf-8')
# loop to count files and output % completion
total = 0
done = 0
for file in os.listdir(s_dir):
total += 1
# iteration through every file in the source directory
for file in os.listdir(s_dir):
# check to see if file is in the completed list, skipping if it is
if file in c_list:
print (file+" is in completed list. Skipped...")
# else loop containing main body of function
else:
# creating a temporary file where non-primary files will be sent during the comparison portion of the code
if os.path.isdir(d_dir+'temp') != True:
os.mkdir(d_dir+'temp')
# if the temp file isn't empty this loop removes all the files so it can start fresh
else:
for tmp in os.listdir(d_dir+'temp'):
os.remove(d_dir+'temp'+'/'+tmp)
# for loop for finding the primary file among the .xml files in the "file"
for xml in os.listdir(s_dir+file):
# lxml call to create root and tree
uber_xml = ''
try:
tree = et.parse(s_dir+file+'/'+xml)
root = tree.getroot()
# if the xml has the status="full" tag it is the primary
if et.iselement(root.find('.//*[@status="full"]')) == True or len(os.listdir(s_dir+file))==1:
# is written to the destination directory
uber_xml = 'uber' + file +'.xml'
tree.write(d_dir+uber_xml)
print (str(done/total*100)+'% complete... '+uber_xml + 'written...')
# non-uberfiles written to temp file
else:
tree.write(d_dir+'temp'+'/'+xml)
print (str(done/total*100)+'% complete... '+xml + '-temp written...')
except SyntaxError:
print ('Extra content at the end of the document. Deleted')
os.remove(s_dir+file+'/'+xml)
# The uber_root is made. If no uber_root was made in the previous block, the last temp xml is used as the uber_roots
if uber_xml == '':
uber_xml = 'uber' + file +'.xml'
tree.write(d_dir+uber_xml)
else:
pass
uber_tree = et.parse(d_dir+uber_xml)
uber_root = uber_tree.getroot()
# create uber_list from uber_root
uber_list = []
for el in uber_root.iter():
# uber_list is the list of all elements in the uber_file that have tags matching those in the exclusion list
# if the exclusion list is empty all tags from the uber_file are added and then filtered in the next block
if el.tag in e_list or e_list == []:
# attributes of the element are unique enough to be a good comparison
uber_list.append(el.attrib)
print (str(done/total*100)+'% complete... '+str(el.tag)+' appened to uber_list')
# tags are randomly let in to improve the pool of exclusion tags
elif randint(1,10) == 10:
uber_list.append(el.attrib)
print (str(done/total*100)+'% complete... '+'random '+str(el.tag)+ 'appended to uber_list')
# tags that aren't accepted are passed
else:
pass
# copy missing elements from source file in the temp folder to uber_file
for xml in os.listdir(d_dir+'temp'):
# add '#' break to log to improve readability
log.write("#\n"*5)
# add header of source and destination to log
log.write(xml+' to '+uber_xml)
# create source root from xml
source_root = et.parse(d_dir+'temp'+'/'+xml).getroot()
# parse through source_file iterable and use index insert function
for el in source_root.iter():
# only elements with tags in the exclusion list are allowed through or if the exclusion list is empty
if el.tag in e_list or e_list == []:
# elements with attributes matching those found in the uber_list are removed
if el.attrib in uber_list:
print (str(done/total*100)+'% complete... '+el.tag + " already in uber_list. Skipping...")
# these elements are missing from the destination uber
else:
# element inserted into the proper hierarchy using index_insert function
index_inserter(el,uber_tree,d_dir+uber_xml,uber_xml,log)
# element attributes are added to the uber_list to avoid duplicates
uber_list.append(el.attrib)
# if the element tag isn't in the exclusion list it is added as a new line
if str(el.tag) not in e_list:
open(e_file,"a").write(str(el.tag)+"\n")
else:
pass
else:
pass
print(str(done/total*100)+'% complete... ')
# the xml is removed from the temp file
os.remove(d_dir+'temp'+'/'+xml)
##when file completed write to c_file##
open(c_log,'a').write(file+"\n")
##delete temp file##
os.rmdir(d_dir+'temp')
done += 1
##test###
# s = 'case_files/'
# d = 'uberfiles/'
# e = 'xml-database/exclusions.txt'
# pool = Pool()
# try:
# pool.apply(uber_maker(s,d,e))
# except TypeError:
# print complete