forked from mawanda-jun/TableTrainNet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbig_json_to_xml.py
97 lines (75 loc) · 2.76 KB
/
big_json_to_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
'''
Corner points written to each Coords element, in this order:
x0,y0
x1,y0
x0,y1
x1,y1
'''
import json
import pprint
import os
import xml.etree.ElementTree as ET
import xml.dom.minidom as xdm
# JSON annotation file read by the script; it must provide the 'images' and
# 'annotations' lists.
INPUT_FILE = 'Latex_sample.json'
# Directory prefix under which the generated XML files are written.
OUTPUT_PATH = './output/'
def collect_points(annotation):
    """Extract two corner points from an annotation's first polygon.

    The first entry of ``annotation['segmentation']`` is treated as a flat
    ``[x, y, x, y, ...]`` list; its first and third vertices are returned
    (presumably opposite corners of a rectangular table outline).

    Args:
        annotation: Mapping with a ``'segmentation'`` list of polygons.

    Returns:
        list: ``[x0, y0, x1, y1]``.
    """
    polygon = annotation['segmentation'][0]
    # Positions 0-1 hold the first vertex, positions 4-5 the third one.
    return [polygon[position] for position in (0, 1, 4, 5)]
def write_on_new_xml(OUTPUT_PATH, file_name, points):
    """Create a new XML file describing a single table region.

    The written document has the form::

        <document filename="NAME">
          <tableRegion>
            <Coords points="x0,y0 x1,y0 x0,y1 x1,y1"/>
          </tableRegion>
        </document>

    Args:
        OUTPUT_PATH: Directory that receives the file. It is created when it
            does not exist yet.
        file_name: Name of the XML file. The ``filename`` attribute of the
            document is this name without its extension.
        points: Sequence ``[x0, y0, x1, y1]``.
    """
    root = ET.Element('document')
    # splitext drops the extension whatever its length.
    root.attrib['filename'] = os.path.splitext(file_name)[0]

    table_region = ET.SubElement(root, 'tableRegion')
    coords = ET.SubElement(table_region, 'Coords')
    x0, y0, x1, y1 = points[:4]
    coords.attrib['points'] = "{},{} {},{} {},{} {},{}".format(
        x0, y0, x1, y0, x0, y1, x1, y1)

    # Make sure the target directory exists; an empty OUTPUT_PATH means the
    # current directory, which needs no creation.
    if OUTPUT_PATH:
        os.makedirs(OUTPUT_PATH, exist_ok=True)
    ET.ElementTree(root).write(
        os.path.join(OUTPUT_PATH, file_name),
        encoding="UTF-8",
        xml_declaration=True)
def add_points_to_old_xml(old_file, points):
    """Append the points of one more table to an already existing XML file.

    A new ``tableRegion`` element with a ``Coords`` child is added under the
    root of ``old_file``, and the file is rewritten in place.

    Args:
        old_file: Path of the XML file to extend.
        points: Sequence ``[x0, y0, x1, y1]``.
    """
    tree = ET.parse(old_file)
    root = tree.getroot()

    table_region = ET.SubElement(root, 'tableRegion')
    coords = ET.SubElement(table_region, 'Coords')
    x0, y0, x1, y1 = points[:4]
    coords.attrib['points'] = "{},{} {},{} {},{} {},{}".format(
        x0, y0, x1, y0, x0, y1, x1, y1)

    # Write back to the file that was parsed, so the function does not depend
    # on any module-level state.
    tree.write(old_file, encoding="UTF-8", xml_declaration=True)
# MAIN
# Load the annotation file; the context manager closes the handle right after
# parsing.
with open(INPUT_FILE, 'r') as json_file:
    data = json.load(json_file)
# pprint.pprint(data, depth=1)  # shows the first level of the JSON structure

# All image records and all annotation records of the input file.
images = list(data['images'])
annotations = list(data['annotations'])

# Index the images by id so the lookup does not rely on ids being exactly
# 1..N in file order. A record without an 'id' falls back to its 1-based
# position, which is the numbering the positional lookup assumed.
images_by_id = {
    image.get('id', position): image
    for position, image in enumerate(images, start=1)
}

# The XML writers need the output directory to exist.
if OUTPUT_PATH:
    os.makedirs(OUTPUT_PATH, exist_ok=True)

image_ids = set()  # ids of the images that already have an XML file
for annotation in annotations:
    image_id = annotation['image_id']
    image = images_by_id[image_id]
    # The XML file is named after the image, with the extension replaced by
    # '.xml'.
    file_name = os.path.splitext(str(image['file_name']))[0] + '.xml'
    points = collect_points(annotation)

    if image_id not in image_ids:
        # First table of this image: create a new XML file.
        image_ids.add(image_id)
        write_on_new_xml(OUTPUT_PATH, file_name, points)
    else:
        # Further tables of the same image go into the existing XML file.
        add_points_to_old_xml(os.path.join(OUTPUT_PATH, file_name), points)