'''
Orchestrates the whole pipeline for generating core-chain candidates
from the LC-QuAD, QALD, and question-generation (qg) datasets.
'''
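# Example invocation (a sketch; the resources/ files referenced in __main__
# must exist locally):
#   python data_creator_step1.py 0 -1 lcquad
# argv[1] is the start index, argv[2] the end index (-1 means "till the end"),
# and argv[3] the dataset: lcquad, qald, qg, qg_copy, or qg_customcopy.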
import traceback
import pathlib
import json
import sys
import os

import datasetPreparation.create_dataset as cd
# Creates the directory (including any missing parents) if it does not already exist.
def create_dir(dir_location):
    pathlib.Path(dir_location).mkdir(parents=True, exist_ok=True)

_save_location_success = 'data/data/raw/%(dataset)s/success'
_save_location_unsuccess = 'data/data/raw/%(dataset)s/unsuccess'
# Extension only; the start index is prepended to this in __main__.
file_name = '.json'
def convert_qald_to_lcquad(dataset):
    '''Converts a QALD-style dataset into the LC-QuAD node format used downstream.'''
    dataset = dataset['questions']
    new_dataset = []
    for index, node in enumerate(dataset):
        d = dict(corrected_question=node['question'][0]['string'],
                 verbalized_question=node['question'][0]['string'],
                 _id=index,
                 sparql_query=node['query']['sparql'].replace('\n', ' '),
                 sparql_template_id=999)
        new_dataset.append(d)
    return new_dataset
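# A minimal sketch of what convert_qald_to_lcquad does (the question text and
# SPARQL here are hypothetical; field names follow the QALD-7 JSON used below):
#   {'question': [{'string': 'Who wrote Hamlet?'}],
#    'query': {'sparql': 'SELECT DISTINCT ?uri WHERE { ... }'}}
# becomes
#   {'corrected_question': 'Who wrote Hamlet?',
#    'verbalized_question': 'Who wrote Hamlet?',
#    '_id': 0,
#    'sparql_query': 'SELECT DISTINCT ?uri WHERE { ... }',
#    'sparql_template_id': 999}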
def run(_dataset, _save_location_success, _save_location_unsuccess,
        _file_name, _predicate_blacklist, _relation_file, return_data, _qald=False):
    '''
    :param _dataset: a list of data nodes.
    :param _save_location_success: location where nodes processed without error are stored.
    :param _save_location_unsuccess: location where nodes for which some error occurred are stored.
    :param _file_name: name of the file in which the data is stored.
    :param _predicate_blacklist: list of predicates to ignore during data generation.
    :param _relation_file: relation information passed through to CreateDataNode.
    :param return_data: if True, return the successful and unsuccessful data; otherwise return nothing.
    :param _qald: if True, apply the QALD-specific filtering of unsuccessful data.
    :return: (successful_data, unsuccessful_data) when return_data is True.

    Note: the error flags record whether the correct path (and constraints) were
    found during the dataset generation process. Each generated node has the shape:

    data = {
        'node': _data_node,
        'parsed_sparql': '',
        'path': [],
        'entity': [],
        'constraints': {},
        'updated_sparql': '',
        'hop1': [],
        'hop2': [],
        'error_flag': {
            'path_found_in_data_generated': False,
            'constraint_found_in_data_generated': False
        },
        'rdf_constraint': {}
    }
    '''
    # Create the output directories if they don't exist.
    create_dir(_save_location_success)
    create_dir(_save_location_unsuccess)
    fullpath_success = os.path.join(_save_location_success, _file_name)
    fullpath_unsuccess = os.path.join(_save_location_unsuccess, _file_name)
    counter = 0
    cd_node = cd.CreateDataNode(_predicate_blacklist=_predicate_blacklist,
                                _relation_file=_relation_file, _qald=_qald)
    successful_data = []
    unsuccessful_data = []
    for node in _dataset:
        try:
            data = cd_node.dataset_preparation_time(_data_node=node, rdf=True)
            data['error_flag']['aux_error'] = False
            if (data['error_flag']['path_found_in_data_generated'] and
                    data['error_flag']['constraint_found_in_data_generated']):
                successful_data.append(data)
            else:
                unsuccessful_data.append(data)
        except Exception:
            # Store the node along with the traceback so the failure can be inspected later.
            temp = {'node': node, 'error_flag': {'aux_error': traceback.format_exc()}}
            unsuccessful_data.append(temp)
        print("done with", counter)
        counter += 1
    if _qald:
        # For QALD, keep only the nodes that genuinely failed path generation;
        # nodes whose error_flag lacks the key (e.g. aux_error-only nodes) are dropped.
        new_unsuccessful_data = []
        for u in unsuccessful_data:
            try:
                if u['error_flag']['path_found_in_data_generated'] is False:
                    new_unsuccessful_data.append(u)
            except KeyError:
                continue
        unsuccessful_data = new_unsuccessful_data
    with open(fullpath_success, 'w+') as f:
        json.dump(successful_data, f)
    with open(fullpath_unsuccess, 'w+') as f:
        json.dump(unsuccessful_data, f)
    print("the length of the successful data is", len(successful_data))
    print("the length of the unsuccessful data is", len(unsuccessful_data))
    if return_data:
        return successful_data, unsuccessful_data
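# A minimal programmatic sketch of calling run() directly; the resource paths
# and the empty _relation_file mirror the __main__ block below (whether {} is
# a sufficient _relation_file depends on CreateDataNode):
#   _dataset = json.load(open('resources/lcquad_data_set.json'))
#   pb = [line.rstrip('\n') for line in open('resources/predicate.blacklist')]
#   success, unsuccess = run(_dataset=_dataset[:10],
#                            _save_location_success='data/data/raw/lcquad/success',
#                            _save_location_unsuccess='data/data/raw/lcquad/unsuccess',
#                            _file_name='0.json', _predicate_blacklist=pb,
#                            _relation_file={}, return_data=True, _qald=False)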
if __name__ == "__main__":
    start_index = sys.argv[1]
    end_index = sys.argv[2]
    dataset = sys.argv[3]
    # Fail early on an unknown dataset instead of crashing later with a NameError.
    if dataset not in ['lcquad', 'qald', 'qg', 'qg_copy', 'qg_customcopy']:
        raise ValueError("unknown dataset: %s" % dataset)
    _save_location = 'data/data/raw/%s' % dataset
    file_name = start_index + file_name
    _save_location_success = _save_location_success % {'dataset': dataset}
    _save_location_unsuccess = _save_location_unsuccess % {'dataset': dataset}
    # Read the predicate blacklist, one predicate per line; strip trailing
    # newlines (the last line may not end with one).
    pb = [line.rstrip('\n') for line in open('resources/predicate.blacklist')]
    if dataset == 'lcquad':
        _dataset = json.load(open('resources/lcquad_data_set.json'))
    elif dataset == 'qald':
        qald_train = json.load(open('resources/qald-7-train-multilingual.json'))
        _dataset = convert_qald_to_lcquad(qald_train)
    elif dataset == 'qg':
        _dataset = json.load(open('resources/qg_version_1.json'))
    elif dataset == 'qg_copy':
        # Change the name and location later.
        _dataset = json.load(open('resources/qg_version_1.json'))
    elif dataset == 'qg_customcopy':
        # Change the name and location later.
        _dataset = json.load(open('resources/qg_version_1.json'))
    # end_index comes from argv as a string, so compare its integer value.
    if int(end_index) == -1:
        _dataset = _dataset[int(start_index):]
    else:
        _dataset = _dataset[int(start_index):int(end_index)]
    if dataset in ['lcquad', 'qg', 'qg_copy', 'qg_customcopy']:
        run(_dataset=_dataset, _save_location_success=_save_location_success,
            _save_location_unsuccess=_save_location_unsuccess,
            _file_name=file_name,
            _predicate_blacklist=pb, _relation_file={}, return_data=False, _qald=False)
    elif dataset == 'qald':
        run(_dataset=_dataset, _save_location_success=_save_location_success,
            _save_location_unsuccess=_save_location_unsuccess,
            _file_name=file_name,
            _predicate_blacklist=pb, _relation_file={}, return_data=False, _qald=True)
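# After a run, the outputs land at (following the path templates above):
#   data/data/raw/<dataset>/success/<start_index>.json
#   data/data/raw/<dataset>/unsuccess/<start_index>.json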