forked from sshah2/testingAIUI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ParseDocForDMVGenAI - v2.py
289 lines (250 loc) · 13.2 KB
/
ParseDocForDMVGenAI - v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import openai
from docx import Document
import json
import re
import csv
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process
# Set up OpenAI API key from environment variable for security
openai.api_key = "***REMOVED***"
# Specify the path to the CSV file
csv_file_path = r'C:\Users\sashah\Desktop\Projects\CA DMV\AssessmentTask.csv'
# Helper function to clean text
def clean_text(text):
# Remove special characters and extra whitespace
text = re.sub(r'[^\x00-\x7F]+', ' ', text)
return re.sub(r'\s+', ' ', text).strip()
# Load the data dictionary from the CSV file
def load_data_dict(csv_file_path):
data_dict = []
try:
with open(csv_file_path, mode='r', encoding='latin1') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
field_label = clean_text(row["FIELD LABEL"])
field_name = row["FIELD NAME"]
data_type = row["DATA TYPE"]
data_dict.append({"Field Label": field_label, "Field Name": field_name, "Data Type": data_type})
except FileNotFoundError:
print(f"File not found: {csv_file_path}")
exit(1)
return data_dict
data_dict = load_data_dict(csv_file_path)
# Create a TF-IDF vectorizer and fit it on the field labels
vectorizer = TfidfVectorizer().fit([item['Field Label'] for item in data_dict])
data_dict_vectors = vectorizer.transform([item['Field Label'] for item in data_dict])
# Load the Word document
file_path = r'C:\Users\sashah\Desktop\Projects\CA DMV\SCR0001-1.docx'
document = Document(file_path)
# Extract tables from the document
tables = document.tables
# Extract the table with fields data
fields_table_data, validation_table_data, business_table_data = None, None, None
for table in tables:
headers = [cell.text.strip() for cell in table.rows[0].cells]
if headers[:5] == ['Field #', 'Field Label', 'Field Type', 'Description', 'Field Length']:
fields_table_data = table
elif headers[:2] == ['V. Rule#', 'V. Rule Name']:
validation_table_data = table
elif headers[:2] == ['BR#', 'BR Name']:
business_table_data = table
# Define valid field types and mapping rules
valid_field_types = {
"Label with Radio buttons": "Radio Button",
"Textbox": "Textbox",
"Type ahead textbox": "Type Ahead Textbox",
"Picklist": "Picklist",
"Date": "Date",
"Label": "Label",
"Hyperlink": "Hyperlink"
}
# Function to use OpenAI API to refine field label matching
def refine_match(field_label, candidate_labels, candidate_field_names):
prompt = f"Match the following field label to the closest field name from the candidates:\n\nField Label: {field_label}\n\nCandidates:\n"
for label, field_name in zip(candidate_labels, candidate_field_names):
prompt += f"Field Label: {label}, Field Name: {field_name}\n"
prompt += "\nClosest match:"
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{"role": "system", "content": "You are an assistant that helps with matching field labels to field names."},
{"role": "user", "content": prompt}
],
max_tokens=50
)
match = response['choices'][0]['message']['content'].strip()
# Extracting the field name from the response
for label, field_name in zip(candidate_labels, candidate_field_names):
if field_name in match:
return field_name
return "Unknown__C"
# Function to match field labels using similarity and fuzzy matching
def match_field_label(field_label, data_dict, data_dict_vectors, top_n=10):
field_label_vector = vectorizer.transform([field_label])
similarities = cosine_similarity(field_label_vector, data_dict_vectors).flatten()
top_indices = similarities.argsort()[-top_n:][::-1]
candidate_labels = [data_dict[i]['Field Label'] for i in top_indices]
candidate_field_names = [data_dict[i]['Field Name'] for i in top_indices]
# Additional fuzzy matching
fuzzy_match = process.extractOne(field_label, candidate_labels)
if fuzzy_match:
match_label = fuzzy_match[0]
match_index = candidate_labels.index(match_label)
candidate_labels.insert(0, candidate_labels.pop(match_index))
candidate_field_names.insert(0, candidate_field_names.pop(match_index))
# Send top candidates to OpenAI for refinement
return refine_match(field_label, candidate_labels, candidate_field_names)
# Prepare the data for JSON conversion
def process_fields_table(fields_table_data):
json_data = []
if fields_table_data:
for i, row in enumerate(fields_table_data.rows[1:]): # Skip header row
try:
cells = row.cells
field_type = cells[2].text.strip()
mapped_field_type = valid_field_types.get(field_type, "Unknown")
picklist_values = clean_text(cells[6].text.strip())
required_rule, display_rule, validation_rule = "", "", ""
if "Required Rule:" in picklist_values:
required_rule = picklist_values.split("Required Rule:")[1].strip()
picklist_values = picklist_values.split("Required Rule:")[0].strip()
if "Display Rule:" in picklist_values:
display_rule = picklist_values.split("Display Rule:")[1].strip()
picklist_values = picklist_values.split("Display Rule:")[0].strip()
if "Validation Rule:" in picklist_values:
validation_rule = picklist_values.split("Validation Rule:")[1].strip()
picklist_values = picklist_values.split("Validation Rule:")[0].strip()
field_label = clean_text(cells[1].text.strip())
field_name = match_field_label(field_label, data_dict, data_dict_vectors)
field_info = next((item for item in data_dict if item["Field Name"] == field_name),
{"Field Name": "Unknown__C", "Data Type": "Unknown"})
field_dict = {
"#": clean_text(cells[0].text.strip()),
"Type": mapped_field_type,
"Field Label": field_label,
"Field API Name": field_info["Field Name"],
"Field Description": clean_text(cells[3].text.strip()),
"Field Type": field_info["Data Type"],
"Field Length": clean_text(cells[4].text.strip()),
"Required (Y/N)": clean_text(cells[5].text.strip()),
"Picklist Values / Radio Button Values": picklist_values,
"Error Message Trigger": "",
"Error Message": "",
"Business Rule Trigger": "",
"Business Logic": "",
"Business Rule Name": "",
"Business Rule Description": "",
"Business Rule Expression": "",
"Required Rule": required_rule,
"Display Rule": display_rule,
"Validation English Rule": validation_rule,
"Validation Rule": validation_rule,
"Validation Name": "",
"Readonly": "",
"Field": field_info["Field Name"],
"Object": "AssessmentTask",
"Row": "",
"Column": ""
}
json_data.append(field_dict)
except Exception as e:
print(f"Error processing row {i + 1}: {e}")
continue
return json_data
json_data = process_fields_table(fields_table_data)
# Convert the list to a JSON structure
json_output = {"data": json_data}
# Save the JSON to a file
output_file_path = r'C:\Users\sashah\Desktop\Projects\CA DMV\output_from_docx.json'
with open(output_file_path, 'w') as json_file:
json.dump(json_output, json_file, indent=4)
print(f"JSON output has been saved to {output_file_path}")
# Rule Identification and Conversion Process
# Function to identify the corresponding label for a validation rule
def identify_corresponding_label(rule_description, field_string):
prompt = (
f"Given the following rule description, break the rule into multiple tokens (variables, labels and constants), "
f"and from the tokens, identify which label corresponds to one of the labels in the provided field string.\n\n"
f"Rule Description: {rule_description}\n\n"
f"Field String: {field_string}\n\n"
f"Format the result as: Corresponding Label: <label>"
)
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{"role": "system", "content": "You are an assistant that helps with identifying corresponding labels for validation rules."},
{"role": "user", "content": prompt}
],
max_tokens=200
)
result = response['choices'][0]['message']['content'].strip()
#print(f"Identification Result: {result}") # Debugging: Print identification results
match = re.search(r'Corresponding Label:\s*(.*)', result)
if match:
return match.group(1).strip()
return None
# Create a string of all captured labels
field_labels = [field['Field Label'] for field in json_data]
field_string = ', '.join(field_labels)
# Process validation rules
def process_validation_rules(validation_table_data, json_data, field_string):
if validation_table_data:
for rule_row in validation_table_data.rows[1:]:
rule_cells = rule_row.cells
rule_description = clean_text(rule_cells[2].text.strip())
error_message = clean_text(rule_cells[3].text.strip())
corresponding_label = identify_corresponding_label(rule_description, field_string)
print("CL")
print(corresponding_label)
if corresponding_label:
for field in json_data:
if field["Field Label"] == corresponding_label:
field_api_name = field["Field API Name"]
field["Error Message"] = error_message
prompt = (
f"Given the following Rule Description, follow the following steps: "
f"1) Figure out all the labels that are present in the rule and identify which label "
f"corresponds to <{corresponding_label}> "
f"2) Now convert the Rule Description into a pseudo-code based validation condition – "
f"and in that validation condition, you will: "
f" a.Replace the label that corresponded to <{corresponding_label}> with {field_api_name} "
f" b.For all variables that are remaining in the rule, replace them with the format "
f"NOT_FOUND_VARIABLE_name "
f"c.Now, create the rest of the pseudo-code based validation condition using the "
f"appropriate constants and operators "
f"d. Do not include any user actions or messaging instructions in the validation condition"
f"3) The resulting final pseudo-code based validation condition should start with || and "
f"end the condition also with || so that I can extract it later \n\n"
f"Rule Description: {rule_description}\n\n"
)
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{"role": "system",
"content": "You are an assistant that helps with identifying the appropriate pseudo code for validation rules."},
{"role": "user", "content": prompt}
],
max_tokens=200
)
#print(f"Prompt: {prompt}")
#print(f"Response: {response}")
result = response['choices'][0]['message']['content'].strip()
print(f"Result: {result}")
updated_rule_description = rule_description # Assigning the original rule description
# Updating with the result if it is not empty
if result:
pattern = re.escape("||") + r"(.*?)" + re.escape("||")
match = re.search(pattern, result, re.DOTALL)
if match:
updated_rule_description = match.group(1).strip()
# Ensuring the field's validation rule is updated
field["Validation Rule"] = updated_rule_description
break
process_validation_rules(validation_table_data, json_data, field_string)
# Save the updated JSON to a file
output_file_path = r'C:\Users\sashah\Desktop\Projects\CA DMV\updated_output_from_docx.json'
with open(output_file_path, 'w') as json_file:
json.dump(json_data, json_file, indent=4)
print(f"Updated JSON output has been saved to {output_file_path}")