-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_preprocessing_generator_one_dataset.py
114 lines (99 loc) · 6.68 KB
/
data_preprocessing_generator_one_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
## Data preprocessing steps for the explanation generator
## Step 1: remove rows whose question or options are NaN or contain an '<img' tag, and rows whose total number of ratings is lower than 10.
## Step 2: clean the question, options and explanation by stripping HTML tags and surrounding whitespace.
## Step 3: remove rows whose explanation is empty after cleaning, is shorter than 10 words, or whose average rating is below 3.
import pandas as pd
import json
from bs4 import BeautifulSoup
import random
# Load the Cardiff question export. The two Sydney exports are currently
# disabled; to bring them back, load them the same way and add them to the
# three lists at the end of this section:
#   Sydney_all_questions = pd.read_excel('./Paul_new_data/Sydney_all_questions.xlsx')
#   Sydney_additionalLTISet_all_questions = pd.read_excel('./Paul_new_data/Sydney_additionalLTISet_all_questions.xlsx')
cardiff_all_question = pd.read_excel("./Paul_new_data/Cardiff_all_questions.xlsx")

# A disabled rename step (presumably for sheets exported without a header
# row -- confirm before re-enabling) assigned these names to positional
# columns 0-19 of each frame via `.rename(columns={...}, inplace=1)`:
#   0 id, 1 course_id, 2 timestamp, 3 user, 4 avg_rating, 5 total_responses,
#   6 total_ratings, 7 top_rating_count, 8 avg_difficulty, 9 total_comments,
#   10 deleted, 11 answer, 12 numAlts, 13 question, 14 altA, 15 altB,
#   16 altC, 17 altD, 18 altE, 19 explanation

# Accumulator for the records built from the Cardiff sheet.
# (Disabled: Sydney_all_questions_list, Sydney_additionalLTISet_all_questions_list)
cardiff_all_question_list = []

# Parallel lists: position i in each one refers to the same dataset.
name_list = [
    "cardiff_all_question",
    # "Sydney_all_questions",
    # "Sydney_additionalLTISet_all_questions",
]
total_questions = [
    cardiff_all_question,
    # Sydney_all_questions,
    # Sydney_additionalLTISet_all_questions,
]
total_list = [
    cardiff_all_question_list,
    # Sydney_all_questions_list,
    # Sydney_additionalLTISet_all_questions_list,
]
# Instruction text shared by every generated record.
_INSTRUCTION = "As an explanation generation expert, can you generate the explanation for the given input?"
# Option labels in column order (altA..altE); a row uses the first `numAlts`.
_OPTION_LABELS = ("A", "B", "C", "D", "E")
# Columns whose raw text must not contain an embedded image.
_IMG_CHECKED_COLUMNS = ("question", "altA", "altB", "altC", "altD", "altE", "explanation")


def _clean_text(raw):
    """Return *raw* with HTML markup removed, ends stripped and NBSP replaced by a space."""
    text = BeautifulSoup(raw, "html.parser").get_text().strip()
    return text.replace("\u00a0", " ")


def _build_record(row):
    """Turn one spreadsheet row into an instruction/input/output record.

    Returns None when the row must be dropped, i.e. when:
      * the question, the answer, ``numAlts`` or any option in use is missing;
      * the question, any option or the explanation contains an ``<img`` tag;
      * ``total_ratings`` is not at least 10 or ``avg_rating`` is not at least 3;
      * ``numAlts`` is outside 1..5;
      * the cleaned explanation has fewer than 10 words.
    """
    num_alts = row["numAlts"]
    # Missing cells would otherwise crash int() or leak into the prompt as the
    # literal text "nan".
    if pd.isna(row["question"]) or pd.isna(row["answer"]) or pd.isna(num_alts):
        return None

    num_alts = int(num_alts)
    if not 1 <= num_alts <= len(_OPTION_LABELS):
        return None

    # Written as `not (x >= t)` instead of `x < t` so that NaN ratings are
    # rejected too (every comparison with NaN is False).
    if not row["total_ratings"] >= 10 or not row["avg_rating"] >= 3:
        return None

    # Image-based content cannot be represented in a text-only prompt.
    if any("<img" in str(row[column]) for column in _IMG_CHECKED_COLUMNS):
        return None

    used_options = [row["alt" + label] for label in _OPTION_LABELS[:num_alts]]
    if any(pd.isna(cell) for cell in used_options):
        return None

    explanation = _clean_text(str(row["explanation"]))
    # Also covers an empty or missing explanation (zero or one word).
    if len(explanation.split()) < 10:
        return None

    question = _clean_text(str(row["question"]))
    answer = _clean_text(str(row["answer"]))
    options_text = "".join(
        f" Option {label}: {_clean_text(str(cell))}"
        for label, cell in zip(_OPTION_LABELS, used_options)
    )
    return {
        "instruction": _INSTRUCTION,
        "input": f"Given question: {question}{options_text} The correct answer is Option {answer}.",
        "output": explanation,
    }


# Walk the parallel dataset/accumulator lists so that enabling another dataset
# only requires adding it to `total_questions` and `total_list`.
for questions_frame, records in zip(total_questions, total_list):
    for _, row in questions_frame.iterrows():
        record = _build_record(row)
        if record is not None:
            records.append(record)
# Disabled: dump every dataset to its own JSON file.
# for i in range(3):
#     with open('./Paul_new_data/'+str(name_list[i])+'.json', "w") as f:
#         json.dump(total_list[i], f, indent=4)

# Seed for the train/test shuffle. A fixed seed keeps the partition identical
# across runs, so records from an earlier test split never end up in a later
# training split.
_SPLIT_SEED = 42

# Work on a copy so the per-dataset accumulator keeps its original order.
final_total_list = list(total_list[0])  # + total_list[1] + total_list[2]
# A dedicated Random instance leaves the global random state untouched.
random.Random(_SPLIT_SEED).shuffle(final_total_list)

split_index = int(len(final_total_list) * 0.8)
train_list = final_total_list[:split_index]
test_list = final_total_list[split_index:]

## 80% data from the final_total_list will be used for training
with open('./Paul_new_data/Cardiff_generator_train_avg_3_lenexp_10.json', "w", encoding="utf-8") as f:
    json.dump(train_list, f, indent=4)

## 20% data from the final_total_list will be used for testing and evaluation
with open('./Paul_new_data/Cardiff_generator_test_avg_3_lenexp_10.json', "w", encoding="utf-8") as f:
    json.dump(test_list, f, indent=4)