forked from mananjhaveri/NBME-Score-Clinical-Patient-Notes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_training_samples_exact_match.py
69 lines (51 loc) · 2.41 KB
/
generate_training_samples_exact_match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import ast
import joblib
import pandas as pd
features = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')
train = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/train.csv')
pn = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')
pn_dict = {}
for idx, row in pn.iterrows():
pn_dict[row['pn_num']] = row['pn_history']
FOR_PP = {}
new_annotation = []
for case_id in features['case_num'].unique():
all_pn_id = set(pn[pn['case_num']==case_id]['pn_num'].tolist())
for feature_id in features[features['case_num']==case_id]['feature_num'].unique():
# get all the pn_num that have already been annotated
annotated_pn = set(train[train['feature_num']==feature_id]['pn_num'].tolist())
# get all the pn_num that have NOT been annotated
pn_to_annotate = all_pn_id-annotated_pn
# get all current annotations
# we will use them to find more annotations
annotations = train[train['feature_num'] == feature_id]['annotation'].tolist()
annotation_texts = set()
for a in annotations:
anns = eval(a)
for at in anns:
annotation_texts.add(at)
FOR_PP[feature_id] = FOR_PP.get(feature_id, []) + [at]
# annotate
for pn_id in pn_to_annotate:
new_annotation_pn, new_location_pn = [], []
pn_text = pn_dict[pn_id]
for at in annotation_texts:
start = pn_text.find(at)
if start>=0:
new_annotation_pn.append(at)
new_location_pn.append(f'{start} {start+len(at)}')
if len(new_annotation_pn)>0:
new_annotation.append((
f'{pn_id:04d}_{feature_id:03d}',
case_id,
pn_id,
feature_id,
new_annotation_pn,
new_location_pn
))
new_training_data = pd.DataFrame(new_annotation, columns=train.columns)
new_training_data['annotation'] = new_training_data['annotation'].astype(str)
new_training_data.to_csv('exact_match_train_data.csv', index=False)
df = pd.concat([train, new_training_data]).reset_index(drop=True)
df['annotation'] = df['annotation'].astype(str)
joblib.dump(FOR_PP, 'annots_for_pp.bin')