-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDLWMA_pre3_partition_data.py
138 lines (112 loc) · 5.17 KB
/
DLWMA_pre3_partition_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python
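# This script partitions the dataset into N subsamples for N-fold cross-validation.
# The partition is done per patient: every row of a patient lands in the same subsample.
# The output is an Excel spreadsheet listing the subsample of every row (plus one .npy file of patient IDs per subsample).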
import os
import numpy as np
import supplement
import random
import function_list_VR as ff
import pandas as pd
from sklearn.model_selection import train_test_split
# Experiment settings; this script reads cg.save_dir and cg.num_partitions from it.
cg = supplement.Experiment()

# Spreadsheet with one row per movie; the columns used by this script are
# 'Patient_ID', 'angle', 'class' and 'video_name'.
movie_list_path = os.path.join(cg.save_dir, 'Patient_List/movie_list_w_classes_w_picked_timeframes_train.xlsx')
excel_file = pd.read_excel(movie_list_path)

# Unique patient IDs; np.unique also returns them sorted.
patient_list = np.unique(np.asarray(excel_file['Patient_ID'].tolist()))
# Path taken when the partition has not been done yet:
# partition randomly while keeping the class ratio of every subsample close to
# the populational class ratio.

# Fraction of 'abnormal' rows among all rows of each view angle.
populational_ratio = []
for angle in (0, 60, 120, 180, 240, 300):
    angle_rows = excel_file[excel_file['angle'] == angle]
    abnormal_rows = angle_rows[angle_rows['class'] == 'abnormal']
    populational_ratio.append(len(abnormal_rows) / len(angle_rows))

# Unweighted mean of the per-angle fractions.
all_angle_populational_ratio = sum(populational_ratio) / len(populational_ratio)
print(populational_ratio, all_angle_populational_ratio)
# Randomly partition the patients, retrying until the class ratios of every
# subsample are close enough to the populational ratios.
#
# Reads:  excel_file, patient_list, populational_ratio,
#         all_angle_populational_ratio, cg.num_partitions
# Writes: patient_list_split (list of arrays of patient IDs, one per subsample),
#         seed, Ratio (per-subsample, per-angle abnormal fraction) and
#         ALL_ANGLE_RATIO (per-subsample mean over the angles).
std = 0.06  # tolerance: largest allowed absolute deviation from a populational ratio
view_angles = [0, 60, 120, 180, 240, 300]
max_per_angle_misses = 3  # how many (subsample, angle) ratios may miss the tolerance

if len(patient_list) < cg.num_partitions:
    # An empty subsample would otherwise end in a ZeroDivisionError below.
    raise ValueError('cannot split ' + str(len(patient_list)) + ' patients into '
                     + str(cg.num_partitions) + ' non-empty partitions')

# Look up the class of every (patient, angle) once, instead of re-filtering the
# spreadsheet for each patient and angle on every retry.  When a patient has
# several rows for one angle only the first row counts.
first_class = {}
for pid, ang, cls in zip(excel_file['Patient_ID'], excel_file['angle'], excel_file['class']):
    first_class.setdefault((pid, ang), cls)

missing = [(p, a) for p in patient_list.tolist() for a in view_angles
           if (p, a) not in first_class]
if missing:
    raise ValueError('no spreadsheet row for these (Patient_ID, angle) pairs: ' + str(missing[:10]))

# NOTE(review): there is no retry limit; if the criteria cannot be met for this
# dataset the loop never terminates.
while True:
    seed = np.random.randint(100000)
    np.random.seed(seed)
    # Permute a copy of patient_list (still in the sorted order produced by
    # np.unique) instead of shuffling it in place: the accepted partition then
    # depends only on `seed`, so the seed printed below reproduces it.
    patient_list_split = np.array_split(np.random.permutation(patient_list), cg.num_partitions)

    Ratio = []
    ALL_ANGLE_RATIO = []
    for patient_group in patient_list_split:
        group_patients = patient_group.tolist()
        group_ratio = []
        for angle in view_angles:
            abnormal_count = sum(first_class[(p, angle)] == 'abnormal' for p in group_patients)
            group_ratio.append(abnormal_count / len(group_patients))
        Ratio.append(group_ratio)
        ALL_ANGLE_RATIO.append(sum(group_ratio) / len(group_ratio))

    # 1 where a subsample's all-angle ratio is within tolerance, else 0.
    check_all_angle = [
        int(all_angle_populational_ratio - std <= a <= all_angle_populational_ratio + std)
        for a in ALL_ANGLE_RATIO
    ]

    # Same check for every (subsample, angle) pair.
    check_per_angle = []
    for group_ratio in Ratio:
        for jj, a in enumerate(group_ratio):
            check_per_angle.append(int(populational_ratio[jj] - std <= a <= populational_ratio[jj] + std))
    per_angle_misses = check_per_angle.count(0)
    print(check_all_angle, per_angle_misses)

    # Accept when every subsample passes the all-angle check and only a few
    # per-angle ratios are out of tolerance.
    if all(check_all_angle) and per_angle_misses <= max_per_angle_misses:
        print('seed is ', seed, ' ratios are: ', Ratio, ALL_ANGLE_RATIO)
        break
# Save the partition results into numpy files: batch_<i>.npy holds the patient
# IDs of subsample i.
np_save_folder = os.path.join(cg.save_dir, 'partitions/angle_all')
# Ensure the parent folder ('.../partitions') and then the target folder exist.
# The parent is os.path.dirname(np_save_folder); os.path.basename would only
# give the relative name 'angle_all', i.e. a folder in the working directory.
# NOTE(review): assumes ff.make_folder creates each listed path in order — confirm.
ff.make_folder([os.path.dirname(np_save_folder), np_save_folder])

batch_list = []
for i in range(cg.num_partitions):
    batch_list.append(patient_list_split[i])
    print(batch_list[i].shape)
    np.save(os.path.join(np_save_folder, 'batch_' + str(i) + '.npy'), batch_list[i])
# Save the partition results into data_file_angle_all_train.xlsx: the input
# spreadsheet with an extra 'batch' column giving the subsample of each row.

# Map every patient to the index of its subsample once, instead of testing
# each row against every subsample.
batch_of_patient = {}
for batch in range(cg.num_partitions):
    for patient in patient_list_split[batch].tolist():
        batch_of_patient[patient] = batch

batch_file = []
for patient, video_name in zip(excel_file['Patient_ID'], excel_file['video_name']):
    if patient not in batch_of_patient:
        # Fail loudly instead of silently reusing the previous row's batch.
        raise ValueError('Patient_ID ' + repr(patient) + ' is not assigned to any batch')
    batch_file.append([batch_of_patient[patient], video_name])
batch_file = pd.DataFrame(batch_file, columns=['batch', 'video_name'])

# NOTE(review): the merge key is 'video_name'; if video names are not unique
# the merge emits one row per matching pair — confirm they are unique.
df = pd.merge(batch_file, excel_file, on='video_name')
df.to_excel(os.path.join(cg.save_dir, 'Patient_List/data_file_angle_all_train.xlsx'), index=False)
# if already have partitions done:
# numpy_files = ff.find_all_target_files(['*.npy'],os.path.join(cg.save_dir,'partitions/angle_all'))
# patient_list_split = []
# for b in range(0,5):
#     numpy_file = ff.find_all_target_files(['batch_'+str(b)+'.npy'],os.path.join(cg.save_dir,'partitions/angle_all'))
#     bb = np.load(numpy_file[0])
#     patient_list_split.append(bb)
# print(patient_list_split)
# # save into data_file.xlsx
# batch_file = []
# for i in range(0,excel_file.shape[0]):
#     case = excel_file.iloc[i]
#     #print(case['video_name'])
#     for batch in range(cg.num_partitions):
#         if np.isin(case['Patient_ID'],patient_list_split[batch]) == 1:
#             B = batch
#     batch_file.append([B, case['video_name']])
# batch_file = pd.DataFrame(batch_file,columns = ['batch','video_name'])
# df = pd.merge(batch_file,excel_file,on = 'video_name')
# df.to_excel(os.path.join(cg.save_dir,'Patient_List/data_file_classes_train.xlsx'),index=False)