-
Notifications
You must be signed in to change notification settings - Fork 82
/
create_dataset.py
104 lines (76 loc) · 2.93 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import sys
import argparse
import collections
import pickle
from pprint import pprint
import pandas as pd
from tqdm import tqdm
from ..utils import misc
def get_args():
    """Parse the command-line options of this script.

    Recognised flags are ``--input`` and ``--output`` (kept as strings) and
    ``--brain-diff`` (converted to ``float``, exposed as ``brain_diff``).
    A flag that is not given on the command line is ``None``.

    Returns:
        argparse.Namespace: the parsed options.
    """
    cli = argparse.ArgumentParser()
    # Plain string options, registered in the same order as before so the
    # generated --help text is unchanged.
    for flag in ('--input', '--output'):
        cli.add_argument(flag)
    cli.add_argument('--brain-diff', type=float)
    return cli.parse_args()
def show_distribution(dataset):
    """Pretty-print how often each label occurs in ``dataset``.

    Every row's ``labels`` attribute is split on whitespace and each piece is
    counted. A row whose ``labels`` string is empty is counted under
    ``'negative'``, and every row is counted once under ``'all'``.

    Args:
        dataset: object providing ``itertuples()`` whose rows expose a
            ``labels`` string (a DataFrame in this script).

    Returns:
        None. The tally is written to stdout via ``pprint``.
    """
    tally = collections.defaultdict(int)
    for record in dataset.itertuples():
        label_text = record.labels
        for name in label_text.split():
            tally[name] += 1
        # Only a truly empty string counts as a negative example.
        if not label_text:
            tally['negative'] += 1
        tally['all'] += 1
    pprint(tally)
def parse_position(df):
    """Expand ``ImagePositionPatient`` into three separate columns.

    Each entry of ``df.ImagePositionPatient`` is turned into a ``pd.Series``
    and the resulting columns are named ``Position1``, ``Position2`` and
    ``Position3``.

    Args:
        df: DataFrame with an ``ImagePositionPatient`` column.

    Returns:
        A new DataFrame holding the columns of ``df`` followed by the three
        position columns; ``df`` itself is not modified.
    """
    components = df.ImagePositionPatient.apply(pd.Series)
    components.columns = ['Position1', 'Position2', 'Position3']
    return pd.concat([df, components], axis='columns')
def parse_orientation(df):
    """Expand ``ImageOrientationPatient`` into six separate columns.

    Each entry of ``df.ImageOrientationPatient`` is turned into a
    ``pd.Series`` and the resulting columns are named ``Orient1`` through
    ``Orient6``.

    Args:
        df: DataFrame with an ``ImageOrientationPatient`` column.

    Returns:
        A new DataFrame holding the columns of ``df`` followed by the six
        orientation columns; ``df`` itself is not modified.
    """
    components = df.ImageOrientationPatient.apply(pd.Series)
    components.columns = [
        'Orient1', 'Orient2', 'Orient3',
        'Orient4', 'Orient5', 'Orient6',
    ]
    return pd.concat([df, components], axis='columns')
def add_adjacent_labels(df):
    """Attach the labels of each record's neighbours within its study.

    The frame is sorted by ``PositionOrd`` and grouped by
    ``StudyInstanceUID``. Within each group, every row receives the
    ``labels`` value of the preceding row as ``LeftLabel`` and of the
    following row as ``RightLabel``; the first row of a group gets an empty
    ``LeftLabel`` and the last row an empty ``RightLabel``.

    Args:
        df: DataFrame with ``PositionOrd``, ``StudyInstanceUID``, ``labels``
            and ``ID`` columns.

    Returns:
        The sorted frame merged on ``ID`` with the new ``LeftLabel`` and
        ``RightLabel`` columns.
    """
    ordered = df.sort_values('PositionOrd')
    neighbours = []
    print('making adjacent labels...')
    for _, study in tqdm(ordered.groupby('StudyInstanceUID')):
        study_labels = list(study.labels)
        # Shift the label list by one in each direction; the empty string
        # pads the side that has no neighbour.
        previous = [''] + study_labels[:-1]
        following = study_labels[1:] + ['']
        for left, right, record_id in zip(previous, following, study.ID):
            neighbours.append({
                'LeftLabel': left,
                'RightLabel': right,
                'ID': record_id,
            })
    return pd.merge(ordered, pd.DataFrame(neighbours), on='ID')
def main():
    """Build the dataset pickle from a pickled DataFrame of image metadata.

    Steps, in order:
      1. Load the DataFrame from ``--input`` and print its label distribution.
      2. If ``--brain-diff`` was given, keep only rows whose ``brain_diff``
         exceeds that threshold.
      3. Expand the image position, normalise ``WindowCenter`` /
         ``WindowWidth`` through ``misc.get_dicom_value`` and compute
         ``PositionOrd`` (rank of ``Position3`` within its
         ``SeriesInstanceUID`` divided by the group's count).
      4. Add neighbouring labels, keep a fixed set of columns, print the
         distribution again, sort by ``ID`` and pickle the result to
         ``--output``.
    """
    args = get_args()

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only feed this script inputs produced by a trusted pipeline.
    with open(args.input, 'rb') as f:
        df = pickle.load(f)
    print('read %s (%d records)' % (args.input, len(df)))
    show_distribution(df)

    # Compare against None rather than relying on truthiness: an explicit
    # threshold of 0 is falsy but must still apply the filter.
    if args.brain_diff is not None:
        df = df[df.brain_diff > args.brain_diff]
        print('excluded records by brain_diff (%d records now)' % len(df))

    df = parse_position(df)

    df['WindowCenter'] = df.WindowCenter.apply(misc.get_dicom_value)
    df['WindowWidth'] = df.WindowWidth.apply(misc.get_dicom_value)

    # Relative position of each record inside its series: rank / group size.
    position_by_series = df.groupby('SeriesInstanceUID')[['Position3']]
    df['PositionOrd'] = position_by_series.rank() / position_by_series.transform('count')

    df = add_adjacent_labels(df)
    df = df[[
        'ID', 'labels', 'PatientID',
        'WindowCenter', 'WindowWidth',
        'RescaleIntercept', 'RescaleSlope',
        'Position3', 'PositionOrd',
        'LeftLabel', 'RightLabel',
        'BitsStored', 'PixelRepresentation',
        'brain_ratio', 'brain_diff',
    ]]

    show_distribution(df)

    df = df.sort_values('ID')
    with open(args.output, 'wb') as f:
        pickle.dump(df, f)

    print('created dataset (%d records)' % len(df))
    print('saved to %s' % args.output)
# Script entry point: echo the raw command line, then run the pipeline.
if __name__ == '__main__':
    print(sys.argv)
    main()