-
Notifications
You must be signed in to change notification settings - Fork 82
/
dicom_to_dataframe.py
129 lines (101 loc) · 3.33 KB
/
dicom_to_dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import sys
import argparse
import functools
import pickle
from multiprocessing import Pool
import copy
import pydicom
import pandas as pd
from tqdm import tqdm
import numpy as np
np.seterr(over='ignore')
from ..utils import misc
def get_args(argv=None):
    """Parse the command-line options of the DICOM-to-dataframe converter.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what
            the original zero-argument call did.

    Returns:
        argparse.Namespace with the attributes ``input``, ``output``,
        ``imgdir``, ``n_pool`` and ``nrows``.

    Raises:
        SystemExit: raised by argparse when a required option is missing
            or a value cannot be converted to ``int``.
    """
    parser = argparse.ArgumentParser()
    # The three paths are consumed unconditionally by main()/create_record().
    # Requiring them up front turns a late failure (e.g. open(None) after
    # every image has already been processed) into an immediate usage error.
    parser.add_argument(
        '--input',
        required=True,
        help='provided by kaggle, stage_1_train.csv for stage1',
    )
    parser.add_argument(
        '--output',
        required=True,
        help='path of the pickle file the dataframe is written to',
    )
    parser.add_argument(
        '--imgdir',
        required=True,
        help='directory containing the <ID>.dcm files',
    )
    parser.add_argument(
        '--n-pool',
        default=6,
        type=int,
        help='number of worker processes',
    )
    parser.add_argument(
        '--nrows',
        default=None,
        type=int,
        help='read only the first NROWS rows of --input',
    )
    return parser.parse_args(argv)
def group_labels_by_id(df):
    """Collect, per image ID, the label names whose ``Label`` value is 1.

    Args:
        df: table whose rows expose ``ID`` and ``Label`` attributes via
            ``itertuples()``. Each ``ID`` must split on ``'_'`` into exactly
            three parts: ``<prefix>_<id>_<label>``.

    Returns:
        dict mapping ``'<prefix>_<id>'`` to the list of label names flagged
        with ``Label == 1``. An image with no flagged label still gets an
        entry, holding an empty list.

    Raises:
        ValueError: if an ``ID`` does not split into exactly three parts.
    """
    grouped = {}
    for entry in tqdm(df.itertuples(), total=len(df)):
        prefix, code, label_name = entry.ID.split('_')
        image_id = '%s_%s' % (prefix, code)
        # setdefault registers the image even when none of its rows is positive.
        positives = grouped.setdefault(image_id, [])
        if entry.Label == 1:
            positives.append(label_name)
    return grouped
def remove_corrupted_images(ids, corrupted_ids=('ID_6431af929',)):
    """Return a copy of *ids* without the image IDs listed as corrupted.

    Args:
        ids: dict keyed by image ID. It is not modified; a shallow copy is
            filtered and returned.
        corrupted_ids: iterable of image IDs to drop. Defaults to the single
            ID that used to be hard-coded in this function.

    Returns:
        Shallow copy of *ids* with every ID from *corrupted_ids* removed.
        One line is printed per ID, saying whether it was removed or was
        not present.
    """
    remaining = ids.copy()
    for image_id in corrupted_ids:
        try:
            remaining.pop(image_id)
        except KeyError:
            print('%s not found' % image_id)
        else:
            print('removed %s' % image_id)
    return remaining
def _pixel_stats(prefix, pixels):
    """Return max/min/mean/diff of *pixels* keyed as ``<prefix>_<stat>``."""
    high = pixels.max()
    low = pixels.min()
    if isinstance(high, np.integer) and isinstance(low, np.integer):
        # Fixed-width NumPy integers wrap silently on overflow (for int16,
        # 32767 - (-32768) does not fit), so subtract as Python ints.
        spread = int(high) - int(low)
    else:
        spread = high - low
    return {
        '%s_max' % prefix: high,
        '%s_min' % prefix: low,
        '%s_mean' % prefix: pixels.mean(),
        '%s_diff' % prefix: spread,
    }


def create_record(item, dirname):
    """Build one record of header fields and pixel statistics for an image.

    Args:
        item: ``(image_id, labels)`` pair, where *labels* is a sequence of
            label-name strings.
        dirname: directory holding ``<image_id>.dcm``.

    Returns:
        dict containing ``ID``, ``labels`` (space-joined), ``n_label``,
        every entry returned by ``misc.get_dicom_raw`` for the file, and
        ``<kind>_max`` / ``_min`` / ``_mean`` / ``_diff`` for ``raw``,
        ``doctor`` and ``brain``, plus ``brain_ratio``.

    Raises:
        KeyError: if ``misc.get_dicom_raw`` does not supply one of
            ``RescaleSlope``, ``RescaleIntercept``, ``WindowCenter``,
            ``WindowWidth``, ``BitsStored`` or ``PixelRepresentation``.
    """
    image_id, labels = item
    path = '%s/%s.dcm' % (dirname, image_id)
    dicom = pydicom.dcmread(path)

    record = {
        'ID': image_id,
        'labels': ' '.join(labels),
        'n_label': len(labels),
    }
    record.update(misc.get_dicom_raw(dicom))

    raw = dicom.pixel_array
    slope = float(record['RescaleSlope'])
    intercept = float(record['RescaleIntercept'])
    center = misc.get_dicom_value(record['WindowCenter'])
    width = misc.get_dicom_value(record['WindowWidth'])
    bits = record['BitsStored']
    pixel = record['PixelRepresentation']

    image = misc.rescale_image(raw, slope, intercept, bits, pixel)
    # "doctor" uses the window stored in the file; "brain" uses a fixed
    # center of 40 and width of 80.
    doctor = misc.apply_window(image, center, width)
    brain = misc.apply_window(image, 40, 80)

    # Keys are inserted in the same order as before (raw, doctor, brain,
    # brain_ratio) so the resulting dataframe keeps its column order.
    record.update(_pixel_stats('raw', raw))
    record.update(_pixel_stats('doctor', doctor))
    record.update(_pixel_stats('brain', brain))
    record['brain_ratio'] = misc.get_windowed_ratio(image, 40, 80)
    return record
def create_df(ids, args):
    """Build the per-image dataframe using a pool of worker processes.

    Args:
        ids: dict whose items are passed one by one to ``create_record``.
        args: object providing ``n_pool`` (number of worker processes) and
            ``imgdir`` (forwarded to ``create_record`` as ``dirname``).

    Returns:
        pandas.DataFrame with one row per record, sorted by ``ID`` and with
        a fresh 0..n-1 index.
    """
    print('making records...')
    worker = functools.partial(create_record, dirname=args.imgdir)
    with Pool(args.n_pool) as pool:
        finished = pool.imap_unordered(worker, ids.items())
        # Drain the iterator while the pool is still alive; tqdm only
        # reports progress.
        records = list(tqdm(iterable=finished, total=len(ids)))

    # Results arrive in completion order, so sort to get a stable layout.
    frame = pd.DataFrame(records)
    frame = frame.sort_values('ID')
    return frame.reset_index(drop=True)
def main():
    """Run the conversion: label CSV in, pickled dataframe out.

    Reads ``--input`` (limited to ``--nrows`` rows when given), groups the
    labels per image, drops the IDs listed as corrupted, builds one record
    per image and pickles the resulting dataframe to ``--output``.
    """
    args = get_args()

    label_rows = pd.read_csv(args.input, nrows=args.nrows)
    print('read %s (%d records)' % (args.input, len(label_rows)))

    labels_by_id = remove_corrupted_images(group_labels_by_id(label_rows))
    frame = create_df(labels_by_id, args)

    with open(args.output, 'wb') as sink:
        pickle.dump(frame, sink)

    print('converted dicom to dataframe (%d records)' % len(frame))
    print('saved to %s' % args.output)
# Script entry point: print the raw argument vector, then run the conversion.
# NOTE(review): this module uses a relative import (`from ..utils import misc`),
# so it presumably has to be launched as a module inside its package
# (e.g. with `python -m`) rather than as a bare file path; confirm.
if __name__ == '__main__':
    print(sys.argv)
    main()