generate_patches.py
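"""Export image patches from whole-slide images as individual PNG files.

Loads patient outcomes and per-patient creatinine measurements, filters them
down to patients with an SVS slide, builds a PatientDataset over the slides,
and writes each patch to <output_path>/real/<i>.png in parallel.
"""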
import argparse
import os
import re
from glob import glob

import pandas as pd
import torchvision.transforms as T
from joblib import Parallel, delayed
from tqdm import tqdm

from patient_dataset import PatientDataset

def save_file(args, dataset, i):
    # Save the i-th patch as a PNG, ignoring the other values returned by the dataset
    patch, _, _ = dataset[i]
    T.ToPILImage()(patch).save(f'{args.output_path}/real/{i}.png')

def main():
    args = parse_args()

    # Load the patient outcomes
    patient_outcomes = pd.read_excel(f'{args.data_path}/outcomes.xlsx', 'Sheet1')

    # Filter out any patients that don't have an SVS file
    slide_ids = [re.sub(r'\.svs$', '', os.path.basename(slide)) for slide in glob(f'{args.data_path}/svs/*.svs')]
    patient_outcomes = patient_outcomes[patient_outcomes['slide_UUID'].isin(slide_ids)]

    # Load all patient creatinine files, keyed by patient ID (the file name without its extension)
    creatinine_files = glob(f'{args.data_path}/creatinine/*.xlsx')
    patient_creatinine = {}
    for file in creatinine_files:
        df = pd.read_excel(file, 'Sheet1')
        patient_id = re.sub(r'\.xlsx$', '', os.path.basename(file))
        patient_creatinine[patient_id] = df

    # Filter out any creatinine files that don't have a matching outcome
    patient_creatinine = {k: v for k, v in patient_creatinine.items() if k in patient_outcomes['patient_UUID'].values}

    print(f'Found {len(patient_outcomes)} patients with SVS files')

    # Path to the labelled data (the h5 Labelbox download)
    patient_labelled_dir = f'{args.data_path}/results.h5'

    # Initialise the PatientDataset
    dataset = PatientDataset(
        patient_outcomes,
        patient_creatinine,
        f'{args.data_path}/svs/',
        patient_labelled_dir,
        patch_size=1024,
        image_size=1024,
        annotated_dataset=args.annotated_dataset,
        transformations=False,
    )
    if args.annotated_dataset:
        print('Using ANNOTATED dataset')
    else:
        print('Using UNANNOTATED dataset')

    # Save every patch in parallel
    os.makedirs(f'{args.output_path}/real', exist_ok=True)
    Parallel(n_jobs=128)(delayed(save_file)(args, dataset, i) for i in tqdm(range(len(dataset))))

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, required=True, help='Path of the training dataset')
    parser.add_argument('--output_path', type=str, required=True, help='Path where the patches will be saved')
    parser.add_argument('--annotated_dataset', action='store_true', help='Use the annotated dataset')
    return parser.parse_args()


if __name__ == '__main__':
    main()
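
# Example invocation (the paths below are illustrative, not from the repo):
#   python generate_patches.py --data_path /data/slides \
#       --output_path ./patches --annotated_dataset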