From 57e43c45b70fc3c608ac7671f01b18c36d7883b1 Mon Sep 17 00:00:00 2001
From: Sam Freesun Friedman
Date: Tue, 5 Dec 2023 16:27:24 -0500
Subject: [PATCH 01/20] add pancreas

---
 ml4h/tensorize/tensor_writer_ukbb.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index a352022e7..f47aee7b8 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -62,6 +62,7 @@
     'shmolli_192i_b7_sax_b7s_sax_b7s_sax_b7s_t1map',
 ]

+MRI_PANCREAS_SERIES = ['shmolli_192i_pancreas_t1map']
 MRI_CARDIAC_SERIES_SEGMENTED = [series+'_segmented' for series in MRI_CARDIAC_SERIES]
 MRI_BRAIN_SERIES = ['t1_p2_1mm_fov256_sag_ti_880', 't2_flair_sag_p2_1mm_fs_ellip_pf78']
 MRI_NIFTI_FIELD_ID_TO_ROOT = {'20251': 'SWI', '20252': 'T1', '20253': 'T2_FLAIR'}
@@ -455,6 +456,8 @@ def _write_tensors_from_dicoms(
             mri_group = 'ukb_liver_mri'
         elif v in MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED:
             mri_group = 'ukb_cardiac_mri'
+        elif v in MRI_PANCREAS_SERIES:
+            mri_group = 'ukb_pancreas_mri'
         else:
             mri_group = 'ukb_mri'

From 943909ccb9bd498a5cdbf0e450174122b4359c06 Mon Sep 17 00:00:00 2001
From: Sam Freesun Friedman
Date: Tue, 5 Dec 2023 16:29:44 -0500
Subject: [PATCH 02/20] add pancreas

---
 ml4h/tensorize/tensor_writer_ukbb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index f47aee7b8..100d5fc8c 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -72,7 +72,7 @@
 DICOM_MRI_FIELDS = [
     '20209', '20208', '20210', '20212', '20213', '20214', '20204', '20203', '20254', '20216', '20220', '20218',
-    '20227', '20225', '20217', '20158',
+    '20227', '20225', '20217', '20158', '20259'
 ]

 DXA_FIELD = '20158'

From 804015b88353e4133afbe38ec5b698f983243799 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 6 Dec 2023 10:32:02 -0500
Subject: [PATCH 03/20] use pancreas for pngs

---
 ml4h/tensorize/tensor_writer_ukbb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index 100d5fc8c..c4009cd3b 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -72,7 +72,7 @@
 DICOM_MRI_FIELDS = [
     '20209', '20208', '20210', '20212', '20213', '20214', '20204', '20203', '20254', '20216', '20220', '20218',
-    '20227', '20225', '20217', '20158', '20259'
+    '20227', '20225', '20217', '20158', '20259',
 ]

 DXA_FIELD = '20158'
@@ -174,7 +174,7 @@ def write_tensors_from_dicom_pngs(
     tensors, png_path, manifest_tsv, series, min_sample_id, max_sample_id, x=256, y=256,
     sample_header='sample_id', dicom_header='dicom_file',
     instance_header='instance_number', png_postfix='.png.mask.png',
-    path_prefix='ukb_cardiac_mri',
+    path_prefix='ukb_pancreas_mri',
 ):

From 91810999f1909838f5efc1c891cb529b93301607 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 6 Dec 2023 10:51:07 -0500
Subject: [PATCH 04/20] WIP

---
 ml4h/tensorize/tensor_writer_ukbb.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index c4009cd3b..fd6fac806 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -172,7 +172,7 @@ def write_tensors(
 def write_tensors_from_dicom_pngs(
     tensors, png_path, manifest_tsv, series, min_sample_id, max_sample_id, x=256, y=256,
-    sample_header='sample_id', dicom_header='dicom_file',
+    sample_header='', dicom_header='SOP Instance UID',
     instance_header='instance_number', png_postfix='.png.mask.png',
     path_prefix='ukb_pancreas_mri',
 ):
     stats = Counter()
     reader = csv.reader(open(manifest_tsv), delimiter='\t')
     header = next(reader)
     logging.info(f"DICOM Manifest Header is:{header}")
-    instance_index = header.index(instance_header)
+    # instance_index = header.index(instance_header)
     sample_index = header.index(sample_header)
     dicom_index = header.index(dicom_header)
     for row in reader:
-        sample_id = row[sample_index]
+        sample_id = row[sample_index].split('/')[8].split('_')[0]
         if not min_sample_id <= int(sample_id) < max_sample_id:
             continue
         stats[sample_header + '_' + sample_id] += 1
-        dicom_file = row[dicom_index]
+        if 'train' in png_path:
+            dicom_file = row[dicom_index] + '.dcm'
+        elif 'valid' in png_path:
+            search_file = os.path.join(png_path, f'*_{sample_id}_*')
+            dicom_file = glob.glob(search_file)
+            if len(dicom_file) > 0:
+                assert (len(dicom_file) == 1)
+                dicom_file = dicom_file[0].split('.')[0]
+            else:
+                dicom_file = search_file
         try:
             png = imageio.imread(os.path.join(png_path, dicom_file + png_postfix))
             full_tensor = np.zeros((x, y), dtype=np.float32)
         if not os.path.exists(os.path.dirname(tensor_file)):
             os.makedirs(os.path.dirname(tensor_file))
         with h5py.File(tensor_file, 'a') as hd5:
-            tensor_name = series + '_annotated_' + row[instance_index]
+            tensor_name = series + '_annotated_' + '2' #row[instance_index]
             tp = tensor_path(path_prefix, tensor_name)
             if tp in hd5:
                 tensor = first_dataset_at_path(hd5, tp)

From 8753eceaec280094914ee7c06c3eed9cf65a8bb7 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Thu, 7 Dec 2023 09:09:00 -0500
Subject: [PATCH 05/20] New manifest file

---
 ml4h/tensorize/tensor_writer_ukbb.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index fd6fac806..fef207b2e 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -172,24 +172,24 @@ def write_tensors(
 def write_tensors_from_dicom_pngs(
     tensors, png_path, manifest_tsv, series, min_sample_id, max_sample_id, x=256, y=256,
-    sample_header='', dicom_header='SOP Instance UID',
-    instance_header='instance_number', png_postfix='.png.mask.png',
+    sample_header='sample_id', dicom_header='dicom_file',
+    instance_header='instance', png_postfix='.png.mask.png',
     path_prefix='ukb_pancreas_mri',
 ):
     stats = Counter()
-    reader = csv.reader(open(manifest_tsv), delimiter='\t')
+    reader = csv.reader(open(manifest_tsv), delimiter=' ')
     header = next(reader)
     logging.info(f"DICOM Manifest Header is:{header}")
-    # instance_index = header.index(instance_header)
+    instance_index = header.index(instance_header)
     sample_index = header.index(sample_header)
     dicom_index = header.index(dicom_header)
     for row in reader:
-        sample_id = row[sample_index].split('/')[8].split('_')[0]
+        sample_id = row[sample_index]
         if not min_sample_id <= int(sample_id) < max_sample_id:
             continue
         stats[sample_header + '_' + sample_id] += 1
         if 'train' in png_path:
-            dicom_file = row[dicom_index] + '.dcm'
+            dicom_file = row[dicom_index]
         elif 'valid' in png_path:
             search_file = os.path.join(png_path, f'*_{sample_id}_*')
             dicom_file = glob.glob(search_file)
             if len(dicom_file) > 0:
                 assert (len(dicom_file) == 1)
                 dicom_file = dicom_file[0].split('.')[0]
             else:
                 dicom_file = search_file
         try:
             png = imageio.imread(os.path.join(png_path, dicom_file + png_postfix))
@@ -206,7 +206,7 @@ def write_tensors_from_dicom_pngs(
         if not os.path.exists(os.path.dirname(tensor_file)):
             os.makedirs(os.path.dirname(tensor_file))
         with h5py.File(tensor_file, 'a') as hd5:
-            tensor_name = series + '_annotated_' + '2' #row[instance_index]
+            tensor_name = series + '_annotated_' + row[instance_index]
             tp = tensor_path(path_prefix, tensor_name)
             if tp in hd5:
                 tensor = first_dataset_at_path(hd5, tp)

From b9321cb1d888f74d50e5efc67b342273739681f1 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 13 Dec 2023 13:36:16 -0500
Subject: [PATCH 06/20] Fix view creation

---
 ml4h/tensorize/tensor_writer_ukbb.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index fef207b2e..a3c5b03ec 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -443,7 +443,7 @@ def _write_tensors_from_dicoms(
             if series + '_12bit' in MRI_LIVER_SERIES_12BIT and d.LargestImagePixelValue > 2048:
                 views[series + '_12bit'].append(d)
                 stats[series + '_12bit'] += 1
-            elif series in MRI_LIVER_SERIES + MRI_CARDIAC_SERIES + MRI_BRAIN_SERIES:
+            elif series in MRI_LIVER_SERIES + MRI_CARDIAC_SERIES + MRI_BRAIN_SERIES + MRI_PANCREAS_SERIES:
                 views[series].append(d)
                 stats[series] += 1
             elif series == 'dxa_images':
@@ -576,14 +576,14 @@ def _tensorize_brain_mri(slices: List[pydicom.Dataset], series: str, mri_date: d

 def _save_pixel_dimensions_if_missing(slicer, series, hd5):
-    if MRI_PIXEL_WIDTH + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if MRI_PIXEL_WIDTH + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(MRI_PIXEL_WIDTH + '_' + series, data=float(slicer.PixelSpacing[0]))
-    if MRI_PIXEL_HEIGHT + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if MRI_PIXEL_HEIGHT + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(MRI_PIXEL_HEIGHT + '_' + series, data=float(slicer.PixelSpacing[1]))


 def _save_slice_thickness_if_missing(slicer, series, hd5):
-    if MRI_SLICE_THICKNESS + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if MRI_SLICE_THICKNESS + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(MRI_SLICE_THICKNESS + '_' + series, data=float(slicer.SliceThickness))


@@ -593,9 +593,9 @@ def _save_series_orientation_and_position_if_missing(slicer, series, hd5, instan
     if instance:
         orientation_ds_name += HD5_GROUP_CHAR + instance
         position_ds_name += HD5_GROUP_CHAR + instance
-    if orientation_ds_name not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if orientation_ds_name not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(orientation_ds_name, data=[float(x) for x in slicer.ImageOrientationPatient])
-    if position_ds_name not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if position_ds_name not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(position_ds_name, data=[float(x) for x in slicer.ImagePositionPatient])

From afaab76a00c2cb8e7dd46f8c41fe754e9bc32a45 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 13 Dec 2023 19:11:06 -0500
Subject: [PATCH 07/20] Prevent tensorize from overwriting

---
 ml4h/tensorize/tensor_writer_ukbb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index a3c5b03ec..479641012 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -137,7 +137,7 @@ def write_tensors(
         if _prune_sample(sample_id, min_sample_id, max_sample_id, mri_field_ids, xml_field_ids, zip_folder, xml_folder):
             continue
         try:
-            with h5py.File(tp, 'w') as hd5:
+            with h5py.File(tp, 'a') as hd5:
                 _write_tensors_from_zipped_dicoms(write_pngs, tensors, mri_unzip, mri_field_ids, zip_folder, hd5, sample_id, stats)
                 _write_tensors_from_zipped_niftis(zip_folder, mri_field_ids, hd5, sample_id, stats)
                 _write_tensors_from_xml(xml_field_ids, xml_folder, hd5, sample_id, write_pngs, stats, continuous_stats)
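Worth pausing on patch 07: the one-character change from 'w' to 'a' is what allows pancreas tensorization to add datasets to HD5 files written by earlier runs instead of truncating them on open. A minimal sketch of the difference, using h5py directly with an illustrative file path and dataset names (not ml4h code):

    import h5py
    import numpy as np

    # 'w' truncates on open: datasets written by an earlier run are lost.
    with h5py.File('/tmp/sample.hd5', 'w') as hd5:
        hd5.create_dataset('ukb_liver_mri/example', data=np.zeros((4, 4)))

    # 'a' opens read/write and creates the file only if it is missing,
    # so a second tensorization pass can add new groups beside old ones.
    with h5py.File('/tmp/sample.hd5', 'a') as hd5:
        if 'ukb_pancreas_mri/example' not in hd5:
            hd5.create_dataset('ukb_pancreas_mri/example', data=np.ones((4, 4)))
        print(list(hd5.keys()))  # ['ukb_liver_mri', 'ukb_pancreas_mri']

The flip side is that 'a' never removes stale datasets, so a truly fresh tensorization now requires deleting the old files first.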
From 87258a3813def950fd3292646a82118ebb429ce5 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 14:02:04 +0000
Subject: [PATCH 08/20] Add tensormaps for pancreas mris

---
 ml4h/defines.py           |  5 ++-
 ml4h/tensormap/ukb/mri.py | 82 ++++++++++++++++++++++++++++++++-------
 2 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/ml4h/defines.py b/ml4h/defines.py
index eb18dde92..6081639f1 100755
--- a/ml4h/defines.py
+++ b/ml4h/defines.py
@@ -100,7 +100,10 @@ def __str__(self):
     'aortic_root': 7, 'ascending_aorta': 8, 'pulmonary_artery': 9, 'ascending_aortic_wall': 10, 'LVOT': 11,
 }
 MRI_LIVER_SEGMENTED_CHANNEL_MAP = {'background': 0, 'liver': 1, 'inferior_vena_cava': 2, 'abdominal_aorta': 3, 'body': 4}
-
+MRI_PANCREAS_SEGMENTED_CHANNEL_MAP = {
+    'background': 0, 'body': 1, 'pancreas': 2, 'liver': 3, 'stomach': 4, 'spleen': 5,
+    'kidney': 6, 'bowel': 7, 'spine': 8, 'aorta':9, 'ivc': 10,
+}
 # TODO: These values should ultimately come from the coding table
 CODING_VALUES_LESS_THAN_ONE = [-10, -1001]

diff --git a/ml4h/tensormap/ukb/mri.py b/ml4h/tensormap/ukb/mri.py
index 788b768ba..35339266e 100755
--- a/ml4h/tensormap/ukb/mri.py
+++ b/ml4h/tensormap/ukb/mri.py
@@ -20,7 +20,8 @@
     MRI_LAX_2CH_SEGMENTED_CHANNEL_MAP, MRI_SAX_SEGMENTED_CHANNEL_MAP, LAX_4CH_HEART_LABELS, LAX_4CH_MYOCARDIUM_LABELS, StorageType, LAX_3CH_HEART_LABELS, \
     LAX_2CH_HEART_LABELS
 from ml4h.tensormap.general import get_tensor_at_first_date, normalized_first_date, pad_or_crop_array_to_shape, tensor_from_hd5
-from ml4h.defines import MRI_LAX_3CH_SEGMENTED_CHANNEL_MAP, MRI_LAX_4CH_SEGMENTED_CHANNEL_MAP, MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP, MRI_AO_SEGMENTED_CHANNEL_MAP, MRI_LIVER_SEGMENTED_CHANNEL_MAP, SAX_HEART_LABELS
+from ml4h.defines import MRI_LAX_3CH_SEGMENTED_CHANNEL_MAP, MRI_LAX_4CH_SEGMENTED_CHANNEL_MAP, MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP, \
+    MRI_AO_SEGMENTED_CHANNEL_MAP, MRI_LIVER_SEGMENTED_CHANNEL_MAP, SAX_HEART_LABELS, MRI_PANCREAS_SEGMENTED_CHANNEL_MAP


 def _slice_subset_tensor(
@@ -2669,17 +2670,12 @@ def _mdrk_projection_both_views_pretrained(tm, hd5, dependents={}):
     tensor_from_file=None,
 )

-def _pad_crop_single_channel(tm, hd5, dependents={}):
-    if f'/{tm.path_prefix}/shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2' in hd5:
-        key_prefix = f'/{tm.path_prefix}/shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
-    elif f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2' in hd5:
-        key_prefix = f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
-    else:
-        raise ValueError(f'Could not find T1 Map image for tensormap: {tm.name}')
-
+def _pad_crop_single_channel(tm, hd5, key_prefix=None, dependents={}):
+    if key_prefix is None:
+        key_prefix = tm.hd5_key_guess()
     img = np.array(
         tm.hd5_first_dataset_in_group(hd5, key_prefix),
         dtype=np.float32,
     )
     img = img[...,[1]]
     return pad_or_crop_array_to_shape(
         tm.shape,
         img,
     )

+def _pad_crop_single_channel_t1map_b2(tm, hd5, dependents={}):
+    if f'/{tm.path_prefix}/shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2' in hd5:
+        key_prefix = f'/{tm.path_prefix}/shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
+    elif f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2' in hd5:
+        key_prefix = f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
+    else:
+        raise ValueError(f'Could not find T1 Map image for tensormap: {tm.name}')
+    return _pad_crop_single_channel(tm, hd5, key_prefix, dependents)
+
 t1map_b2 = TensorMap(
     'shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map',
     shape=(384, 384, 1),
     path_prefix='ukb_cardiac_mri',
     normalization=Standardize(mean=455.81, std=609.50),
-    tensor_from_file=_pad_crop_single_channel,
+    tensor_from_file=_pad_crop_single_channel_t1map_b2,
+)
+
+t1map_pancreas = TensorMap(
+    'shmolli_192i_pancreas_t1map',
+    shape=(288, 384, 1),
+    path_prefix='ukb_pancreas_mri',
+    normalization=Standardize(mean=389.49, std=658.36),
+    tensor_from_file=_pad_crop_single_channel,
 )

-def _segmented_t1map(tm, hd5, dependents={}):
+def _segmented_t1map_b2(tm, hd5, dependents={}):
     if f'{tm.path_prefix}/{tm.name}_1' in hd5:
         categorical_index_slice = get_tensor_at_first_date(hd5, tm.path_prefix, f'{tm.name}_1')
     elif f'{tm.path_prefix}/{tm.name}_2' in hd5:
@@ -2715,13 +2728,56 @@ def _segmented_t1map(tm, hd5, dependents={}):
     tensor[..., :] = pad_or_crop_array_to_shape(tensor[..., :].shape, categorical_one_hot)
     return tensor

+def _segmented_t1map_pancreas(tm, hd5, dependents={}):
+    if f'{tm.path_prefix}/{tm.name}' in hd5:
+        categorical_index_slice = get_tensor_at_first_date(hd5, tm.path_prefix, f'{tm.name}')
+    else:
+        raise ValueError(f'Could not find T1 Map segmentation for tensormap: {tm.name}')
+
+    # remove kidney label and merge body/background labels
+    orig_num_channels = len(tm.channel_map) + 3
+    categorical_one_hot = to_categorical(categorical_index_slice, orig_num_channels)
+    categorical_one_hot[..., 6] += (
+        categorical_one_hot[..., 11] +
+        categorical_one_hot[..., 12] +
+        categorical_one_hot[..., 13]
+    )
+    categorical_one_hot = np.delete(categorical_one_hot, [11, 12, 13], axis=-1)
+
+    # padding/cropping
+    tensor = np.zeros(tm.shape, dtype=np.float32)
+    tensor[..., :] = pad_or_crop_array_to_shape(tensor[..., :].shape, categorical_one_hot)
+    return tensor
+
 t1map_b2_segmentation = TensorMap(
     'b2s_t1map_kassir_annotated',
     interpretation=Interpretation.CATEGORICAL,
     shape=(384, 384, len(MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP)),
     channel_map=MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP,
     path_prefix='ukb_cardiac_mri',
-    tensor_from_file=_segmented_t1map,
+    tensor_from_file=_segmented_t1map_b2,
     loss=dice,
     metrics=['categorical_accuracy'] + per_class_dice(MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP),
 )
+
+t1map_pancreas_segmentation_cce = TensorMap(
+    'shmolli_192i_pancreas_t1map_annotated_2',
+    interpretation=Interpretation.CATEGORICAL,
+    shape=(288, 384, len(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP)),
+    channel_map=MRI_PANCREAS_SEGMENTED_CHANNEL_MAP,
+    path_prefix='ukb_pancreas_mri',
+    tensor_from_file=_segmented_t1map_pancreas,
+    loss='categorical_crossentropy',
+    metrics=['categorical_accuracy'] + per_class_dice(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP),
+)
+
+t1map_pancreas_segmentation_dice = TensorMap(
+    'shmolli_192i_pancreas_t1map_annotated_2',
+    interpretation=Interpretation.CATEGORICAL,
+    shape=(288, 384, len(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP)),
+    channel_map=MRI_PANCREAS_SEGMENTED_CHANNEL_MAP,
+    path_prefix='ukb_pancreas_mri',
+    tensor_from_file=_segmented_t1map_pancreas,
+    loss=dice,
+    metrics=['categorical_accuracy'] + per_class_dice(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP),
+)
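The new _segmented_t1map_pancreas reader above folds three extra annotation labels into an existing channel before training. A toy illustration of the same to_categorical/np.delete mechanics, with a made-up 2x3 label mask (only the array bookkeeping is shown; the label semantics belong to the repo):

    import numpy as np
    from tensorflow.keras.utils import to_categorical

    # 14 raw classes, i.e. len(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP) + 3 extras.
    mask = np.array([[0, 6, 11], [12, 13, 2]])
    one_hot = to_categorical(mask, num_classes=14)

    # Fold labels 11-13 into channel 6, then drop the now-empty channels.
    one_hot[..., 6] += one_hot[..., 11] + one_hot[..., 12] + one_hot[..., 13]
    merged = np.delete(one_hot, [11, 12, 13], axis=-1)
    assert merged.shape == (2, 3, 11)          # one channel per map entry
    assert merged.sum(axis=-1).max() == 1.0    # still a valid one-hot mask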
From e7cd9f06a0c9009b4911e0593581f5d01a18c3bb Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 15:53:47 +0000
Subject: [PATCH 09/20] Add elements of Marcus's setup - L2 weight_decay and
 cosine decay learning rate schedule

---
 ml4h/arguments.py             | 2 +-
 ml4h/models/layer_wrappers.py | 7 ++++++-
 ml4h/models/legacy_models.py  | 7 ++++++-
 ml4h/optimizers.py            | 3 +++
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/ml4h/arguments.py b/ml4h/arguments.py
index 39cdf50b3..4f5dffb3f 100755
--- a/ml4h/arguments.py
+++ b/ml4h/arguments.py
@@ -263,7 +263,7 @@ def parse_args():
     )
     parser.add_argument('--balance_csvs', default=[], nargs='*', help='Balances batches with representation from sample IDs in this list of CSVs')
     parser.add_argument('--optimizer', default='radam', type=str, help='Optimizer for model training')
-    parser.add_argument('--learning_rate_schedule', default=None, type=str, choices=['triangular', 'triangular2'], help='Adjusts learning rate during training.')
+    parser.add_argument('--learning_rate_schedule', default=None, type=str, choices=['triangular', 'triangular2', 'cosine_decay'], help='Adjusts learning rate during training.')
     parser.add_argument('--anneal_rate', default=0., type=float, help='Annealing rate in epochs of loss terms during training')
     parser.add_argument('--anneal_shift', default=0., type=float, help='Annealing offset in epochs of loss terms during training')
     parser.add_argument('--anneal_max', default=2.0, type=float, help='Annealing maximum value')
diff --git a/ml4h/models/layer_wrappers.py b/ml4h/models/layer_wrappers.py
index 7ba1187c0..083765a03 100755
--- a/ml4h/models/layer_wrappers.py
+++ b/ml4h/models/layer_wrappers.py
@@ -25,6 +25,7 @@
 from tensorflow.keras.layers import MaxPooling2D, MaxPooling3D, Average, AveragePooling1D, AveragePooling2D, AveragePooling3D, Layer
 from tensorflow.keras.layers import SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Concatenate, Add
 from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalAveragePooling2D, GlobalAveragePooling3D
+from tensorflow.keras.regularizers import L1, L2

 Tensor = tf.Tensor

@@ -52,9 +53,13 @@
     # class name -> (dimension -> class)
     'spatial_dropout': {2: SpatialDropout1D, 3: SpatialDropout2D, 4: SpatialDropout3D},
     'dropout': defaultdict(lambda _: Dropout),
+    'l1': L1,
+    'l2': L2,
 }
 DENSE_REGULARIZATION_CLASSES = {
-    'dropout': Dropout,  # TODO: add l1, l2
+    'dropout': Dropout,
+    'l1': L1,
+    'l2': L2,
 }

diff --git a/ml4h/models/legacy_models.py b/ml4h/models/legacy_models.py
index 3a9aab20c..2b975e1e3 100755
--- a/ml4h/models/legacy_models.py
+++ b/ml4h/models/legacy_models.py
@@ -30,6 +30,7 @@
 from tensorflow.keras.layers import SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Concatenate, Add
 from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalAveragePooling2D, GlobalAveragePooling3D
 from tensorflow.keras.layers.experimental.preprocessing import RandomRotation, RandomZoom, RandomContrast
+from tensorflow.keras.regularizers import L1, L2
 import tensorflow_probability as tfp

 from ml4h.metrics import get_metric_dict
@@ -79,9 +80,13 @@ class BottleneckType(Enum):
     # class name -> (dimension -> class)
     'spatial_dropout': {2: SpatialDropout1D, 3: SpatialDropout2D, 4: SpatialDropout3D},
     'dropout': defaultdict(lambda _: Dropout),
+    'l1': L1,
+    'l2': L2,
 }
 DENSE_REGULARIZATION_CLASSES = {
-    'dropout': Dropout,  # TODO: add l1, l2
+    'dropout': Dropout,
+    'l1': L1,
+    'l2': L2,
 }

diff --git a/ml4h/optimizers.py b/ml4h/optimizers.py
index c9e79d5f0..5dc130c2b 100755
--- a/ml4h/optimizers.py
+++ b/ml4h/optimizers.py
@@ -6,6 +6,7 @@
 from tensorflow.keras import backend as K
 from tensorflow.keras.models import Model
 from tensorflow_addons.optimizers import RectifiedAdam, TriangularCyclicalLearningRate, Triangular2CyclicalLearningRate
+from tensorflow.keras.optimizers.schedules import CosineDecay

 from ml4h.plots import plot_find_learning_rate
 from ml4h.tensor_generators import TensorGenerator
@@ -40,6 +41,8 @@ def _get_learning_rate_schedule(learning_rate: float, learning_rate_schedule: st
             initial_learning_rate=learning_rate / 5, maximal_learning_rate=learning_rate,
             step_size=steps_per_epoch * 5,
         )
+    if learning_rate_schedule == 'cosine_decay':
+        return CosineDecay(initial_learning_rate=learning_rate, decay_steps=steps_per_epoch)
     else:
         raise ValueError(f'Learning rate schedule "{learning_rate_schedule}" unknown.')
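For the new 'cosine_decay' choice, Keras' CosineDecay anneals the learning rate from its initial value toward zero over decay_steps optimizer steps and then holds it (alpha defaults to 0.0); wired with decay_steps=steps_per_epoch as above, the rate bottoms out after the first epoch. A small sketch with an illustrative steps_per_epoch:

    from tensorflow.keras.optimizers.schedules import CosineDecay

    steps_per_epoch = 100  # illustrative value
    schedule = CosineDecay(initial_learning_rate=1e-3, decay_steps=steps_per_epoch)

    for step in (0, 25, 50, 75, 100, 200):
        # 1e-3 * 0.5 * (1 + cos(pi * step / 100)) until step 100, then 0
        print(step, float(schedule(step)))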
From 30507295d4f8319bfe1af8c927aa5a6b7249a6cd Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 18:38:29 +0000
Subject: [PATCH 10/20] FIX: Fix bug when a generator has 0 ids

---
 ml4h/tensor_generators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml4h/tensor_generators.py b/ml4h/tensor_generators.py
index 7e698ddf8..f88617724 100755
--- a/ml4h/tensor_generators.py
+++ b/ml4h/tensor_generators.py
@@ -95,7 +95,7 @@ def __init__(
         :param paths: If weights is provided, paths should be a list of path lists the same length as weights
         """
         self.augment = augment
-        self.paths = sum(paths) if isinstance(paths[0], list) else paths
+        self.paths = sum(paths) if (len(paths) > 0 and isinstance(paths[0], list)) else paths
         self.run_on_main_thread = num_workers == 0
         self.q = None
         self.stats_q = None

From 4a951e92bdfb3a7528a3832b2c770735c513c012 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 18:38:56 +0000
Subject: [PATCH 11/20] FIX: Fix bug when key_prefix is not given

---
 ml4h/tensormap/ukb/mri.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ml4h/tensormap/ukb/mri.py b/ml4h/tensormap/ukb/mri.py
index 35339266e..861f868ae 100755
--- a/ml4h/tensormap/ukb/mri.py
+++ b/ml4h/tensormap/ukb/mri.py
@@ -2670,7 +2670,7 @@ def _mdrk_projection_both_views_pretrained(tm, hd5, dependents={}):
     tensor_from_file=None,
 )

-def _pad_crop_single_channel(tm, hd5, key_prefix=None, dependents={}):
+def _pad_crop_single_channel(tm, hd5, dependents={}, key_prefix=None):
     if key_prefix is None:
         key_prefix = tm.hd5_key_guess()
     img = np.array(
@@ -2690,7 +2690,7 @@ def _pad_crop_single_channel_t1map_b2(tm, hd5, dependents={}):
         key_prefix = f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
     else:
         raise ValueError(f'Could not find T1 Map image for tensormap: {tm.name}')
-    return _pad_crop_single_channel(tm, hd5, key_prefix, dependents)
+    return _pad_crop_single_channel(tm, hd5, dependents, key_prefix)

 t1map_b2 = TensorMap(
     'shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map',
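Patch 11's reordering matters because the tensor_from_file hooks are called positionally, with the dependents dict in the third slot. A schematic reproduction of the bug and the fix (the caller below is illustrative, not ml4h's actual call site):

    def framework_call(f, tm, hd5):
        return f(tm, hd5, {})  # third positional argument is always dependents

    def before(tm, hd5, key_prefix=None, dependents={}):
        return key_prefix      # receives {}: the dependents dict, not a key

    def after(tm, hd5, dependents={}, key_prefix=None):
        return key_prefix      # stays None, so hd5_key_guess() takes over

    assert framework_call(before, 'tm', 'hd5') == {}
    assert framework_call(after, 'tm', 'hd5') is None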
From e6684e46ea48833b22323e67e89f98b59079deb0 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 18:49:43 +0000
Subject: [PATCH 12/20] ENH: Allow for no testing during model training

---
 ml4h/recipes.py | 62 +++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/ml4h/recipes.py b/ml4h/recipes.py
index 33ca54b7f..1058357ae 100755
--- a/ml4h/recipes.py
+++ b/ml4h/recipes.py
@@ -205,37 +205,39 @@ def train_multimodal_multitask(args):
     if merger:
         merger.save(f'{args.output_folder}{args.id}/merger.h5')

-    test_data, test_labels, test_paths = big_batch_from_minibatch_generator(generate_test, args.test_steps)
-    performance_metrics = _predict_and_evaluate(
-        model, test_data, test_labels, args.tensor_maps_in, args.tensor_maps_out, args.tensor_maps_protected,
-        args.batch_size, args.hidden_layer, os.path.join(args.output_folder, args.id + '/'), test_paths,
-        args.embed_visualization, args.alpha, args.dpi, args.plot_width, args.plot_height,
-    )
+    performance_metrics = {}
+    if args.test_steps > 0:
+        test_data, test_labels, test_paths = big_batch_from_minibatch_generator(generate_test, args.test_steps)
+        performance_metrics = _predict_and_evaluate(
+            model, test_data, test_labels, args.tensor_maps_in, args.tensor_maps_out, args.tensor_maps_protected,
+            args.batch_size, args.hidden_layer, os.path.join(args.output_folder, args.id + '/'), test_paths,
+            args.embed_visualization, args.alpha, args.dpi, args.plot_width, args.plot_height,
+        )

-    predictions_list = model.predict(test_data)
-    samples = min(args.test_steps * args.batch_size, 12)
-    out_path = os.path.join(args.output_folder, args.id, 'reconstructions/')
-    if len(args.tensor_maps_out) == 1:
-        predictions_list = [predictions_list]
-    predictions_dict = {name: pred for name, pred in zip(model.output_names, predictions_list)}
-    logging.info(f'Predictions and shapes are: {[(p, predictions_dict[p].shape) for p in predictions_dict]}')
-
-    for i, etm in enumerate(encoders):
-        embed = encoders[etm].predict(test_data[etm.input_name()])
-        if etm.output_name() in predictions_dict:
-            plot_reconstruction(etm, test_data[etm.input_name()], predictions_dict[etm.output_name()], out_path, test_paths, samples)
-        for dtm in decoders:
-            reconstruction = decoders[dtm].predict(embed)
-            logging.info(f'{dtm.name} has prediction shape: {reconstruction.shape} from embed shape: {embed.shape}')
-            my_out_path = os.path.join(out_path, f'decoding_{dtm.name}_from_{etm.name}/')
-            os.makedirs(os.path.dirname(my_out_path), exist_ok=True)
-            if dtm.axes() > 1:
-                plot_reconstruction(dtm, test_labels[dtm.output_name()], reconstruction, my_out_path, test_paths, samples)
-            else:
-                evaluate_predictions(
-                    dtm, reconstruction, test_labels[dtm.output_name()], {}, dtm.name, my_out_path,
-                    test_paths, dpi=args.dpi, width=args.plot_width, height=args.plot_height,
-                )
+        predictions_list = model.predict(test_data)
+        samples = min(args.test_steps * args.batch_size, 12)
+        out_path = os.path.join(args.output_folder, args.id, 'reconstructions/')
+        if len(args.tensor_maps_out) == 1:
+            predictions_list = [predictions_list]
+        predictions_dict = {name: pred for name, pred in zip(model.output_names, predictions_list)}
+        logging.info(f'Predictions and shapes are: {[(p, predictions_dict[p].shape) for p in predictions_dict]}')
+
+        for i, etm in enumerate(encoders):
+            embed = encoders[etm].predict(test_data[etm.input_name()])
+            if etm.output_name() in predictions_dict:
+                plot_reconstruction(etm, test_data[etm.input_name()], predictions_dict[etm.output_name()], out_path, test_paths, samples)
+            for dtm in decoders:
+                reconstruction = decoders[dtm].predict(embed)
+                logging.info(f'{dtm.name} has prediction shape: {reconstruction.shape} from embed shape: {embed.shape}')
+                my_out_path = os.path.join(out_path, f'decoding_{dtm.name}_from_{etm.name}/')
+                os.makedirs(os.path.dirname(my_out_path), exist_ok=True)
+                if dtm.axes() > 1:
+                    plot_reconstruction(dtm, test_labels[dtm.output_name()], reconstruction, my_out_path, test_paths, samples)
+                else:
+                    evaluate_predictions(
+                        dtm, reconstruction, test_labels[dtm.output_name()], {}, dtm.name, my_out_path,
+                        test_paths, dpi=args.dpi, width=args.plot_width, height=args.plot_height,
+                    )

     return performance_metrics

From 78839dda42a9deeca72f603891f968950e1060a0 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 8 Dec 2023 17:10:12 +0000
Subject: [PATCH 13/20] FIX: Fix typo

---
 ml4h/explorations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml4h/explorations.py b/ml4h/explorations.py
index 5da14d5d9..612aaa0c0 100755
--- a/ml4h/explorations.py
+++ b/ml4h/explorations.py
@@ -896,7 +896,7 @@ def infer_stats_from_segmented_regions(args):
     if args.analyze_ground_truth:
         _scatter_plots_from_segmented_region_stats(
             inference_tsv_true, inference_tsv_pred, args.structures_to_analyze,
-            args.output_folder, args.id, tm_in.input_name(), args.output_name,
+            args.output_folder, args.id, tm_in.input_name(), tm_out.output_name(),
         )

 def _softmax(x):

From 6e70e83fe0cdaa4e7401f3bf7f0f199b45197def Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 8 Dec 2023 17:29:55 +0000
Subject: [PATCH 14/20] FIX: Fix parser for boolean arguments

---
 ml4h/arguments.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ml4h/arguments.py b/ml4h/arguments.py
index 4f5dffb3f..ef40373e5 100755
--- a/ml4h/arguments.py
+++ b/ml4h/arguments.py
@@ -385,7 +385,9 @@ def parse_args():
     )

     # Arguments for explorations/infer_stats_from_segmented_regions
-    parser.add_argument('--analyze_ground_truth', default=True, help='Whether or not to filter by images with ground truth segmentations, for comparison')
+    parser.add_argument('--analyze_ground_truth', action='store_true', help='Filter by images with ground truth segmentations, for comparison')
+    parser.add_argument('--no_analyze_ground_truth', dest='analyze_ground_truth', action='store_false', help='Do not filter by images with ground truth segmentations, for comparison')
+    parser.set_defaults(analyze_ground_truth=True)
     parser.add_argument('--structures_to_analyze', nargs='*', default=[], help='Structure names to include in the .tsv files and scatter plots')
     parser.add_argument('--erosion_radius', default=1, type=int, help='Radius of the unit disk structuring element for erosion preprocessing')
     parser.add_argument('--intensity_thresh', type=float, help='Threshold value for preprocessing')
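The parser fix in patch 14 is needed because argparse treats default=True with no action or type as a plain string option: any value supplied on the command line arrives as a non-empty, hence truthy, string, so the flag could never actually be turned off. The paired-flag pattern behaves like this sketch:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--analyze_ground_truth', action='store_true')
    parser.add_argument('--no_analyze_ground_truth', dest='analyze_ground_truth', action='store_false')
    parser.set_defaults(analyze_ground_truth=True)

    assert parser.parse_args([]).analyze_ground_truth is True
    assert parser.parse_args(['--no_analyze_ground_truth']).analyze_ground_truth is False

On Python 3.9+, action=argparse.BooleanOptionalAction gives the same pair of flags in a single declaration.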
From 4b8654377e4dde3eed63dfaef69d22559fa2c7b2 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Tue, 19 Dec 2023 20:13:40 -0500
Subject: [PATCH 15/20] STYLE: Remove unneeded comment

---
 ml4h/explorations.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ml4h/explorations.py b/ml4h/explorations.py
index 612aaa0c0..1787014da 100755
--- a/ml4h/explorations.py
+++ b/ml4h/explorations.py
@@ -798,7 +798,6 @@ def infer_stats_from_segmented_regions(args):
     assert(tm_in.shape[-1] == 1, 'no support here for stats on multiple input channels')

     # don't filter datasets for ground truth segmentations if we want to run inference on everything
-    # TODO HELP - this isn't giving me all 56K anymore
     if not args.analyze_ground_truth:
         args.output_tensors = []
         args.tensor_maps_out = []

From 05c4f77b410f2efe7f9d3dcda5939de36dc7b857 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 20 Dec 2023 02:19:59 +0000
Subject: [PATCH 16/20] WIP

---
 ml4h/arguments.py    |  2 ++
 ml4h/explorations.py | 38 +++++++++++++++++++++++++++-----------
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/ml4h/arguments.py b/ml4h/arguments.py
index ef40373e5..2eae88e5c 100755
--- a/ml4h/arguments.py
+++ b/ml4h/arguments.py
@@ -391,6 +391,8 @@ def parse_args():
     parser.add_argument('--structures_to_analyze', nargs='*', default=[], help='Structure names to include in the .tsv files and scatter plots')
     parser.add_argument('--erosion_radius', default=1, type=int, help='Radius of the unit disk structuring element for erosion preprocessing')
     parser.add_argument('--intensity_thresh', type=float, help='Threshold value for preprocessing')
+    parser.add_argument('--intensity_thresh_perc', type=float, help='Threshold percentile for preprocessing, between 0 and 100 inclusive')
+    parser.add_argument('--intensity_thresh_k_means', nargs='*', default=[], type=int, help='Preprocessing using k-means specified as two numbers, the first is the number of clusters and the second is the cluster index to keep')
     parser.add_argument('--intensity_thresh_in_structures', nargs='*', default=[], help='Structure names whose pixels should be replaced if the images has intensity above the threshold')
     parser.add_argument('--intensity_thresh_out_structure', help='Replacement structure name')

diff --git a/ml4h/explorations.py b/ml4h/explorations.py
index 1787014da..662679990 100755
--- a/ml4h/explorations.py
+++ b/ml4h/explorations.py
@@ -20,6 +20,7 @@
 import pandas as pd
 import multiprocessing as mp
 from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans

 from tensorflow.keras.models import Model

@@ -719,13 +720,26 @@ def _get_csv_row(sample_id, means, medians, stds, date):
     csv_row = [sample_id] + res[0].astype('str').tolist() + [date]
     return csv_row

-def _thresh_labels_above(y, img, intensity_thresh, in_labels, out_label, nb_orig_channels):
+def _thresh_labels_above(y, img, intensity_thresh, intensity_thresh_perc, in_labels, out_label, nb_orig_channels):
     y = np.argmax(y, axis=-1)[..., np.newaxis]
-    y[np.logical_and(img >= intensity_thresh, np.isin(y, in_labels))] = out_label
+    if intensity_thresh:
+        img_intensity_thresh = intensity_thresh
+    elif intensity_thresh_perc:
+        img_intensity_thresh = np.percentile(img, intensity_thresh_perc)
+    y[np.logical_and(img >= img_intensity_thresh, np.isin(y, in_labels))] = out_label
     y = y[..., 0]
     y = _to_categorical(y, nb_orig_channels)
     return y

+def _intensity_thresh_k_means(y, img, intensity_thresh_k_means):
+    X = img[y==1][...,np.newaxis]
+    if X.size > 1:
+        kmeans = KMeans(n_clusters=intensity_thresh_k_means[0], random_state=0, n_init="auto").fit(X)
+        labels = kmeans.predict(img.flatten()[...,np.newaxis])
+        labels = np.reshape(labels, img.shape)
+        y[np.logical_and(labels==intensity_thresh_k_means[1], y==1)] = 0
+    return y
+
 def _scatter_plots_from_segmented_region_stats(
     inference_tsv_true, inference_tsv_pred, structures_to_analyze,
     output_folder, id, input_name, output_name,
@@ -759,13 +773,9 @@ def _scatter_plots_from_segmented_region_stats(
             title = col.replace('_', ' ')
             ax.set_xlabel(f'{title} T1 Time (ms) - Manual Segmentation')
             ax.set_ylabel(f'{title} T1 Time (ms) - Model Segmentation')
-            if i == 'all':
-                min_value = -50
-                max_value = 1300
-            elif i == 'filter_outliers':
-                min_value, max_value = plot_data.min(), plot_data.max()
-                min_value = min([min_value['true'], min_value['pred']]) - 100
-                max_value = min([max_value['true'], max_value['pred']]) + 100
+            min_value, max_value = plot_data.min(), plot_data.max()
+            min_value = min([min_value['true'], min_value['pred']]) - 100
+            max_value = min([max_value['true'], max_value['pred']]) + 100
             ax.set_xlim([min_value, max_value])
             ax.set_ylim([min_value, max_value])
             res = stats.pearsonr(plot_data['true'], plot_data['pred'])
@@ -819,6 +829,8 @@ def infer_stats_from_segmented_regions(args):
     # Setup for intensity thresholding
     do_intensity_thresh = args.intensity_thresh_in_structures and args.intensity_thresh_out_structure
     if do_intensity_thresh:
+        assert (not (args.intensity_thresh and args.intensity_thresh_perc))
+        assert (not (args.intensity_thresh_k_means and len(args.intensity_thresh_in_structures) > 1))
         intensity_thresh_in_channels = [tm_out.channel_map[k] for k in args.intensity_thresh_in_structures]
         intensity_thresh_out_channel = tm_out.channel_map[args.intensity_thresh_out_structure]
@@ -869,19 +881,23 @@ def infer_stats_from_segmented_regions(args):

         if args.analyze_ground_truth:
             if do_intensity_thresh:
-                y_true = _thresh_labels_above(y_true, img, args.intensity_thresh, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels)
+                y_true = _thresh_labels_above(y_true, img, args.intensity_thresh, args.intensity_thresh_perc, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels)
                 y_true = np.delete(y_true, bad_channels, axis=-1)
             if args.erosion_radius > 0:
                 y_true = binary_erosion(y_true, structure).astype(y_true.dtype)
+            if args.intensity_thresh_k_means:
+                y_true = _intensity_thresh_k_means(y_true, img, args.intensity_thresh_k_means)
             means_true, medians_true, stds_true = _compute_masked_stats(rescaled_img, y_true, nb_good_channels)
             csv_row_true = _get_csv_row(sample_id, means_true, medians_true, stds_true, date)
             inference_writer_true.writerow(csv_row_true)

         if do_intensity_thresh:
-            y_pred = _thresh_labels_above(y_pred, img, args.intensity_thresh, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels)
+            y_pred = _thresh_labels_above(y_pred, img, args.intensity_thresh, args.intensity_thresh_perc, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels)
             y_pred = np.delete(y_pred, bad_channels, axis=-1)
         if args.erosion_radius > 0:
             y_pred = binary_erosion(y_pred, structure).astype(y_pred.dtype)
+        if args.intensity_thresh_k_means:
+            y_pred = _intensity_thresh_k_means(y_pred, img, args.intensity_thresh_k_means)
         means_pred, medians_pred, stds_pred = _compute_masked_stats(rescaled_img, y_pred, nb_good_channels)
         csv_row_pred = _get_csv_row(sample_id, means_pred, medians_pred, stds_pred, date)
         inference_writer_pred.writerow(csv_row_pred)
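The k-means option added in patch 16 clusters the intensities that fall inside a predicted mask, then zeroes out mask pixels assigned to the chosen cluster index, so --intensity_thresh_k_means 2 1 fits two clusters and drops pixels landing in cluster 1. A standalone sketch on synthetic data (scikit-learn's cluster indices are arbitrary, so the right index has to be picked per use case):

    import numpy as np
    from sklearn.cluster import KMeans

    rng = np.random.default_rng(0)
    img = np.concatenate([rng.normal(200, 10, 50), rng.normal(800, 10, 50)]).reshape(10, 10)
    y = np.ones((10, 10), dtype=np.uint8)  # toy mask covering the whole image

    X = img[y == 1][..., np.newaxis]
    kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(X)
    labels = np.reshape(kmeans.predict(img.flatten()[..., np.newaxis]), img.shape)
    y[np.logical_and(labels == 1, y == 1)] = 0  # remove one intensity cluster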
From ca2e89b488a9c981042b34e3286c79847ad2229a Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 10 Jan 2024 21:00:25 +0000
Subject: [PATCH 17/20] FIX error (which kills a thread and prevents subsequent
 pngs from being written) if the image size is wrong

---
 ml4h/tensorize/tensor_writer_ukbb.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index 479641012..acf97b7bc 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -219,6 +219,9 @@ def write_tensors_from_dicom_pngs(
         except FileNotFoundError:
             logging.warning(f'Could not find file: {os.path.join(png_path, dicom_file + png_postfix)}')
             stats['File not found error'] += 1
+        except ValueError:
+            logging.warning(f'Could not convert file: {os.path.join(png_path, dicom_file + png_postfix)}')
+            stats['Value error'] += 1
     for k in stats:
         if sample_header in k and stats[k] == 50:
             continue

From 05c5975875d18783a8e8347e2a71d5bfbbb5ad49 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 10 Jan 2024 21:43:23 +0000
Subject: [PATCH 18/20] Tensorize can create empty tensors if there are no good
 series, making you think it's working when it isn't. At least give a warning

---
 ml4h/tensorize/tensor_writer_ukbb.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index acf97b7bc..c4bb29418 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -454,6 +454,8 @@ def _write_tensors_from_dicoms(
                 dxa_number = dicom.split('.')[-4]
                 name = f'dxa_{series_num}_{dxa_number}'
                 create_tensor_in_hd5(hd5, f'ukb_dxa/', name, d.pixel_array, stats)
+            else:
+                stats[f'Could not process series {series}'] += 1

         if series in MRI_LIVER_IDEAL_PROTOCOL:
             min_ideal_series = min(min_ideal_series, int(d.SeriesNumber))

From 8eaa344bafa36d9ce70710d13fbc9771e87ce5be Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Thu, 11 Jan 2024 10:15:33 -0500
Subject: [PATCH 19/20] Don't commit code to interpret this specific
 manifest_tsv file

---
 ml4h/tensorize/tensor_writer_ukbb.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index c4bb29418..fc56b3f71 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -173,11 +173,11 @@ def write_tensors(
 def write_tensors_from_dicom_pngs(
     tensors, png_path, manifest_tsv, series, min_sample_id, max_sample_id, x=256, y=256,
     sample_header='sample_id', dicom_header='dicom_file',
-    instance_header='instance', png_postfix='.png.mask.png',
-    path_prefix='ukb_pancreas_mri',
+    instance_header='instance_number', png_postfix='.png.mask.png',
+    path_prefix='ukb_cardiac_mri',
 ):
     stats = Counter()
-    reader = csv.reader(open(manifest_tsv), delimiter=' ')
+    reader = csv.reader(open(manifest_tsv), delimiter='\t')
     header = next(reader)
     logging.info(f"DICOM Manifest Header is:{header}")
     instance_index = header.index(instance_header)
     sample_index = header.index(sample_header)
     dicom_index = header.index(dicom_header)
     for row in reader:
         sample_id = row[sample_index]
         if not min_sample_id <= int(sample_id) < max_sample_id:
             continue
         stats[sample_header + '_' + sample_id] += 1
-        if 'train' in png_path:
-            dicom_file = row[dicom_index]
-        elif 'valid' in png_path:
-            search_file = os.path.join(png_path, f'*_{sample_id}_*')
-            dicom_file = glob.glob(search_file)
-            if len(dicom_file) > 0:
-                assert (len(dicom_file) == 1)
-                dicom_file = dicom_file[0].split('.')[0]
-            else:
-                dicom_file = search_file
+        dicom_file = row[dicom_index]
         try:
             png = imageio.imread(os.path.join(png_path, dicom_file + png_postfix))
             full_tensor = np.zeros((x, y), dtype=np.float32)

From fa9303b0bd28ef36d229fc58448fa24efcdbd4f6 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 12 Jan 2024 11:11:36 -0500
Subject: [PATCH 20/20] STYLE: Rename intensity_thresh_perc ->
 intensity_thresh_percentile

---
 ml4h/arguments.py    |  2 +-
 ml4h/explorations.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/ml4h/arguments.py b/ml4h/arguments.py
index 2eae88e5c..88cca51e7 100755
--- a/ml4h/arguments.py
+++ b/ml4h/arguments.py
@@ -391,7 +391,7 @@ def parse_args():
     parser.add_argument('--structures_to_analyze', nargs='*', default=[], help='Structure names to include in the .tsv files and scatter plots')
     parser.add_argument('--erosion_radius', default=1, type=int, help='Radius of the unit disk structuring element for erosion preprocessing')
     parser.add_argument('--intensity_thresh', type=float, help='Threshold value for preprocessing')
-    parser.add_argument('--intensity_thresh_perc', type=float, help='Threshold percentile for preprocessing, between 0 and 100 inclusive')
+    parser.add_argument('--intensity_thresh_percentile', type=float, help='Threshold percentile for preprocessing, between 0 and 100 inclusive')
     parser.add_argument('--intensity_thresh_k_means', nargs='*', default=[], type=int, help='Preprocessing using k-means specified as two numbers, the first is the number of clusters and the second is the cluster index to keep')
     parser.add_argument('--intensity_thresh_in_structures', nargs='*', default=[], help='Structure names whose pixels should be replaced if the images has intensity above the threshold')
     parser.add_argument('--intensity_thresh_out_structure', help='Replacement structure name')
diff --git a/ml4h/explorations.py b/ml4h/explorations.py
index 662679990..924a1e5b5 100755
--- a/ml4h/explorations.py
+++ b/ml4h/explorations.py
@@ -720,12 +720,12 @@ def _get_csv_row(sample_id, means, medians, stds, date):
     csv_row = [sample_id] + res[0].astype('str').tolist() + [date]
     return csv_row

-def _thresh_labels_above(y, img, intensity_thresh, intensity_thresh_perc, in_labels, out_label, nb_orig_channels):
+def _thresh_labels_above(y, img, intensity_thresh, intensity_thresh_percentile, in_labels, out_label, nb_orig_channels):
     y = np.argmax(y, axis=-1)[..., np.newaxis]
     if intensity_thresh:
         img_intensity_thresh = intensity_thresh
-    elif intensity_thresh_perc:
-        img_intensity_thresh = np.percentile(img, intensity_thresh_perc)
+    elif intensity_thresh_percentile:
+        img_intensity_thresh = np.percentile(img, intensity_thresh_percentile)
     y[np.logical_and(img >= img_intensity_thresh, np.isin(y, in_labels))] = out_label
     y = y[..., 0]
     y = _to_categorical(y, nb_orig_channels)
@@ -829,7 +829,7 @@ def infer_stats_from_segmented_regions(args):
     # Setup for intensity thresholding
     do_intensity_thresh = args.intensity_thresh_in_structures and args.intensity_thresh_out_structure
     if do_intensity_thresh:
-        assert (not (args.intensity_thresh and args.intensity_thresh_perc))
+        assert (not (args.intensity_thresh and args.intensity_thresh_percentile))
         assert (not (args.intensity_thresh_k_means and len(args.intensity_thresh_in_structures) > 1))
         intensity_thresh_in_channels = [tm_out.channel_map[k] for k in args.intensity_thresh_in_structures]
         intensity_thresh_out_channel = tm_out.channel_map[args.intensity_thresh_out_structure]
@@ -881,7 +881,7 @@ def infer_stats_from_segmented_regions(args):

         if args.analyze_ground_truth:
             if do_intensity_thresh:
-                y_true = _thresh_labels_above(y_true, img, args.intensity_thresh,
args.intensity_thresh_perc, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels) + y_true = _thresh_labels_above(y_true, img, args.intensity_thresh, args.intensity_thresh_percentile, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels) y_true = np.delete(y_true, bad_channels, axis=-1) if args.erosion_radius > 0: y_true = binary_erosion(y_true, structure).astype(y_true.dtype) @@ -892,7 +892,7 @@ def infer_stats_from_segmented_regions(args): inference_writer_true.writerow(csv_row_true) if do_intensity_thresh: - y_pred = _thresh_labels_above(y_pred, img, args.intensity_thresh, args.intensity_thresh_perc, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels) + y_pred = _thresh_labels_above(y_pred, img, args.intensity_thresh, args.intensity_thresh_percentile, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels) y_pred = np.delete(y_pred, bad_channels, axis=-1) if args.erosion_radius > 0: y_pred = binary_erosion(y_pred, structure).astype(y_pred.dtype)
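After the final rename the two scalar options read consistently: --intensity_thresh is an absolute cutoff, while --intensity_thresh_percentile derives a cutoff from each image. A small sketch of the dispatch inside _thresh_labels_above (pick_threshold is a hypothetical helper, not repo code):

    import numpy as np

    def pick_threshold(img, intensity_thresh=None, intensity_thresh_percentile=None):
        # An absolute cutoff wins; otherwise fall back to a per-image percentile.
        if intensity_thresh:
            return intensity_thresh
        if intensity_thresh_percentile:
            return np.percentile(img, intensity_thresh_percentile)
        raise ValueError('set one of the two thresholds')

    img = np.arange(100, dtype=np.float32).reshape(10, 10)
    assert pick_threshold(img, intensity_thresh=42.0) == 42.0
    assert np.isclose(pick_threshold(img, intensity_thresh_percentile=90), 89.1)

Note that, as in the patch, truthiness is used rather than an explicit None check, so a threshold of exactly 0 would be silently skipped.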