From 57e43c45b70fc3c608ac7671f01b18c36d7883b1 Mon Sep 17 00:00:00 2001
From: Sam Freesun Friedman
Date: Tue, 5 Dec 2023 16:27:24 -0500
Subject: [PATCH 01/20] add pancreas

---
 ml4h/tensorize/tensor_writer_ukbb.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index a352022e7..f47aee7b8 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -62,6 +62,7 @@
     'shmolli_192i_b7_sax_b7s_sax_b7s_sax_b7s_t1map',
 ]

+MRI_PANCREAS_SERIES = ['shmolli_192i_pancreas_t1map']
 MRI_CARDIAC_SERIES_SEGMENTED = [series+'_segmented' for series in MRI_CARDIAC_SERIES]
 MRI_BRAIN_SERIES = ['t1_p2_1mm_fov256_sag_ti_880', 't2_flair_sag_p2_1mm_fs_ellip_pf78']
 MRI_NIFTI_FIELD_ID_TO_ROOT = {'20251': 'SWI', '20252': 'T1', '20253': 'T2_FLAIR'}
@@ -455,6 +456,8 @@ def _write_tensors_from_dicoms(
             mri_group = 'ukb_liver_mri'
         elif v in MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED:
             mri_group = 'ukb_cardiac_mri'
+        elif v in MRI_PANCREAS_SERIES:
+            mri_group = 'ukb_pancreas_mri'
         else:
             mri_group = 'ukb_mri'

From 943909ccb9bd498a5cdbf0e450174122b4359c06 Mon Sep 17 00:00:00 2001
From: Sam Freesun Friedman
Date: Tue, 5 Dec 2023 16:29:44 -0500
Subject: [PATCH 02/20] add pancreas

---
 ml4h/tensorize/tensor_writer_ukbb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index f47aee7b8..100d5fc8c 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -72,7 +72,7 @@
 DICOM_MRI_FIELDS = [
     '20209', '20208', '20210', '20212', '20213', '20214', '20204', '20203', '20254', '20216', '20220', '20218',
-    '20227', '20225', '20217', '20158',
+    '20227', '20225', '20217', '20158', '20259'
 ]

 DXA_FIELD = '20158'

From 804015b88353e4133afbe38ec5b698f983243799 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 6 Dec 2023 10:32:02 -0500
Subject: [PATCH 03/20] use pancreas for pngs

---
 ml4h/tensorize/tensor_writer_ukbb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index 100d5fc8c..c4009cd3b 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -72,7 +72,7 @@
 DICOM_MRI_FIELDS = [
     '20209', '20208', '20210', '20212', '20213', '20214', '20204', '20203', '20254', '20216', '20220', '20218',
-    '20227', '20225', '20217', '20158', '20259'
+    '20227', '20225', '20217', '20158', '20259',
 ]

 DXA_FIELD = '20158'
@@ -174,7 +174,7 @@ def write_tensors_from_dicom_pngs(
     tensors, png_path, manifest_tsv, series, min_sample_id, max_sample_id, x=256, y=256,
     sample_header='sample_id', dicom_header='dicom_file',
     instance_header='instance_number', png_postfix='.png.mask.png',
-    path_prefix='ukb_cardiac_mri',
+    path_prefix='ukb_pancreas_mri',
 ):

From 91810999f1909838f5efc1c891cb529b93301607 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 6 Dec 2023 10:51:07 -0500
Subject: [PATCH 04/20] WIP

---
 ml4h/tensorize/tensor_writer_ukbb.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index c4009cd3b..fd6fac806 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -172,7 +172,7 @@ def write_tensors(
 def write_tensors_from_dicom_pngs(
     tensors, png_path, manifest_tsv, series, min_sample_id, max_sample_id, x=256, y=256,
-    sample_header='sample_id', dicom_header='dicom_file',
+    sample_header='', dicom_header='SOP Instance UID',
     instance_header='instance_number', png_postfix='.png.mask.png',
     path_prefix='ukb_pancreas_mri',
 ):
     stats = Counter()
     reader = csv.reader(open(manifest_tsv), delimiter='\t')
     header = next(reader)
     logging.info(f"DICOM Manifest Header is:{header}")
-    instance_index = header.index(instance_header)
+    # instance_index = header.index(instance_header)
     sample_index = header.index(sample_header)
     dicom_index = header.index(dicom_header)
     for row in reader:
-        sample_id = row[sample_index]
+        sample_id = row[sample_index].split('/')[8].split('_')[0]
         if not min_sample_id <= int(sample_id) < max_sample_id:
             continue
         stats[sample_header + '_' + sample_id] += 1
-        dicom_file = row[dicom_index]
+        if 'train' in png_path:
+            dicom_file = row[dicom_index] + '.dcm'
+        elif 'valid' in png_path:
+            search_file = os.path.join(png_path, f'*_{sample_id}_*')
+            dicom_file = glob.glob(search_file)
+            if len(dicom_file) > 0:
+                assert (len(dicom_file) == 1)
+                dicom_file = dicom_file[0].split('.')[0]
+            else:
+                dicom_file = search_file
         try:
             png = imageio.imread(os.path.join(png_path, dicom_file + png_postfix))
             full_tensor = np.zeros((x, y), dtype=np.float32)
         if not os.path.exists(os.path.dirname(tensor_file)):
             os.makedirs(os.path.dirname(tensor_file))
         with h5py.File(tensor_file, 'a') as hd5:
-            tensor_name = series + '_annotated_' + row[instance_index]
+            tensor_name = series + '_annotated_' + '2' #row[instance_index]
             tp = tensor_path(path_prefix, tensor_name)
             if tp in hd5:
                 tensor = first_dataset_at_path(hd5, tp)

From 8753eceaec280094914ee7c06c3eed9cf65a8bb7 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Thu, 7 Dec 2023 09:09:00 -0500
Subject: [PATCH 05/20] New manifest file

---
 ml4h/tensorize/tensor_writer_ukbb.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index fd6fac806..fef207b2e 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -172,24 +172,24 @@ def write_tensors(
 def write_tensors_from_dicom_pngs(
     tensors, png_path, manifest_tsv, series, min_sample_id, max_sample_id, x=256, y=256,
-    sample_header='', dicom_header='SOP Instance UID',
-    instance_header='instance_number', png_postfix='.png.mask.png',
+    sample_header='sample_id', dicom_header='dicom_file',
+    instance_header='instance', png_postfix='.png.mask.png',
     path_prefix='ukb_pancreas_mri',
 ):
     stats = Counter()
-    reader = csv.reader(open(manifest_tsv), delimiter='\t')
+    reader = csv.reader(open(manifest_tsv), delimiter=' ')
     header = next(reader)
     logging.info(f"DICOM Manifest Header is:{header}")
-    # instance_index = header.index(instance_header)
+    instance_index = header.index(instance_header)
     sample_index = header.index(sample_header)
     dicom_index = header.index(dicom_header)
     for row in reader:
-        sample_id = row[sample_index].split('/')[8].split('_')[0]
+        sample_id = row[sample_index]
         if not min_sample_id <= int(sample_id) < max_sample_id:
             continue
         stats[sample_header + '_' + sample_id] += 1
         if 'train' in png_path:
-            dicom_file = row[dicom_index] + '.dcm'
+            dicom_file = row[dicom_index]
         elif 'valid' in png_path:
             search_file = os.path.join(png_path, f'*_{sample_id}_*')
             dicom_file = glob.glob(search_file)
             if len(dicom_file) > 0:
                 assert (len(dicom_file) == 1)
                 dicom_file = dicom_file[0].split('.')[0]
             else:
                 dicom_file = search_file
         try:
             png = imageio.imread(os.path.join(png_path, dicom_file + png_postfix))
@@ -206,7 +206,7 @@ def write_tensors_from_dicom_pngs(
         if not os.path.exists(os.path.dirname(tensor_file)):
             os.makedirs(os.path.dirname(tensor_file))
         with h5py.File(tensor_file, 'a') as hd5:
-            tensor_name = series + '_annotated_' + '2' #row[instance_index]
+            tensor_name = series + '_annotated_' + row[instance_index]
             tp = tensor_path(path_prefix, tensor_name)
             if tp in hd5:
                 tensor = first_dataset_at_path(hd5, tp)

From b9321cb1d888f74d50e5efc67b342273739681f1 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 13 Dec 2023 13:36:16 -0500
Subject: [PATCH 06/20] Fix view creation

---
 ml4h/tensorize/tensor_writer_ukbb.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index fef207b2e..a3c5b03ec 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -443,7 +443,7 @@ def _write_tensors_from_dicoms(
             if series + '_12bit' in MRI_LIVER_SERIES_12BIT and d.LargestImagePixelValue > 2048:
                 views[series + '_12bit'].append(d)
                 stats[series + '_12bit'] += 1
-            elif series in MRI_LIVER_SERIES + MRI_CARDIAC_SERIES + MRI_BRAIN_SERIES:
+            elif series in MRI_LIVER_SERIES + MRI_CARDIAC_SERIES + MRI_BRAIN_SERIES + MRI_PANCREAS_SERIES:
                 views[series].append(d)
                 stats[series] += 1
             elif series == 'dxa_images':
@@ -576,14 +576,14 @@ def _tensorize_brain_mri(slices: List[pydicom.Dataset], series: str, mri_date: d

 def _save_pixel_dimensions_if_missing(slicer, series, hd5):
-    if MRI_PIXEL_WIDTH + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if MRI_PIXEL_WIDTH + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(MRI_PIXEL_WIDTH + '_' + series, data=float(slicer.PixelSpacing[0]))
-    if MRI_PIXEL_HEIGHT + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if MRI_PIXEL_HEIGHT + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(MRI_PIXEL_HEIGHT + '_' + series, data=float(slicer.PixelSpacing[1]))


 def _save_slice_thickness_if_missing(slicer, series, hd5):
-    if MRI_SLICE_THICKNESS + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if MRI_SLICE_THICKNESS + '_' + series not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(MRI_SLICE_THICKNESS + '_' + series, data=float(slicer.SliceThickness))


@@ -593,9 +593,9 @@ def _save_series_orientation_and_position_if_missing(slicer, series, hd5, instan
     if instance:
         orientation_ds_name += HD5_GROUP_CHAR + instance
         position_ds_name += HD5_GROUP_CHAR + instance
-    if orientation_ds_name not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if orientation_ds_name not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(orientation_ds_name, data=[float(x) for x in slicer.ImageOrientationPatient])
-    if position_ds_name not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT:
+    if position_ds_name not in hd5 and series in MRI_BRAIN_SERIES + MRI_CARDIAC_SERIES + MRI_CARDIAC_SERIES_SEGMENTED + MRI_LIVER_SERIES + MRI_LIVER_SERIES_12BIT + MRI_PANCREAS_SERIES:
         hd5.create_dataset(position_ds_name, data=[float(x) for x in slicer.ImagePositionPatient])

From afaab76a00c2cb8e7dd46f8c41fe754e9bc32a45 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 13 Dec 2023 19:11:06 -0500
Subject: [PATCH 07/20] Prevent tensorize from overwriting

---
 ml4h/tensorize/tensor_writer_ukbb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index a3c5b03ec..479641012 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -137,7 +137,7 @@ def write_tensors(
         if _prune_sample(sample_id, min_sample_id, max_sample_id, mri_field_ids, xml_field_ids, zip_folder, xml_folder):
             continue
         try:
-            with h5py.File(tp, 'w') as hd5:
+            with h5py.File(tp, 'a') as hd5:
                 _write_tensors_from_zipped_dicoms(write_pngs, tensors, mri_unzip, mri_field_ids, zip_folder, hd5, sample_id, stats)
                 _write_tensors_from_zipped_niftis(zip_folder, mri_field_ids, hd5, sample_id, stats)
                 _write_tensors_from_xml(xml_field_ids, xml_folder, hd5, sample_id, write_pngs, stats, continuous_stats)
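Worth pausing on patch 07: the one-character change from 'w' to 'a' is what allows pancreas tensorization to add datasets to HD5 files written by earlier runs instead of truncating them on open. A minimal sketch of the difference, using h5py directly with an illustrative file path and dataset names (not ml4h code):

    import h5py
    import numpy as np

    # 'w' truncates on open: datasets written by an earlier run are lost.
    with h5py.File('/tmp/sample.hd5', 'w') as hd5:
        hd5.create_dataset('ukb_liver_mri/example', data=np.zeros((4, 4)))

    # 'a' opens read/write and creates the file only if it is missing,
    # so a second tensorization pass can add new groups beside old ones.
    with h5py.File('/tmp/sample.hd5', 'a') as hd5:
        if 'ukb_pancreas_mri/example' not in hd5:
            hd5.create_dataset('ukb_pancreas_mri/example', data=np.ones((4, 4)))
        print(list(hd5.keys()))  # ['ukb_liver_mri', 'ukb_pancreas_mri']

The flip side is that 'a' never removes stale datasets, so a truly fresh tensorization now requires deleting the old files first.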
From 87258a3813def950fd3292646a82118ebb429ce5 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 14:02:04 +0000
Subject: [PATCH 08/20] Add tensormaps for pancreas mris

---
 ml4h/defines.py           |  5 ++-
 ml4h/tensormap/ukb/mri.py | 82 ++++++++++++++++++++++++++++++++-------
 2 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/ml4h/defines.py b/ml4h/defines.py
index eb18dde92..6081639f1 100755
--- a/ml4h/defines.py
+++ b/ml4h/defines.py
@@ -100,7 +100,10 @@ def __str__(self):
     'aortic_root': 7, 'ascending_aorta': 8, 'pulmonary_artery': 9, 'ascending_aortic_wall': 10, 'LVOT': 11,
 }
 MRI_LIVER_SEGMENTED_CHANNEL_MAP = {'background': 0, 'liver': 1, 'inferior_vena_cava': 2, 'abdominal_aorta': 3, 'body': 4}
-
+MRI_PANCREAS_SEGMENTED_CHANNEL_MAP = {
+    'background': 0, 'body': 1, 'pancreas': 2, 'liver': 3, 'stomach': 4, 'spleen': 5,
+    'kidney': 6, 'bowel': 7, 'spine': 8, 'aorta':9, 'ivc': 10,
+}
 # TODO: These values should ultimately come from the coding table
 CODING_VALUES_LESS_THAN_ONE = [-10, -1001]

diff --git a/ml4h/tensormap/ukb/mri.py b/ml4h/tensormap/ukb/mri.py
index 788b768ba..35339266e 100755
--- a/ml4h/tensormap/ukb/mri.py
+++ b/ml4h/tensormap/ukb/mri.py
@@ -20,7 +20,8 @@
     MRI_LAX_2CH_SEGMENTED_CHANNEL_MAP, MRI_SAX_SEGMENTED_CHANNEL_MAP, LAX_4CH_HEART_LABELS, LAX_4CH_MYOCARDIUM_LABELS, StorageType, LAX_3CH_HEART_LABELS, \
     LAX_2CH_HEART_LABELS
 from ml4h.tensormap.general import get_tensor_at_first_date, normalized_first_date, pad_or_crop_array_to_shape, tensor_from_hd5
-from ml4h.defines import MRI_LAX_3CH_SEGMENTED_CHANNEL_MAP, MRI_LAX_4CH_SEGMENTED_CHANNEL_MAP, MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP, MRI_AO_SEGMENTED_CHANNEL_MAP, MRI_LIVER_SEGMENTED_CHANNEL_MAP, SAX_HEART_LABELS
+from ml4h.defines import MRI_LAX_3CH_SEGMENTED_CHANNEL_MAP, MRI_LAX_4CH_SEGMENTED_CHANNEL_MAP, MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP, \
+    MRI_AO_SEGMENTED_CHANNEL_MAP, MRI_LIVER_SEGMENTED_CHANNEL_MAP, SAX_HEART_LABELS, MRI_PANCREAS_SEGMENTED_CHANNEL_MAP


 def _slice_subset_tensor(
@@ -2669,17 +2670,12 @@ def _mdrk_projection_both_views_pretrained(tm, hd5, dependents={}):
     tensor_from_file=None,
 )

-def _pad_crop_single_channel(tm, hd5, dependents={}):
-    if f'/{tm.path_prefix}/shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2' in hd5:
-        key_prefix = f'/{tm.path_prefix}/shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
-    elif f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2' in hd5:
-        key_prefix = f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
-    else:
-        raise ValueError(f'Could not find T1 Map image for tensormap: {tm.name}')
-
+def _pad_crop_single_channel(tm, hd5, key_prefix=None, dependents={}):
+    if key_prefix is None:
+        key_prefix = tm.hd5_key_guess()
     img = np.array(
         tm.hd5_first_dataset_in_group(hd5, key_prefix),
         dtype=np.float32,
     )
     img = img[...,[1]]
     return pad_or_crop_array_to_shape(
         tm.shape,
         img,
     )

+def _pad_crop_single_channel_t1map_b2(tm, hd5, dependents={}):
+    if f'/{tm.path_prefix}/shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2' in hd5:
+        key_prefix = f'/{tm.path_prefix}/shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
+    elif f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2' in hd5:
+        key_prefix = f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
+    else:
+        raise ValueError(f'Could not find T1 Map image for tensormap: {tm.name}')
+    return _pad_crop_single_channel(tm, hd5, key_prefix, dependents)
+
 t1map_b2 = TensorMap(
     'shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map',
     shape=(384, 384, 1),
     path_prefix='ukb_cardiac_mri',
     normalization=Standardize(mean=455.81, std=609.50),
-    tensor_from_file=_pad_crop_single_channel,
+    tensor_from_file=_pad_crop_single_channel_t1map_b2,
+)
+
+t1map_pancreas = TensorMap(
+    'shmolli_192i_pancreas_t1map',
+    shape=(288, 384, 1),
+    path_prefix='ukb_pancreas_mri',
+    normalization=Standardize(mean=389.49, std=658.36),
+    tensor_from_file=_pad_crop_single_channel,
 )

-def _segmented_t1map(tm, hd5, dependents={}):
+def _segmented_t1map_b2(tm, hd5, dependents={}):
     if f'{tm.path_prefix}/{tm.name}_1' in hd5:
         categorical_index_slice = get_tensor_at_first_date(hd5, tm.path_prefix, f'{tm.name}_1')
     elif f'{tm.path_prefix}/{tm.name}_2' in hd5:
@@ -2715,13 +2728,56 @@ def _segmented_t1map(tm, hd5, dependents={}):
     tensor[..., :] = pad_or_crop_array_to_shape(tensor[..., :].shape, categorical_one_hot)
     return tensor

+def _segmented_t1map_pancreas(tm, hd5, dependents={}):
+    if f'{tm.path_prefix}/{tm.name}' in hd5:
+        categorical_index_slice = get_tensor_at_first_date(hd5, tm.path_prefix, f'{tm.name}')
+    else:
+        raise ValueError(f'Could not find T1 Map segmentation for tensormap: {tm.name}')
+
+    # remove kidney label and merge body/background labels
+    orig_num_channels = len(tm.channel_map) + 3
+    categorical_one_hot = to_categorical(categorical_index_slice, orig_num_channels)
+    categorical_one_hot[..., 6] += (
+        categorical_one_hot[..., 11] +
+        categorical_one_hot[..., 12] +
+        categorical_one_hot[..., 13]
+    )
+    categorical_one_hot = np.delete(categorical_one_hot, [11, 12, 13], axis=-1)
+
+    # padding/cropping
+    tensor = np.zeros(tm.shape, dtype=np.float32)
+    tensor[..., :] = pad_or_crop_array_to_shape(tensor[..., :].shape, categorical_one_hot)
+    return tensor
+
 t1map_b2_segmentation = TensorMap(
     'b2s_t1map_kassir_annotated',
     interpretation=Interpretation.CATEGORICAL,
     shape=(384, 384, len(MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP)),
     channel_map=MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP,
     path_prefix='ukb_cardiac_mri',
-    tensor_from_file=_segmented_t1map,
+    tensor_from_file=_segmented_t1map_b2,
     loss=dice,
     metrics=['categorical_accuracy'] + per_class_dice(MRI_SAX_PAP_SEGMENTED_CHANNEL_MAP),
 )
+
+t1map_pancreas_segmentation_cce = TensorMap(
+    'shmolli_192i_pancreas_t1map_annotated_2',
+    interpretation=Interpretation.CATEGORICAL,
+    shape=(288, 384, len(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP)),
+    channel_map=MRI_PANCREAS_SEGMENTED_CHANNEL_MAP,
+    path_prefix='ukb_pancreas_mri',
+    tensor_from_file=_segmented_t1map_pancreas,
+    loss='categorical_crossentropy',
+    metrics=['categorical_accuracy'] + per_class_dice(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP),
+)
+
+t1map_pancreas_segmentation_dice = TensorMap(
+    'shmolli_192i_pancreas_t1map_annotated_2',
+    interpretation=Interpretation.CATEGORICAL,
+    shape=(288, 384, len(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP)),
+    channel_map=MRI_PANCREAS_SEGMENTED_CHANNEL_MAP,
+    path_prefix='ukb_pancreas_mri',
+    tensor_from_file=_segmented_t1map_pancreas,
+    loss=dice,
+    metrics=['categorical_accuracy'] + per_class_dice(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP),
+)
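The new _segmented_t1map_pancreas reader above folds three extra annotation labels into an existing channel before training. A toy illustration of the same to_categorical/np.delete mechanics, with a made-up 2x3 label mask (only the array bookkeeping is shown; the label semantics belong to the repo):

    import numpy as np
    from tensorflow.keras.utils import to_categorical

    # 14 raw classes, i.e. len(MRI_PANCREAS_SEGMENTED_CHANNEL_MAP) + 3 extras.
    mask = np.array([[0, 6, 11], [12, 13, 2]])
    one_hot = to_categorical(mask, num_classes=14)

    # Fold labels 11-13 into channel 6, then drop the now-empty channels.
    one_hot[..., 6] += one_hot[..., 11] + one_hot[..., 12] + one_hot[..., 13]
    merged = np.delete(one_hot, [11, 12, 13], axis=-1)
    assert merged.shape == (2, 3, 11)          # one channel per map entry
    assert merged.sum(axis=-1).max() == 1.0    # still a valid one-hot mask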
From e7cd9f06a0c9009b4911e0593581f5d01a18c3bb Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 15:53:47 +0000
Subject: [PATCH 09/20] Add elements of Marcus's setup - L2 weight_decay and
 cosine decay learning rate schedule

---
 ml4h/arguments.py             | 2 +-
 ml4h/models/layer_wrappers.py | 7 ++++++-
 ml4h/models/legacy_models.py  | 7 ++++++-
 ml4h/optimizers.py            | 3 +++
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/ml4h/arguments.py b/ml4h/arguments.py
index 39cdf50b3..4f5dffb3f 100755
--- a/ml4h/arguments.py
+++ b/ml4h/arguments.py
@@ -263,7 +263,7 @@ def parse_args():
     )
     parser.add_argument('--balance_csvs', default=[], nargs='*', help='Balances batches with representation from sample IDs in this list of CSVs')
     parser.add_argument('--optimizer', default='radam', type=str, help='Optimizer for model training')
-    parser.add_argument('--learning_rate_schedule', default=None, type=str, choices=['triangular', 'triangular2'], help='Adjusts learning rate during training.')
+    parser.add_argument('--learning_rate_schedule', default=None, type=str, choices=['triangular', 'triangular2', 'cosine_decay'], help='Adjusts learning rate during training.')
     parser.add_argument('--anneal_rate', default=0., type=float, help='Annealing rate in epochs of loss terms during training')
     parser.add_argument('--anneal_shift', default=0., type=float, help='Annealing offset in epochs of loss terms during training')
     parser.add_argument('--anneal_max', default=2.0, type=float, help='Annealing maximum value')
diff --git a/ml4h/models/layer_wrappers.py b/ml4h/models/layer_wrappers.py
index 7ba1187c0..083765a03 100755
--- a/ml4h/models/layer_wrappers.py
+++ b/ml4h/models/layer_wrappers.py
@@ -25,6 +25,7 @@
 from tensorflow.keras.layers import MaxPooling2D, MaxPooling3D, Average, AveragePooling1D, AveragePooling2D, AveragePooling3D, Layer
 from tensorflow.keras.layers import SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Concatenate, Add
 from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalAveragePooling2D, GlobalAveragePooling3D
+from tensorflow.keras.regularizers import L1, L2

 Tensor = tf.Tensor

@@ -52,9 +53,13 @@
     # class name -> (dimension -> class)
     'spatial_dropout': {2: SpatialDropout1D, 3: SpatialDropout2D, 4: SpatialDropout3D},
     'dropout': defaultdict(lambda _: Dropout),
+    'l1': L1,
+    'l2': L2,
 }
 DENSE_REGULARIZATION_CLASSES = {
-    'dropout': Dropout,  # TODO: add l1, l2
+    'dropout': Dropout,
+    'l1': L1,
+    'l2': L2,
 }

diff --git a/ml4h/models/legacy_models.py b/ml4h/models/legacy_models.py
index 3a9aab20c..2b975e1e3 100755
--- a/ml4h/models/legacy_models.py
+++ b/ml4h/models/legacy_models.py
@@ -30,6 +30,7 @@
 from tensorflow.keras.layers import SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Concatenate, Add
 from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalAveragePooling2D, GlobalAveragePooling3D
 from tensorflow.keras.layers.experimental.preprocessing import RandomRotation, RandomZoom, RandomContrast
+from tensorflow.keras.regularizers import L1, L2
 import tensorflow_probability as tfp

 from ml4h.metrics import get_metric_dict
@@ -79,9 +80,13 @@ class BottleneckType(Enum):
     # class name -> (dimension -> class)
     'spatial_dropout': {2: SpatialDropout1D, 3: SpatialDropout2D, 4: SpatialDropout3D},
     'dropout': defaultdict(lambda _: Dropout),
+    'l1': L1,
+    'l2': L2,
 }
 DENSE_REGULARIZATION_CLASSES = {
-    'dropout': Dropout,  # TODO: add l1, l2
+    'dropout': Dropout,
+    'l1': L1,
+    'l2': L2,
 }

diff --git a/ml4h/optimizers.py b/ml4h/optimizers.py
index c9e79d5f0..5dc130c2b 100755
--- a/ml4h/optimizers.py
+++ b/ml4h/optimizers.py
@@ -6,6 +6,7 @@
 from tensorflow.keras import backend as K
 from tensorflow.keras.models import Model
 from tensorflow_addons.optimizers import RectifiedAdam, TriangularCyclicalLearningRate, Triangular2CyclicalLearningRate
+from tensorflow.keras.optimizers.schedules import CosineDecay

 from ml4h.plots import plot_find_learning_rate
 from ml4h.tensor_generators import TensorGenerator
@@ -40,6 +41,8 @@ def _get_learning_rate_schedule(learning_rate: float, learning_rate_schedule: st
             initial_learning_rate=learning_rate / 5, maximal_learning_rate=learning_rate,
             step_size=steps_per_epoch * 5,
         )
+    if learning_rate_schedule == 'cosine_decay':
+        return CosineDecay(initial_learning_rate=learning_rate, decay_steps=steps_per_epoch)
     else:
         raise ValueError(f'Learning rate schedule "{learning_rate_schedule}" unknown.')
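For the new 'cosine_decay' choice, Keras' CosineDecay anneals the learning rate from its initial value toward zero over decay_steps optimizer steps and then holds it (alpha defaults to 0.0); wired with decay_steps=steps_per_epoch as above, the rate bottoms out after the first epoch. A small sketch with an illustrative steps_per_epoch:

    from tensorflow.keras.optimizers.schedules import CosineDecay

    steps_per_epoch = 100  # illustrative value
    schedule = CosineDecay(initial_learning_rate=1e-3, decay_steps=steps_per_epoch)

    for step in (0, 25, 50, 75, 100, 200):
        # 1e-3 * 0.5 * (1 + cos(pi * step / 100)) until step 100, then 0
        print(step, float(schedule(step)))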
From 30507295d4f8319bfe1af8c927aa5a6b7249a6cd Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 18:38:29 +0000
Subject: [PATCH 10/20] FIX: Fix bug when a generator has 0 ids

---
 ml4h/tensor_generators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml4h/tensor_generators.py b/ml4h/tensor_generators.py
index 7e698ddf8..f88617724 100755
--- a/ml4h/tensor_generators.py
+++ b/ml4h/tensor_generators.py
@@ -95,7 +95,7 @@ def __init__(
         :param paths: If weights is provided, paths should be a list of path lists the same length as weights
         """
         self.augment = augment
-        self.paths = sum(paths) if isinstance(paths[0], list) else paths
+        self.paths = sum(paths) if (len(paths) > 0 and isinstance(paths[0], list)) else paths
         self.run_on_main_thread = num_workers == 0
         self.q = None
         self.stats_q = None

From 4a951e92bdfb3a7528a3832b2c770735c513c012 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 18:38:56 +0000
Subject: [PATCH 11/20] FIX: Fix bug when key_prefix is not given

---
 ml4h/tensormap/ukb/mri.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ml4h/tensormap/ukb/mri.py b/ml4h/tensormap/ukb/mri.py
index 35339266e..861f868ae 100755
--- a/ml4h/tensormap/ukb/mri.py
+++ b/ml4h/tensormap/ukb/mri.py
@@ -2670,7 +2670,7 @@ def _mdrk_projection_both_views_pretrained(tm, hd5, dependents={}):
     tensor_from_file=None,
 )

-def _pad_crop_single_channel(tm, hd5, key_prefix=None, dependents={}):
+def _pad_crop_single_channel(tm, hd5, dependents={}, key_prefix=None):
     if key_prefix is None:
         key_prefix = tm.hd5_key_guess()
     img = np.array(
@@ -2690,7 +2690,7 @@ def _pad_crop_single_channel_t1map_b2(tm, hd5, dependents={}):
         key_prefix = f'/{tm.path_prefix}/shmolli_192i_b2_sax_b2s_sax_b2s_sax_b2s_t1map/instance_2'
     else:
         raise ValueError(f'Could not find T1 Map image for tensormap: {tm.name}')
-    return _pad_crop_single_channel(tm, hd5, key_prefix, dependents)
+    return _pad_crop_single_channel(tm, hd5, dependents, key_prefix)

 t1map_b2 = TensorMap(
     'shmolli_192i_sax_b2s_sax_b2s_sax_b2s_t1map',
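Patch 11's reordering matters because the tensor_from_file hooks are called positionally, with the dependents dict in the third slot. A schematic reproduction of the bug and the fix (the caller below is illustrative, not ml4h's actual call site):

    def framework_call(f, tm, hd5):
        return f(tm, hd5, {})  # third positional argument is always dependents

    def before(tm, hd5, key_prefix=None, dependents={}):
        return key_prefix      # receives {}: the dependents dict, not a key

    def after(tm, hd5, dependents={}, key_prefix=None):
        return key_prefix      # stays None, so hd5_key_guess() takes over

    assert framework_call(before, 'tm', 'hd5') == {}
    assert framework_call(after, 'tm', 'hd5') is None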
From e6684e46ea48833b22323e67e89f98b59079deb0 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 15 Dec 2023 18:49:43 +0000
Subject: [PATCH 12/20] ENH: Allow for no testing during model training

---
 ml4h/recipes.py | 62 +++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/ml4h/recipes.py b/ml4h/recipes.py
index 33ca54b7f..1058357ae 100755
--- a/ml4h/recipes.py
+++ b/ml4h/recipes.py
@@ -205,37 +205,39 @@ def train_multimodal_multitask(args):
     if merger:
         merger.save(f'{args.output_folder}{args.id}/merger.h5')

-    test_data, test_labels, test_paths = big_batch_from_minibatch_generator(generate_test, args.test_steps)
-    performance_metrics = _predict_and_evaluate(
-        model, test_data, test_labels, args.tensor_maps_in, args.tensor_maps_out, args.tensor_maps_protected,
-        args.batch_size, args.hidden_layer, os.path.join(args.output_folder, args.id + '/'), test_paths,
-        args.embed_visualization, args.alpha, args.dpi, args.plot_width, args.plot_height,
-    )
+    performance_metrics = {}
+    if args.test_steps > 0:
+        test_data, test_labels, test_paths = big_batch_from_minibatch_generator(generate_test, args.test_steps)
+        performance_metrics = _predict_and_evaluate(
+            model, test_data, test_labels, args.tensor_maps_in, args.tensor_maps_out, args.tensor_maps_protected,
+            args.batch_size, args.hidden_layer, os.path.join(args.output_folder, args.id + '/'), test_paths,
+            args.embed_visualization, args.alpha, args.dpi, args.plot_width, args.plot_height,
+        )

-    predictions_list = model.predict(test_data)
-    samples = min(args.test_steps * args.batch_size, 12)
-    out_path = os.path.join(args.output_folder, args.id, 'reconstructions/')
-    if len(args.tensor_maps_out) == 1:
-        predictions_list = [predictions_list]
-    predictions_dict = {name: pred for name, pred in zip(model.output_names, predictions_list)}
-    logging.info(f'Predictions and shapes are: {[(p, predictions_dict[p].shape) for p in predictions_dict]}')
-
-    for i, etm in enumerate(encoders):
-        embed = encoders[etm].predict(test_data[etm.input_name()])
-        if etm.output_name() in predictions_dict:
-            plot_reconstruction(etm, test_data[etm.input_name()], predictions_dict[etm.output_name()], out_path, test_paths, samples)
-        for dtm in decoders:
-            reconstruction = decoders[dtm].predict(embed)
-            logging.info(f'{dtm.name} has prediction shape: {reconstruction.shape} from embed shape: {embed.shape}')
-            my_out_path = os.path.join(out_path, f'decoding_{dtm.name}_from_{etm.name}/')
-            os.makedirs(os.path.dirname(my_out_path), exist_ok=True)
-            if dtm.axes() > 1:
-                plot_reconstruction(dtm, test_labels[dtm.output_name()], reconstruction, my_out_path, test_paths, samples)
-            else:
-                evaluate_predictions(
-                    dtm, reconstruction, test_labels[dtm.output_name()], {}, dtm.name, my_out_path,
-                    test_paths, dpi=args.dpi, width=args.plot_width, height=args.plot_height,
-                )
+        predictions_list = model.predict(test_data)
+        samples = min(args.test_steps * args.batch_size, 12)
+        out_path = os.path.join(args.output_folder, args.id, 'reconstructions/')
+        if len(args.tensor_maps_out) == 1:
+            predictions_list = [predictions_list]
+        predictions_dict = {name: pred for name, pred in zip(model.output_names, predictions_list)}
+        logging.info(f'Predictions and shapes are: {[(p, predictions_dict[p].shape) for p in predictions_dict]}')
+
+        for i, etm in enumerate(encoders):
+            embed = encoders[etm].predict(test_data[etm.input_name()])
+            if etm.output_name() in predictions_dict:
+                plot_reconstruction(etm, test_data[etm.input_name()], predictions_dict[etm.output_name()], out_path, test_paths, samples)
+            for dtm in decoders:
+                reconstruction = decoders[dtm].predict(embed)
+                logging.info(f'{dtm.name} has prediction shape: {reconstruction.shape} from embed shape: {embed.shape}')
+                my_out_path = os.path.join(out_path, f'decoding_{dtm.name}_from_{etm.name}/')
+                os.makedirs(os.path.dirname(my_out_path), exist_ok=True)
+                if dtm.axes() > 1:
+                    plot_reconstruction(dtm, test_labels[dtm.output_name()], reconstruction, my_out_path, test_paths, samples)
+                else:
+                    evaluate_predictions(
+                        dtm, reconstruction, test_labels[dtm.output_name()], {}, dtm.name, my_out_path,
+                        test_paths, dpi=args.dpi, width=args.plot_width, height=args.plot_height,
+                    )

     return performance_metrics

From 78839dda42a9deeca72f603891f968950e1060a0 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 8 Dec 2023 17:10:12 +0000
Subject: [PATCH 13/20] FIX: Fix typo

---
 ml4h/explorations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml4h/explorations.py b/ml4h/explorations.py
index 5da14d5d9..612aaa0c0 100755
--- a/ml4h/explorations.py
+++ b/ml4h/explorations.py
@@ -896,7 +896,7 @@ def infer_stats_from_segmented_regions(args):
     if args.analyze_ground_truth:
         _scatter_plots_from_segmented_region_stats(
             inference_tsv_true, inference_tsv_pred, args.structures_to_analyze,
-            args.output_folder, args.id, tm_in.input_name(), args.output_name,
+            args.output_folder, args.id, tm_in.input_name(), tm_out.output_name(),
         )

 def _softmax(x):

From 6e70e83fe0cdaa4e7401f3bf7f0f199b45197def Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 8 Dec 2023 17:29:55 +0000
Subject: [PATCH 14/20] FIX: Fix parser for boolean arguments

---
 ml4h/arguments.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ml4h/arguments.py b/ml4h/arguments.py
index 4f5dffb3f..ef40373e5 100755
--- a/ml4h/arguments.py
+++ b/ml4h/arguments.py
@@ -385,7 +385,9 @@ def parse_args():
     )

     # Arguments for explorations/infer_stats_from_segmented_regions
-    parser.add_argument('--analyze_ground_truth', default=True, help='Whether or not to filter by images with ground truth segmentations, for comparison')
+    parser.add_argument('--analyze_ground_truth', action='store_true', help='Filter by images with ground truth segmentations, for comparison')
+    parser.add_argument('--no_analyze_ground_truth', dest='analyze_ground_truth', action='store_false', help='Do not filter by images with ground truth segmentations, for comparison')
+    parser.set_defaults(analyze_ground_truth=True)
     parser.add_argument('--structures_to_analyze', nargs='*', default=[], help='Structure names to include in the .tsv files and scatter plots')
     parser.add_argument('--erosion_radius', default=1, type=int, help='Radius of the unit disk structuring element for erosion preprocessing')
     parser.add_argument('--intensity_thresh', type=float, help='Threshold value for preprocessing')
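The parser fix in patch 14 is needed because argparse treats default=True with no action or type as a plain string option: any value supplied on the command line arrives as a non-empty, hence truthy, string, so the flag could never actually be turned off. The paired-flag pattern behaves like this sketch:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--analyze_ground_truth', action='store_true')
    parser.add_argument('--no_analyze_ground_truth', dest='analyze_ground_truth', action='store_false')
    parser.set_defaults(analyze_ground_truth=True)

    assert parser.parse_args([]).analyze_ground_truth is True
    assert parser.parse_args(['--no_analyze_ground_truth']).analyze_ground_truth is False

On Python 3.9+, action=argparse.BooleanOptionalAction gives the same pair of flags in a single declaration.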
From 4b8654377e4dde3eed63dfaef69d22559fa2c7b2 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Tue, 19 Dec 2023 20:13:40 -0500
Subject: [PATCH 15/20] STYLE: Remove unneeded comment

---
 ml4h/explorations.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ml4h/explorations.py b/ml4h/explorations.py
index 612aaa0c0..1787014da 100755
--- a/ml4h/explorations.py
+++ b/ml4h/explorations.py
@@ -798,7 +798,6 @@ def infer_stats_from_segmented_regions(args):
     assert(tm_in.shape[-1] == 1, 'no support here for stats on multiple input channels')

     # don't filter datasets for ground truth segmentations if we want to run inference on everything
-    # TODO HELP - this isn't giving me all 56K anymore
     if not args.analyze_ground_truth:
         args.output_tensors = []
         args.tensor_maps_out = []

From 05c4f77b410f2efe7f9d3dcda5939de36dc7b857 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 20 Dec 2023 02:19:59 +0000
Subject: [PATCH 16/20] WIP

---
 ml4h/arguments.py    |  2 ++
 ml4h/explorations.py | 38 +++++++++++++++++++++++++++-----------
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/ml4h/arguments.py b/ml4h/arguments.py
index ef40373e5..2eae88e5c 100755
--- a/ml4h/arguments.py
+++ b/ml4h/arguments.py
@@ -391,6 +391,8 @@ def parse_args():
     parser.add_argument('--structures_to_analyze', nargs='*', default=[], help='Structure names to include in the .tsv files and scatter plots')
     parser.add_argument('--erosion_radius', default=1, type=int, help='Radius of the unit disk structuring element for erosion preprocessing')
     parser.add_argument('--intensity_thresh', type=float, help='Threshold value for preprocessing')
+    parser.add_argument('--intensity_thresh_perc', type=float, help='Threshold percentile for preprocessing, between 0 and 100 inclusive')
+    parser.add_argument('--intensity_thresh_k_means', nargs='*', default=[], type=int, help='Preprocessing using k-means specified as two numbers, the first is the number of clusters and the second is the cluster index to keep')
     parser.add_argument('--intensity_thresh_in_structures', nargs='*', default=[], help='Structure names whose pixels should be replaced if the images has intensity above the threshold')
     parser.add_argument('--intensity_thresh_out_structure', help='Replacement structure name')

diff --git a/ml4h/explorations.py b/ml4h/explorations.py
index 1787014da..662679990 100755
--- a/ml4h/explorations.py
+++ b/ml4h/explorations.py
@@ -20,6 +20,7 @@
 import pandas as pd
 import multiprocessing as mp
 from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans

 from tensorflow.keras.models import Model

@@ -719,13 +720,26 @@ def _get_csv_row(sample_id, means, medians, stds, date):
     csv_row = [sample_id] + res[0].astype('str').tolist() + [date]
     return csv_row

-def _thresh_labels_above(y, img, intensity_thresh, in_labels, out_label, nb_orig_channels):
+def _thresh_labels_above(y, img, intensity_thresh, intensity_thresh_perc, in_labels, out_label, nb_orig_channels):
     y = np.argmax(y, axis=-1)[..., np.newaxis]
-    y[np.logical_and(img >= intensity_thresh, np.isin(y, in_labels))] = out_label
+    if intensity_thresh:
+        img_intensity_thresh = intensity_thresh
+    elif intensity_thresh_perc:
+        img_intensity_thresh = np.percentile(img, intensity_thresh_perc)
+    y[np.logical_and(img >= img_intensity_thresh, np.isin(y, in_labels))] = out_label
     y = y[..., 0]
     y = _to_categorical(y, nb_orig_channels)
     return y

+def _intensity_thresh_k_means(y, img, intensity_thresh_k_means):
+    X = img[y==1][...,np.newaxis]
+    if X.size > 1:
+        kmeans = KMeans(n_clusters=intensity_thresh_k_means[0], random_state=0, n_init="auto").fit(X)
+        labels = kmeans.predict(img.flatten()[...,np.newaxis])
+        labels = np.reshape(labels, img.shape)
+        y[np.logical_and(labels==intensity_thresh_k_means[1], y==1)] = 0
+    return y
+
 def _scatter_plots_from_segmented_region_stats(
     inference_tsv_true, inference_tsv_pred, structures_to_analyze,
     output_folder, id, input_name, output_name,
@@ -759,13 +773,9 @@ def _scatter_plots_from_segmented_region_stats(
             title = col.replace('_', ' ')
             ax.set_xlabel(f'{title} T1 Time (ms) - Manual Segmentation')
             ax.set_ylabel(f'{title} T1 Time (ms) - Model Segmentation')
-            if i == 'all':
-                min_value = -50
-                max_value = 1300
-            elif i == 'filter_outliers':
-                min_value, max_value = plot_data.min(), plot_data.max()
-                min_value = min([min_value['true'], min_value['pred']]) - 100
-                max_value = min([max_value['true'], max_value['pred']]) + 100
+            min_value, max_value = plot_data.min(), plot_data.max()
+            min_value = min([min_value['true'], min_value['pred']]) - 100
+            max_value = min([max_value['true'], max_value['pred']]) + 100
             ax.set_xlim([min_value, max_value])
             ax.set_ylim([min_value, max_value])
             res = stats.pearsonr(plot_data['true'], plot_data['pred'])
@@ -819,6 +829,8 @@ def infer_stats_from_segmented_regions(args):
     # Setup for intensity thresholding
     do_intensity_thresh = args.intensity_thresh_in_structures and args.intensity_thresh_out_structure
     if do_intensity_thresh:
+        assert (not (args.intensity_thresh and args.intensity_thresh_perc))
+        assert (not (args.intensity_thresh_k_means and len(args.intensity_thresh_in_structures) > 1))
         intensity_thresh_in_channels = [tm_out.channel_map[k] for k in args.intensity_thresh_in_structures]
         intensity_thresh_out_channel = tm_out.channel_map[args.intensity_thresh_out_structure]
@@ -869,19 +881,23 @@ def infer_stats_from_segmented_regions(args):

         if args.analyze_ground_truth:
             if do_intensity_thresh:
-                y_true = _thresh_labels_above(y_true, img, args.intensity_thresh, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels)
+                y_true = _thresh_labels_above(y_true, img, args.intensity_thresh, args.intensity_thresh_perc, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels)
                 y_true = np.delete(y_true, bad_channels, axis=-1)
             if args.erosion_radius > 0:
                 y_true = binary_erosion(y_true, structure).astype(y_true.dtype)
+            if args.intensity_thresh_k_means:
+                y_true = _intensity_thresh_k_means(y_true, img, args.intensity_thresh_k_means)
             means_true, medians_true, stds_true = _compute_masked_stats(rescaled_img, y_true, nb_good_channels)
             csv_row_true = _get_csv_row(sample_id, means_true, medians_true, stds_true, date)
             inference_writer_true.writerow(csv_row_true)

         if do_intensity_thresh:
-            y_pred = _thresh_labels_above(y_pred, img, args.intensity_thresh, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels)
+            y_pred = _thresh_labels_above(y_pred, img, args.intensity_thresh, args.intensity_thresh_perc, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels)
             y_pred = np.delete(y_pred, bad_channels, axis=-1)
         if args.erosion_radius > 0:
             y_pred = binary_erosion(y_pred, structure).astype(y_pred.dtype)
+        if args.intensity_thresh_k_means:
+            y_pred = _intensity_thresh_k_means(y_pred, img, args.intensity_thresh_k_means)
         means_pred, medians_pred, stds_pred = _compute_masked_stats(rescaled_img, y_pred, nb_good_channels)
         csv_row_pred = _get_csv_row(sample_id, means_pred, medians_pred, stds_pred, date)
         inference_writer_pred.writerow(csv_row_pred)
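The k-means option added in patch 16 clusters the intensities that fall inside a predicted mask, then zeroes out mask pixels assigned to the chosen cluster index, so --intensity_thresh_k_means 2 1 fits two clusters and drops pixels landing in cluster 1. A standalone sketch on synthetic data (scikit-learn's cluster indices are arbitrary, so the right index has to be picked per use case):

    import numpy as np
    from sklearn.cluster import KMeans

    rng = np.random.default_rng(0)
    img = np.concatenate([rng.normal(200, 10, 50), rng.normal(800, 10, 50)]).reshape(10, 10)
    y = np.ones((10, 10), dtype=np.uint8)  # toy mask covering the whole image

    X = img[y == 1][..., np.newaxis]
    kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(X)
    labels = np.reshape(kmeans.predict(img.flatten()[..., np.newaxis]), img.shape)
    y[np.logical_and(labels == 1, y == 1)] = 0  # remove one intensity cluster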
From ca2e89b488a9c981042b34e3286c79847ad2229a Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 10 Jan 2024 21:00:25 +0000
Subject: [PATCH 17/20] FIX error (which kills a thread and prevents subsequent
 pngs from being written) if the image size is wrong

---
 ml4h/tensorize/tensor_writer_ukbb.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index 479641012..acf97b7bc 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -219,6 +219,9 @@ def write_tensors_from_dicom_pngs(
         except FileNotFoundError:
             logging.warning(f'Could not find file: {os.path.join(png_path, dicom_file + png_postfix)}')
             stats['File not found error'] += 1
+        except ValueError:
+            logging.warning(f'Could not convert file: {os.path.join(png_path, dicom_file + png_postfix)}')
+            stats['Value error'] += 1
     for k in stats:
         if sample_header in k and stats[k] == 50:
             continue

From 05c5975875d18783a8e8347e2a71d5bfbbb5ad49 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Wed, 10 Jan 2024 21:43:23 +0000
Subject: [PATCH 18/20] Tensorize can create empty tensors if there are no good
 series, making you think it's working when it isn't. At least give a warning

---
 ml4h/tensorize/tensor_writer_ukbb.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index acf97b7bc..c4bb29418 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -454,6 +454,8 @@ def _write_tensors_from_dicoms(
                 dxa_number = dicom.split('.')[-4]
                 name = f'dxa_{series_num}_{dxa_number}'
                 create_tensor_in_hd5(hd5, f'ukb_dxa/', name, d.pixel_array, stats)
+            else:
+                stats[f'Could not process series {series}'] += 1

         if series in MRI_LIVER_IDEAL_PROTOCOL:
             min_ideal_series = min(min_ideal_series, int(d.SeriesNumber))

From 8eaa344bafa36d9ce70710d13fbc9771e87ce5be Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Thu, 11 Jan 2024 10:15:33 -0500
Subject: [PATCH 19/20] Don't commit code to interpret this specific
 manifest_tsv file

---
 ml4h/tensorize/tensor_writer_ukbb.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/ml4h/tensorize/tensor_writer_ukbb.py b/ml4h/tensorize/tensor_writer_ukbb.py
index c4bb29418..fc56b3f71 100755
--- a/ml4h/tensorize/tensor_writer_ukbb.py
+++ b/ml4h/tensorize/tensor_writer_ukbb.py
@@ -173,11 +173,11 @@ def write_tensors(
 def write_tensors_from_dicom_pngs(
     tensors, png_path, manifest_tsv, series, min_sample_id, max_sample_id, x=256, y=256,
     sample_header='sample_id', dicom_header='dicom_file',
-    instance_header='instance', png_postfix='.png.mask.png',
-    path_prefix='ukb_pancreas_mri',
+    instance_header='instance_number', png_postfix='.png.mask.png',
+    path_prefix='ukb_cardiac_mri',
 ):
     stats = Counter()
-    reader = csv.reader(open(manifest_tsv), delimiter=' ')
+    reader = csv.reader(open(manifest_tsv), delimiter='\t')
     header = next(reader)
     logging.info(f"DICOM Manifest Header is:{header}")
     instance_index = header.index(instance_header)
     sample_index = header.index(sample_header)
     dicom_index = header.index(dicom_header)
     for row in reader:
         sample_id = row[sample_index]
         if not min_sample_id <= int(sample_id) < max_sample_id:
             continue
         stats[sample_header + '_' + sample_id] += 1
-        if 'train' in png_path:
-            dicom_file = row[dicom_index]
-        elif 'valid' in png_path:
-            search_file = os.path.join(png_path, f'*_{sample_id}_*')
-            dicom_file = glob.glob(search_file)
-            if len(dicom_file) > 0:
-                assert (len(dicom_file) == 1)
-                dicom_file = dicom_file[0].split('.')[0]
-            else:
-                dicom_file = search_file
+        dicom_file = row[dicom_index]
         try:
             png = imageio.imread(os.path.join(png_path, dicom_file + png_postfix))
             full_tensor = np.zeros((x, y), dtype=np.float32)

From fa9303b0bd28ef36d229fc58448fa24efcdbd4f6 Mon Sep 17 00:00:00 2001
From: Danielle Pace
Date: Fri, 12 Jan 2024 11:11:36 -0500
Subject: [PATCH 20/20] STYLE: Rename intensity_thresh_perc ->
 intensity_thresh_percentile

---
 ml4h/arguments.py    |  2 +-
 ml4h/explorations.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/ml4h/arguments.py b/ml4h/arguments.py
index 2eae88e5c..88cca51e7 100755
--- a/ml4h/arguments.py
+++ b/ml4h/arguments.py
@@ -391,7 +391,7 @@ def parse_args():
     parser.add_argument('--structures_to_analyze', nargs='*', default=[], help='Structure names to include in the .tsv files and scatter plots')
     parser.add_argument('--erosion_radius', default=1, type=int, help='Radius of the unit disk structuring element for erosion preprocessing')
     parser.add_argument('--intensity_thresh', type=float, help='Threshold value for preprocessing')
-    parser.add_argument('--intensity_thresh_perc', type=float, help='Threshold percentile for preprocessing, between 0 and 100 inclusive')
+    parser.add_argument('--intensity_thresh_percentile', type=float, help='Threshold percentile for preprocessing, between 0 and 100 inclusive')
     parser.add_argument('--intensity_thresh_k_means', nargs='*', default=[], type=int, help='Preprocessing using k-means specified as two numbers, the first is the number of clusters and the second is the cluster index to keep')
     parser.add_argument('--intensity_thresh_in_structures', nargs='*', default=[], help='Structure names whose pixels should be replaced if the images has intensity above the threshold')
     parser.add_argument('--intensity_thresh_out_structure', help='Replacement structure name')
diff --git a/ml4h/explorations.py b/ml4h/explorations.py
index 662679990..924a1e5b5 100755
--- a/ml4h/explorations.py
+++ b/ml4h/explorations.py
@@ -720,12 +720,12 @@ def _get_csv_row(sample_id, means, medians, stds, date):
     csv_row = [sample_id] + res[0].astype('str').tolist() + [date]
     return csv_row

-def _thresh_labels_above(y, img, intensity_thresh, intensity_thresh_perc, in_labels, out_label, nb_orig_channels):
+def _thresh_labels_above(y, img, intensity_thresh, intensity_thresh_percentile, in_labels, out_label, nb_orig_channels):
     y = np.argmax(y, axis=-1)[..., np.newaxis]
     if intensity_thresh:
         img_intensity_thresh = intensity_thresh
-    elif intensity_thresh_perc:
-        img_intensity_thresh = np.percentile(img, intensity_thresh_perc)
+    elif intensity_thresh_percentile:
+        img_intensity_thresh = np.percentile(img, intensity_thresh_percentile)
     y[np.logical_and(img >= img_intensity_thresh, np.isin(y, in_labels))] = out_label
     y = y[..., 0]
     y = _to_categorical(y, nb_orig_channels)
@@ -829,7 +829,7 @@ def infer_stats_from_segmented_regions(args):
     # Setup for intensity thresholding
     do_intensity_thresh = args.intensity_thresh_in_structures and args.intensity_thresh_out_structure
     if do_intensity_thresh:
-        assert (not (args.intensity_thresh and args.intensity_thresh_perc))
+        assert (not (args.intensity_thresh and args.intensity_thresh_percentile))
         assert (not (args.intensity_thresh_k_means and len(args.intensity_thresh_in_structures) > 1))
         intensity_thresh_in_channels = [tm_out.channel_map[k] for k in args.intensity_thresh_in_structures]
         intensity_thresh_out_channel = tm_out.channel_map[args.intensity_thresh_out_structure]
@@ -881,7 +881,7 @@ def infer_stats_from_segmented_regions(args):

         if args.analyze_ground_truth:
             if do_intensity_thresh:
-                y_true = _thresh_labels_above(y_true, img, args.intensity_thresh,
args.intensity_thresh_perc, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels) + y_true = _thresh_labels_above(y_true, img, args.intensity_thresh, args.intensity_thresh_percentile, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels) y_true = np.delete(y_true, bad_channels, axis=-1) if args.erosion_radius > 0: y_true = binary_erosion(y_true, structure).astype(y_true.dtype) @@ -892,7 +892,7 @@ def infer_stats_from_segmented_regions(args): inference_writer_true.writerow(csv_row_true) if do_intensity_thresh: - y_pred = _thresh_labels_above(y_pred, img, args.intensity_thresh, args.intensity_thresh_perc, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels) + y_pred = _thresh_labels_above(y_pred, img, args.intensity_thresh, args.intensity_thresh_percentile, intensity_thresh_in_channels, intensity_thresh_out_channel, nb_orig_channels) y_pred = np.delete(y_pred, bad_channels, axis=-1) if args.erosion_radius > 0: y_pred = binary_erosion(y_pred, structure).astype(y_pred.dtype)
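After the final rename the two scalar options read consistently: --intensity_thresh is an absolute cutoff, while --intensity_thresh_percentile derives a cutoff from each image. A small sketch of the dispatch inside _thresh_labels_above (pick_threshold is a hypothetical helper, not repo code):

    import numpy as np

    def pick_threshold(img, intensity_thresh=None, intensity_thresh_percentile=None):
        # An absolute cutoff wins; otherwise fall back to a per-image percentile.
        if intensity_thresh:
            return intensity_thresh
        if intensity_thresh_percentile:
            return np.percentile(img, intensity_thresh_percentile)
        raise ValueError('set one of the two thresholds')

    img = np.arange(100, dtype=np.float32).reshape(10, 10)
    assert pick_threshold(img, intensity_thresh=42.0) == 42.0
    assert np.isclose(pick_threshold(img, intensity_thresh_percentile=90), 89.1)

Note that, as in the patch, truthiness is used rather than an explicit None check, so a threshold of exactly 0 would be silently skipped.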