2583 Add DatasetSummary #2616

Merged · 13 commits · Jul 30, 2021
4 changes: 4 additions & 0 deletions docs/source/data.rst
@@ -182,6 +182,10 @@ DistributedWeightedRandomSampler
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: monai.data.DistributedWeightedRandomSampler

DatasetCalculator
~~~~~~~~~~~~~~~~~
.. autoclass:: monai.data.DatasetCalculator

Decathlon Datalist
~~~~~~~~~~~~~~~~~~
.. autofunction:: monai.data.load_decathlon_datalist
1 change: 1 addition & 0 deletions monai/data/__init__.py
@@ -23,6 +23,7 @@
SmartCacheDataset,
ZipDataset,
)
from .dataset_calculator import DatasetCalculator
from .decathlon_datalist import load_decathlon_datalist, load_decathlon_properties
from .grid_dataset import GridPatchDataset, PatchDataset, PatchIter
from .image_dataset import ImageDataset
128 changes: 128 additions & 0 deletions monai/data/dataset_calculator.py
@@ -0,0 +1,128 @@
# Copyright 2020 - 2021 MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import multiprocessing as mp
from typing import Dict, Sequence

import numpy as np

from monai.transforms import LoadImaged


class DatasetCalculator:
"""
This class provides a way to calculate a reasonable output voxel spacing according to
the input dataset. The calculated values can be used to resample the input in 3d segmentation tasks
(for example, as the `pixdim` parameter of `monai.transforms.Spacingd`).
In addition, it supports computing the mean, std, min and max intensities of the input,
and these statistics are helpful for image normalization
(for example, in `monai.transforms.ScaleIntensityRanged` and `monai.transforms.NormalizeIntensityd`).

The calculation algorithm follows:
`Automated Design of Deep Learning Methods for Biomedical Image Segmentation <https://arxiv.org/abs/1904.08128>`_.

"""

def __init__(
self,
datalist: Sequence[Dict],
image_key: str = "image",
label_key: str = "label",
meta_key_postfix: str = "meta_dict",
num_processes: int = 1,
):
"""
Args:
datalist: a list that contains the paths of all images and labels. The list
consists of dictionaries, and each dictionary contains the image and label
paths of one sample. For datasets in the Decathlon format, the datalist can be
obtained by calling `monai.data.load_decathlon_datalist`.
image_key: the key name of images. Defaults to `image`.
label_key: the key name of labels. Defaults to `label`.
meta_key_postfix: for nifti images, use `{image_key}_{meta_key_postfix}` to
store the metadata of images.
num_processes: the maximum number of processes that can be used in data loading.

"""

self.datalist = datalist
self.image_key = image_key
self.label_key = label_key
self.meta_key_postfix = meta_key_postfix
self.num_processes = num_processes
self.loader = LoadImaged(keys=[image_key, label_key], meta_key_postfix=meta_key_postfix)

def _run_parallel(self, function):
"""
Run the function in parallel over all data in the datalist.

"""
with mp.Pool(processes=self.num_processes) as pool:
result = pool.map(function, self.datalist)
return result

def _load_spacing(self, path_dict: Dict):
"""
Load the voxel spacing from one sample's path dictionary. This assumes that the
original image file has `pixdim` in its metadata.

"""
data = self.loader(path_dict)
meta_key = "{}_{}".format(self.image_key, self.meta_key_postfix)
spacing = data[meta_key]["pixdim"][1:4].tolist()

return spacing

def _get_target_spacing(self, anisotropic_threshold: int = 3, percentile: float = 10.0):
"""
Calculate the target spacing as the per-axis median of all loaded spacings.
If the target spacing is very anisotropic (the ratio of the largest to the
smallest axis is at least `anisotropic_threshold`), replace the value of the
largest axis with the given percentile of that axis's spacings.

"""
spacing = self._run_parallel(self._load_spacing)
spacing = np.array(spacing)
target_spacing = np.median(spacing, axis=0)
if max(target_spacing) / min(target_spacing) >= anisotropic_threshold:
largest_axis = np.argmax(target_spacing)
target_spacing[largest_axis] = np.percentile(spacing[:, largest_axis], percentile)

output = list(target_spacing)
output = [round(value, 2) for value in output]

return tuple(output)
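
# Illustrative example of the anisotropy rule above (numbers are hypothetical):
# if the per-axis medians are (0.8, 0.8, 3.2), then 3.2 / 0.8 = 4.0 >= 3, so the
# largest axis is replaced by the 10th percentile of its spacings across the
# dataset, e.g. 2.5, giving a target spacing of (0.8, 0.8, 2.5).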

def _load_intensity(self, path_dict: Dict):
"""
Load the foreground intensity values (voxels where the label is positive) from
one sample's path dictionary.

"""
data = self.loader(path_dict)
image = data[self.image_key]
foreground_idx = np.where(data[self.label_key] > 0)

return image[foreground_idx].tolist()

def _get_intensity_stats(self, lower: float = 0.5, upper: float = 99.5):
"""
Calculate the min, max, mean and std of all foreground intensities. The min and
max values are taken at the `lower` and `upper` percentiles rather than the
absolute extremes, to reduce the influence of outliers.

"""
intensity = self._run_parallel(self._load_intensity)
intensity = np.array(list(itertools.chain.from_iterable(intensity)))
min_value, max_value = np.percentile(intensity, [lower, upper])
mean_value, std_value = np.mean(intensity), np.std(intensity)
output = [min_value, max_value, mean_value, std_value]
output = [round(value, 2) for value in output]

return tuple(output)
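
A minimal usage sketch of the new class (not part of the diff): the dataset path below
is hypothetical, and the private methods are called directly, as the PR's own test does.

from monai.data import DatasetCalculator, load_decathlon_datalist

# load a Decathlon-style datalist; each entry is {"image": ..., "label": ...}
datalist = load_decathlon_datalist("/data/Task09_Spleen/dataset.json", data_list_key="training")

calculator = DatasetCalculator(datalist, image_key="image", label_key="label", num_processes=4)
spacing = calculator._get_target_spacing(anisotropic_threshold=3, percentile=10.0)
stats = calculator._get_intensity_stats(lower=0.5, upper=99.5)  # (min, max, mean, std)

# `spacing` can then feed monai.transforms.Spacingd(keys=["image", "label"], pixdim=spacing),
# and the (min, max) pair can feed ScaleIntensityRanged(a_min=stats[0], a_max=stats[1], ...).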
1 change: 1 addition & 0 deletions tests/min_tests.py
@@ -111,6 +111,7 @@ def run_testsuit():
"test_handler_metrics_saver",
"test_handler_metrics_saver_dist",
"test_handler_classification_saver_dist",
"test_dataset_calculator",
"test_deepgrow_transforms",
"test_deepgrow_interaction",
"test_deepgrow_dataset",
50 changes: 50 additions & 0 deletions tests/test_dataset_calculator.py
@@ -0,0 +1,50 @@
# Copyright 2020 - 2021 MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import os
import tempfile
import unittest

import nibabel as nib
import numpy as np

from monai.data import DatasetCalculator, create_test_image_3d
from monai.utils import set_determinism


class TestDatasetCalculator(unittest.TestCase):
def test_spacing_intensity(self):
set_determinism(seed=0)
with tempfile.TemporaryDirectory() as tempdir:

for i in range(5):
im, seg = create_test_image_3d(32, 32, 32, num_seg_classes=1, num_objs=3, rad_max=6, channel_dim=-1)
n = nib.Nifti1Image(im, np.eye(4))
nib.save(n, os.path.join(tempdir, f"img{i:d}.nii.gz"))
n = nib.Nifti1Image(seg, np.eye(4))
nib.save(n, os.path.join(tempdir, f"seg{i:d}.nii.gz"))

train_images = sorted(glob.glob(os.path.join(tempdir, "img*.nii.gz")))
train_labels = sorted(glob.glob(os.path.join(tempdir, "seg*.nii.gz")))
data_dicts = [
{"image": image_name, "label": label_name} for image_name, label_name in zip(train_images, train_labels)
]

calculator = DatasetCalculator(data_dicts, num_processes=2)
target_spacing = calculator._get_target_spacing(anisotropic_threshold=3, percentile=10.0)
self.assertEqual(target_spacing, (1.0, 1.0, 1.0))
intensity_stats = calculator._get_intensity_stats(lower=0.5, upper=99.5)
self.assertEqual(intensity_stats, (0.56, 1.0, 0.89, 0.13))


if __name__ == "__main__":
unittest.main()