From f95e1002fe57314403d36032569f7168f8e766c6 Mon Sep 17 00:00:00 2001 From: Gigon Bae Date: Mon, 30 Oct 2023 16:14:33 -0700 Subject: [PATCH] Fix iterator-related memory issues (#620) MERGE after https://github.com/rapidsai/cucim/pull/618 ### Fix memory corruption issue in ThreadBatchDataLoader Fix #592 - Return a shared pointer of CuImageIterator, taking ownership of CuImageIterator instance, instead of returning a copy of CuImageIterator - Do not create new instance for CuImageIterator when `__iter__` is called. Instead, return self. - Wait until all threads are finished in ThreadBatchDataLoader destructor ### Fix memory leak in ThreadBatchDataLoader Fix #598 - Throw StopIteration exception early - Do not set nullptr for `raster_data_` in `ThreadBatchDataLoader::next_data()` - Setting nullptr prevents freeing memory in the destructor of ThreadBatchDataLoader ### Fix fatal errors for cache object - GIL is not acquired when using Python object. ### Add iterator-related test cases for memory issue Authors: - Gigon Bae (https://github.com/gigony) Approvers: - Gregory Lee (https://github.com/grlee77) - https://github.com/jakirkham URL: https://github.com/rapidsai/cucim/pull/620 --- cpp/src/loader/thread_batch_data_loader.cpp | 14 ++-- .../clara/test_read_region_memory_usage.py | 72 ++++++++++++++++++- .../tests/unit/clara/test_tiff_read_region.py | 39 +++++++++- python/pybind11/cache/cache_py.cpp | 9 ++- python/pybind11/cucim_py.cpp | 22 +++--- 5 files changed, 131 insertions(+), 25 deletions(-) diff --git a/cpp/src/loader/thread_batch_data_loader.cpp b/cpp/src/loader/thread_batch_data_loader.cpp index 89de8e677..9aff2d88c 100644 --- a/cpp/src/loader/thread_batch_data_loader.cpp +++ b/cpp/src/loader/thread_batch_data_loader.cpp @@ -1,6 +1,6 @@ /* * Apache License, Version 2.0 - * Copyright 2021 NVIDIA Corporation + * Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in 
compliance with the License. @@ -86,6 +86,9 @@ ThreadBatchDataLoader::ThreadBatchDataLoader(LoadFunc load_func, ThreadBatchDataLoader::~ThreadBatchDataLoader() { + // Wait until all tasks are done. + while (wait_batch() > 0); + cucim::io::DeviceType device_type = out_device_.type(); for (auto& raster_ptr : raster_data_) { @@ -207,7 +210,8 @@ uint8_t* ThreadBatchDataLoader::next_data() { if (num_workers_ == 0) // (location_len == 1 && batch_size == 1) { - // If it reads entire image with multi threads (using loader), release raster memory from batch data loader. + // If it reads entire image with multi threads (using loader), release raster memory from batch data loader + // by setting it to nullptr so that it will not be freed by ~ThreadBatchDataLoader (destructor). uint8_t* batch_raster_ptr = raster_data_[0]; raster_data_[0] = nullptr; return batch_raster_ptr; @@ -215,11 +219,7 @@ uint8_t* ThreadBatchDataLoader::next_data() if (processed_batch_count_ * batch_size_ >= location_len_) { - // Remove buffer items that are no longer needed. - for (size_t i = 0; i < buffer_item_len_; ++i) - { - raster_data_[i] = nullptr; - } + // If all batches are processed, return nullptr. return nullptr; } diff --git a/python/cucim/tests/performance/clara/test_read_region_memory_usage.py b/python/cucim/tests/performance/clara/test_read_region_memory_usage.py index aba340eea..5597e0866 100644 --- a/python/cucim/tests/performance/clara/test_read_region_memory_usage.py +++ b/python/cucim/tests/performance/clara/test_read_region_memory_usage.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -95,3 +95,73 @@ def test_read_random_region_cpu_memleak(testimg_tiff_stripe_4096x4096_256): # Memory usage difference should be smaller than (iteration) * 100 bytes assert mem_usage_history[-1] - mem_usage_history[1] < iteration * 100 + + +def test_tiff_iterator(testimg_tiff_stripe_4096x4096_256): + """Verify that the iterator of read_region does not cause a memory leak. + See issue gh-598: https://github.com/rapidsai/cucim/issues/598 + """ + + import numpy as np + import random + import os + import psutil + + slide_path = testimg_tiff_stripe_4096x4096_256 + + level = 0 + size = (300, 300) + batch_size = 64 + num_workers = 16 + + # This may be a non-integer number, to have the last iteration return a + # number of patches < batch_size + num_batches = 4.3 + locations = [ + (random.randint(0, 1000), random.randint(2000, 3000)) + for _ in range(int(batch_size * num_batches)) + ] + print( + f"Number of locations: {len(locations)}, batch size: {batch_size}, " + f"number of workers: {num_workers}" + ) + + def get_total_mem_usage(): + current_process = psutil.Process(os.getpid()) + mem = current_process.memory_info().rss + for child in current_process.children(recursive=True): + mem += child.memory_info().rss + return mem + + with open_image_cucim(slide_path) as slide: + start_mem = None + mem_usage_history = [] + for i in range(101): + gen = slide.read_region( + locations, + size, + level, + batch_size=batch_size, + num_workers=num_workers, + ) + for x in gen: + _ = np.asarray(x) + + if i % 10 == 0: + if start_mem is None: + start_mem = get_total_mem_usage() + print( + f"starting with: {(start_mem) // (1024 * 1024)}" + " MB of memory consumption" + ) + else: + memory_increase = (get_total_mem_usage() - start_mem) // ( + 1024 * 1024 + ) + mem_usage_history.append(memory_increase) + print( + f"mem increase (iteration: {i:3d}): " + f"{memory_increase:4d} MB" + ) + # Memory usage difference should be less than 20MB + assert 
mem_usage_history[-1] - mem_usage_history[1] < 20 diff --git a/python/cucim/tests/unit/clara/test_tiff_read_region.py b/python/cucim/tests/unit/clara/test_tiff_read_region.py index 9fafe431c..5b122fb50 100644 --- a/python/cucim/tests/unit/clara/test_tiff_read_region.py +++ b/python/cucim/tests/unit/clara/test_tiff_read_region.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -196,3 +196,40 @@ def test_cuda_array_interface_support(testimg_tiff_stripe_32x24_16_jpeg): assert array_interface["version"] == 3 assert array_interface["mask"] is None assert array_interface["stream"] == 1 + + +def test_tiff_iterator(testimg_tiff_stripe_4096x4096_256): + """Test that the iterator of read_region works as expected. + See issue gh-592: https://github.com/rapidsai/cucim/issues/592 + """ + + level = 0 + size = (256, 256) + + with open_image_cucim(testimg_tiff_stripe_4096x4096_256) as slide: + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + + with open_image_cucim(testimg_tiff_stripe_4096x4096_256) as slide: + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + + with open_image_cucim(testimg_tiff_stripe_4096x4096_256) as slide: + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region( + [(0, 0), (0, 0), (0, 0)], size, level, num_workers=1 + ) + _ = next(x) + x = 
slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) diff --git a/python/pybind11/cache/cache_py.cpp b/python/pybind11/cache/cache_py.cpp index 6b176e93e..b8f09f8ed 100644 --- a/python/pybind11/cache/cache_py.cpp +++ b/python/pybind11/cache/cache_py.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,13 +53,13 @@ void init_cache(py::module& cache) py::call_guard()) .def_property("free_memory", &ImageCache::free_memory, nullptr, doc::ImageCache::doc_free_memory, py::call_guard()) - .def("record", &py_record, doc::ImageCache::doc_record, py::call_guard(), // + .def("record", &py_record, doc::ImageCache::doc_record, // py::arg("value") = py::none()) .def_property("hit_count", &ImageCache::hit_count, nullptr, doc::ImageCache::doc_hit_count, py::call_guard()) .def_property("miss_count", &ImageCache::miss_count, nullptr, doc::ImageCache::doc_miss_count, py::call_guard()) - .def("reserve", &py_image_cache_reserve, doc::ImageCache::doc_reserve, py::call_guard(), // + .def("reserve", &py_image_cache_reserve, doc::ImageCache::doc_reserve, // py::arg("memory_capacity")); cache.def("preferred_memory_capacity", &py_preferred_memory_capacity, doc::doc_preferred_memory_capacity, // @@ -67,8 +67,7 @@ void init_cache(py::module& cache) py::arg("image_size") = std::nullopt, // py::arg("tile_size") = std::nullopt, // py::arg("patch_size") = std::nullopt, // - py::arg("bytes_per_pixel") = 3, // - py::call_guard()); + py::arg("bytes_per_pixel") = 3); } bool py_record(ImageCache& cache, py::object value) diff --git a/python/pybind11/cucim_py.cpp b/python/pybind11/cucim_py.cpp index 
c3c88bb34..62f041e2d 100644 --- a/python/pybind11/cucim_py.cpp +++ b/python/pybind11/cucim_py.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,7 +107,7 @@ PYBIND11_MODULE(_cucim, m) py::class_>(m, "CuImage", py::dynamic_attr()) // .def(py::init(), doc::CuImage::doc_CuImage, py::call_guard(), // py::arg("path")) // - .def_static("cache", &py_cache, doc::CuImage::doc_cache, py::call_guard(), // + .def_static("cache", &py_cache, doc::CuImage::doc_cache, // py::arg("type") = py::none()) // .def_static("profiler", &py_profiler, doc::CuImage::doc_profiler, py::call_guard()) // .def_property_readonly_static("is_trace_enabled", &py_is_trace_enabled, doc::CuImage::doc_is_trace_enabled, @@ -200,7 +200,7 @@ PYBIND11_MODULE(_cucim, m) }, py::call_guard()); - py::class_>(m, "CuImageIterator") // + py::class_, std::shared_ptr>>(m, "CuImageIterator") // .def(py::init, bool>(), doc::CuImageIterator::doc_CuImageIterator, py::arg("cuimg"), // py::arg("ending") = false, py::call_guard()) @@ -212,8 +212,8 @@ PYBIND11_MODULE(_cucim, m) py::call_guard()) .def( "__iter__", // - [](CuImageIterator& it) { // - return CuImageIterator(it); // + [](CuImageIterator* it) { // + return it; // }, // py::call_guard()) .def("__next__", &py_cuimage_iterator_next, py::call_guard()) @@ -517,11 +517,11 @@ py::object py_read_region(const CuImage& cuimg, auto loader = region_ptr->loader(); if (batch_size > 1 || (loader && loader->size() > 1)) { - auto iter_ptr = region_ptr->begin(); + auto iter_ptr = std::make_shared>(region_ptr->shared_from_this()); py::gil_scoped_acquire scope_guard; - py::object iter = py::cast(iter_ptr); + py::object iter = py::cast(iter_ptr, py::return_value_policy::take_ownership); return iter; } @@ -557,6 +557,10 @@ py::object py_associated_image(const CuImage& cuimg, 
const std::string& name, co py::object py_cuimage_iterator_next(CuImageIterator& it) { bool stop_iteration = (it.index() == it.size()); + if (stop_iteration) + { + throw py::stop_iteration(); + } // Get the next batch of images. ++it; @@ -573,10 +577,6 @@ py::object py_cuimage_iterator_next(CuImageIterator& it) { _set_array_interface(cuimg_obj); } - if (stop_iteration) - { - throw py::stop_iteration(); - } return cuimg_obj; } }