Fix iterator-related memory issues (#620)
MERGE after #618

### Fix memory corruption issue in ThreadBatchDataLoader

Fix #592    

- Return a shared pointer to the CuImageIterator, taking ownership of the CuImageIterator instance, instead of returning a copy of it.
- Do not create a new CuImageIterator instance when `__iter__` is called; return self instead (see the sketch after this list).
- Wait until all threads are finished in the ThreadBatchDataLoader destructor.
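
The ownership pattern behind the first two bullets, reduced to a minimal pybind11 sketch. This is not cuCIM code: the `Counter` type and `example` module are hypothetical stand-ins for `CuImageIterator<CuImage>` and its binding. The instance is held by `std::shared_ptr`, and `__iter__` returns the incoming pointer, so Python gets the same object back (the equivalent of `return self`) instead of a copy with detached state.

```cpp
#include <pybind11/pybind11.h>

#include <memory>

namespace py = pybind11;

// Hypothetical stand-in for CuImageIterator<CuImage>.
struct Counter
{
    int index = 0;
    int size = 3;
};

PYBIND11_MODULE(example, m)
{
    // The shared_ptr holder lets Python share ownership of the C++ instance.
    py::class_<Counter, std::shared_ptr<Counter>>(m, "Counter")
        .def(py::init<>())
        // Returning the incoming pointer maps back to the already-registered
        // Python object: no copy is created.
        .def("__iter__", [](Counter* it) { return it; })
        .def("__next__", [](Counter& it) {
            if (it.index == it.size)
                throw py::stop_iteration(); // raise before touching any state
            return it.index++;
        });
}
```

Returning a freshly constructed copy from `__iter__` (the pre-fix behavior) hands Python an iterator whose lifetime and position are detached from the instance the loader threads know about, which is the kind of shared-state mismatch gh-592 describes.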

### Fix memory leak in ThreadBatchDataLoader

Fix #598
    
- Throw the StopIteration exception early, before the iterator is advanced.
- Do not set `raster_data_` entries to nullptr in `ThreadBatchDataLoader::next_data()`.
  - Setting them to nullptr prevents the ThreadBatchDataLoader destructor from freeing that memory (see the sketch after this list).
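
Why the nullptr assignment leaked, as a minimal C++ sketch. This is not cuCIM code: `Loader` is a hypothetical stand-in for ThreadBatchDataLoader. The destructor frees only non-null entries, so clearing `raster_data_` inside `next_data()` once iteration ends orphans the allocations.

```cpp
#include <cstdint>
#include <cstdlib>
#include <vector>

// Hypothetical stand-in for ThreadBatchDataLoader.
struct Loader
{
    std::vector<uint8_t*> raster_data_;

    explicit Loader(size_t n)
    {
        for (size_t i = 0; i < n; ++i)
            raster_data_.push_back(static_cast<uint8_t*>(std::malloc(1024)));
    }

    // Placeholder for "all batches have been consumed".
    bool exhausted() const { return true; }

    uint8_t* next_data()
    {
        if (exhausted())
        {
            // Pre-fix: every raster_data_[i] was set to nullptr here, so the
            // destructor below skipped them and the buffers leaked.
            // Post-fix: leave the entries alone and just signal the end.
            return nullptr;
        }
        return raster_data_[0];
    }

    ~Loader()
    {
        for (uint8_t* ptr : raster_data_)
        {
            if (ptr) // only non-null entries are freed
                std::free(ptr);
        }
    }
};

int main()
{
    Loader loader(4);
    loader.next_data(); // with the fix, ~Loader still frees all 4 buffers
    return 0;
}
```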

### Fix fatal errors for cache object

- The GIL was not acquired when using Python objects: several bindings released it via `py::call_guard<py::gil_scoped_release>` even though they create or return Python objects, causing fatal errors. The guard is removed from those methods (a sketch follows).
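
A minimal sketch of the GIL rule this change restores. The function and module names are hypothetical, not the cuCIM bindings: `py::call_guard<py::gil_scoped_release>` is only safe on functions that never touch the Python C API, so the guard is dropped from bindings that create or return `py::object`.

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Pure C++: never touches Python objects, so releasing the GIL is safe and
// lets other Python threads run during the call.
double heavy_compute(double x)
{
    return x * x;
}

// Creates a py::object, so the GIL must be held for the whole call.
py::object wrap_value(double x)
{
    return py::float_(x);
}

PYBIND11_MODULE(example, m)
{
    // OK: no Python API use inside the function body.
    m.def("heavy_compute", &heavy_compute,
          py::call_guard<py::gil_scoped_release>());

    // No release guard here: constructing py::float_ without holding the
    // GIL is undefined behavior and can crash the interpreter.
    m.def("wrap_value", &wrap_value);
}
```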

### Add iterator-related test cases for memory issue

Authors:
  - Gigon Bae (https://github.com/gigony)

Approvers:
  - Gregory Lee (https://github.com/grlee77)
  - https://github.com/jakirkham

URL: #620
gigony authored Oct 30, 2023
1 parent cd54957 commit f95e100
Showing 5 changed files with 131 additions and 25 deletions.
14 changes: 7 additions & 7 deletions cpp/src/loader/thread_batch_data_loader.cpp
@@ -1,6 +1,6 @@
/*
* Apache License, Version 2.0
- * Copyright 2021 NVIDIA Corporation
+ * Copyright 2021-2023 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -86,6 +86,9 @@ ThreadBatchDataLoader::ThreadBatchDataLoader(LoadFunc load_func,

ThreadBatchDataLoader::~ThreadBatchDataLoader()
{
+    // Wait until all tasks are done.
+    while (wait_batch() > 0);

cucim::io::DeviceType device_type = out_device_.type();
for (auto& raster_ptr : raster_data_)
{
@@ -207,19 +210,16 @@ uint8_t* ThreadBatchDataLoader::next_data()
{
if (num_workers_ == 0) // (location_len == 1 && batch_size == 1)
{
-        // If it reads entire image with multi threads (using loader), release raster memory from batch data loader.
+        // If it reads entire image with multi threads (using loader), release raster memory from batch data loader
+        // by setting it to nullptr so that it will not be freed by ~ThreadBatchDataLoader (destructor).
uint8_t* batch_raster_ptr = raster_data_[0];
raster_data_[0] = nullptr;
return batch_raster_ptr;
}

if (processed_batch_count_ * batch_size_ >= location_len_)
{
-        // Remove buffer items that are no longer needed.
-        for (size_t i = 0; i < buffer_item_len_; ++i)
-        {
-            raster_data_[i] = nullptr;
-        }
+        // If all batches are processed, return nullptr.
return nullptr;
}

@@ -1,5 +1,5 @@
#
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2023, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -95,3 +95,73 @@ def test_read_random_region_cpu_memleak(testimg_tiff_stripe_4096x4096_256):

# Memory usage difference should be smaller than (iteration) * 100 bytes
assert mem_usage_history[-1] - mem_usage_history[1] < iteration * 100


def test_tiff_iterator(testimg_tiff_stripe_4096x4096_256):
"""Verify that the iterator of read_region does not cause a memory leak.
See issue gh-598: https://github.com/rapidsai/cucim/issues/598
"""

    import os
    import random

    import numpy as np
    import psutil

slide_path = testimg_tiff_stripe_4096x4096_256

level = 0
size = (300, 300)
batch_size = 64
num_workers = 16

    # This may be a non-integer number, so that the last iteration returns
    # fewer patches than batch_size.
num_batches = 4.3
locations = [
(random.randint(0, 1000), random.randint(2000, 3000))
for _ in range(int(batch_size * num_batches))
]
    print(
        f"Number of locations: {len(locations)}, batch size: {batch_size}, "
        f"number of workers: {num_workers}"
    )

def get_total_mem_usage():
current_process = psutil.Process(os.getpid())
mem = current_process.memory_info().rss
for child in current_process.children(recursive=True):
mem += child.memory_info().rss
return mem

with open_image_cucim(slide_path) as slide:
start_mem = None
mem_usage_history = []
for i in range(101):
gen = slide.read_region(
locations,
size,
level,
batch_size=batch_size,
num_workers=num_workers,
)
for x in gen:
_ = np.asarray(x)

if i % 10 == 0:
if start_mem is None:
start_mem = get_total_mem_usage()
print(
f"starting with: {(start_mem) // (1024 * 1024)}"
" MB of memory consumption"
)
else:
memory_increase = (get_total_mem_usage() - start_mem) // (
1024 * 1024
)
mem_usage_history.append(memory_increase)
                    print(
                        f"mem increase (iteration: {i:3d}): "
                        f"{memory_increase:4d} MB"
                    )
# Memory usage difference should be less than 20MB
assert mem_usage_history[-1] - mem_usage_history[1] < 20
39 changes: 38 additions & 1 deletion python/cucim/tests/unit/clara/test_tiff_read_region.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -196,3 +196,40 @@ def test_cuda_array_interface_support(testimg_tiff_stripe_32x24_16_jpeg):
assert array_interface["version"] == 3
assert array_interface["mask"] is None
assert array_interface["stream"] == 1


def test_tiff_iterator(testimg_tiff_stripe_4096x4096_256):
"""Test that the iterator of read_region works as expected.
See issue gh-592: https://github.com/rapidsai/cucim/issues/592
"""

level = 0
size = (256, 256)

with open_image_cucim(testimg_tiff_stripe_4096x4096_256) as slide:
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)

with open_image_cucim(testimg_tiff_stripe_4096x4096_256) as slide:
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)
_ = next(x)
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)

with open_image_cucim(testimg_tiff_stripe_4096x4096_256) as slide:
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)
_ = next(x)
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)
_ = next(x)
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)
_ = next(x)
x = slide.read_region(
[(0, 0), (0, 0), (0, 0)], size, level, num_workers=1
)
_ = next(x)
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)
_ = next(x)
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)
_ = next(x)
x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1)
_ = next(x)
9 changes: 4 additions & 5 deletions python/pybind11/cache/cache_py.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -53,22 +53,21 @@ void init_cache(py::module& cache)
py::call_guard<py::gil_scoped_release>())
.def_property("free_memory", &ImageCache::free_memory, nullptr, doc::ImageCache::doc_free_memory,
py::call_guard<py::gil_scoped_release>())
.def("record", &py_record, doc::ImageCache::doc_record, py::call_guard<py::gil_scoped_release>(), //
.def("record", &py_record, doc::ImageCache::doc_record, //
py::arg("value") = py::none())
.def_property("hit_count", &ImageCache::hit_count, nullptr, doc::ImageCache::doc_hit_count,
py::call_guard<py::gil_scoped_release>())
.def_property("miss_count", &ImageCache::miss_count, nullptr, doc::ImageCache::doc_miss_count,
py::call_guard<py::gil_scoped_release>())
.def("reserve", &py_image_cache_reserve, doc::ImageCache::doc_reserve, py::call_guard<py::gil_scoped_release>(), //
.def("reserve", &py_image_cache_reserve, doc::ImageCache::doc_reserve, //
py::arg("memory_capacity"));

cache.def("preferred_memory_capacity", &py_preferred_memory_capacity, doc::doc_preferred_memory_capacity, //
py::arg("img") = py::none(), //
py::arg("image_size") = std::nullopt, //
py::arg("tile_size") = std::nullopt, //
py::arg("patch_size") = std::nullopt, //
py::arg("bytes_per_pixel") = 3, //
py::call_guard<py::gil_scoped_release>());
py::arg("bytes_per_pixel") = 3);
}

bool py_record(ImageCache& cache, py::object value)
22 changes: 11 additions & 11 deletions python/pybind11/cucim_py.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -107,7 +107,7 @@ PYBIND11_MODULE(_cucim, m)
py::class_<CuImage, std::shared_ptr<CuImage>>(m, "CuImage", py::dynamic_attr()) //
.def(py::init<const std::string&>(), doc::CuImage::doc_CuImage, py::call_guard<py::gil_scoped_release>(), //
py::arg("path")) //
.def_static("cache", &py_cache, doc::CuImage::doc_cache, py::call_guard<py::gil_scoped_release>(), //
.def_static("cache", &py_cache, doc::CuImage::doc_cache, //
py::arg("type") = py::none()) //
.def_static("profiler", &py_profiler, doc::CuImage::doc_profiler, py::call_guard<py::gil_scoped_release>()) //
.def_property_readonly_static("is_trace_enabled", &py_is_trace_enabled, doc::CuImage::doc_is_trace_enabled,
@@ -200,7 +200,7 @@ PYBIND11_MODULE(_cucim, m)
},
py::call_guard<py::gil_scoped_release>());

-    py::class_<CuImageIterator<CuImage>>(m, "CuImageIterator") //
+    py::class_<CuImageIterator<CuImage>, std::shared_ptr<CuImageIterator<CuImage>>>(m, "CuImageIterator") //
.def(py::init<std::shared_ptr<CuImage>, bool>(), doc::CuImageIterator::doc_CuImageIterator,
py::arg("cuimg"), //
py::arg("ending") = false, py::call_guard<py::gil_scoped_release>())
@@ -212,8 +212,8 @@ PYBIND11_MODULE(_cucim, m)
py::call_guard<py::gil_scoped_release>())
.def(
"__iter__", //
-            [](CuImageIterator<CuImage>& it) { //
-                return CuImageIterator<CuImage>(it); //
+            [](CuImageIterator<CuImage>* it) { //
+                return it; //
}, //
py::call_guard<py::gil_scoped_release>())
.def("__next__", &py_cuimage_iterator_next, py::call_guard<py::gil_scoped_release>())
@@ -517,11 +517,11 @@ py::object py_read_region(const CuImage& cuimg,
auto loader = region_ptr->loader();
if (batch_size > 1 || (loader && loader->size() > 1))
{
-        auto iter_ptr = region_ptr->begin();
+        auto iter_ptr = std::make_shared<CuImageIterator<CuImage>>(region_ptr->shared_from_this());

py::gil_scoped_acquire scope_guard;

-        py::object iter = py::cast(iter_ptr);
+        py::object iter = py::cast(iter_ptr, py::return_value_policy::take_ownership);

return iter;
}
@@ -557,6 +557,10 @@ py::object py_associated_image(const CuImage& cuimg, const std::string& name, co
py::object py_cuimage_iterator_next(CuImageIterator<CuImage>& it)
{
bool stop_iteration = (it.index() == it.size());
+    if (stop_iteration)
+    {
+        throw py::stop_iteration();
+    }

// Get the next batch of images.
++it;
@@ -573,10 +577,6 @@ py::object py_cuimage_iterator_next(CuImageIterator<CuImage>& it)
{
_set_array_interface(cuimg_obj);
}
-    if (stop_iteration)
-    {
-        throw py::stop_iteration();
-    }
return cuimg_obj;
}
}
