From f95e1002fe57314403d36032569f7168f8e766c6 Mon Sep 17 00:00:00 2001 From: Gigon Bae Date: Mon, 30 Oct 2023 16:14:33 -0700 Subject: [PATCH] Fix iterator-related memory issues (#620) MERGE after https://github.com/rapidsai/cucim/pull/618 ### Fix memory corruption issue in ThreadBatchDataLoader Fix #592 - Return a shared pointer of CuImageIterator, taking ownership of CuImageIterator instance, instead of returning a copy of CuImageIterator - Do not create new instance for CuImageIterator when `__iter__` is called. Instead, return self. - Wait until all threads are finished in ThreadBatchDataLoader destructor ### Fix memory leak in ThreadBatchDataLoader Fix #598 - Throw StopIteration exception early - Do not set nullptr for `raster_data_` in `ThreadBatchDataLoader::next_data()` - Setting nullptr prevents freeing memory in the destructor of ThreadBatchDataLoader ### Fix fatal errors for cache object - GIL is not acquired when using Python object. ### Add iterator-related test cases for memory issue Authors: - Gigon Bae (https://github.com/gigony) Approvers: - Gregory Lee (https://github.com/grlee77) - https://github.com/jakirkham URL: https://github.com/rapidsai/cucim/pull/620 --- cpp/src/loader/thread_batch_data_loader.cpp | 14 ++-- .../clara/test_read_region_memory_usage.py | 72 ++++++++++++++++++- .../tests/unit/clara/test_tiff_read_region.py | 39 +++++++++- python/pybind11/cache/cache_py.cpp | 9 ++- python/pybind11/cucim_py.cpp | 22 +++--- 5 files changed, 131 insertions(+), 25 deletions(-) diff --git a/cpp/src/loader/thread_batch_data_loader.cpp b/cpp/src/loader/thread_batch_data_loader.cpp index 89de8e677..9aff2d88c 100644 --- a/cpp/src/loader/thread_batch_data_loader.cpp +++ b/cpp/src/loader/thread_batch_data_loader.cpp @@ -1,6 +1,6 @@ /* * Apache License, Version 2.0 - * Copyright 2021 NVIDIA Corporation + * Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in 
compliance with the License. @@ -86,6 +86,9 @@ ThreadBatchDataLoader::ThreadBatchDataLoader(LoadFunc load_func, ThreadBatchDataLoader::~ThreadBatchDataLoader() { + // Wait until all tasks are done. + while (wait_batch() > 0); + cucim::io::DeviceType device_type = out_device_.type(); for (auto& raster_ptr : raster_data_) { @@ -207,7 +210,8 @@ uint8_t* ThreadBatchDataLoader::next_data() { if (num_workers_ == 0) // (location_len == 1 && batch_size == 1) { - // If it reads entire image with multi threads (using loader), release raster memory from batch data loader. + // If it reads entire image with multi threads (using loader), release raster memory from batch data loader + // by setting it to nullptr so that it will not be freed by ~ThreadBatchDataLoader (destructor). uint8_t* batch_raster_ptr = raster_data_[0]; raster_data_[0] = nullptr; return batch_raster_ptr; @@ -215,11 +219,7 @@ uint8_t* ThreadBatchDataLoader::next_data() if (processed_batch_count_ * batch_size_ >= location_len_) { - // Remove buffer items that are no longer needed. - for (size_t i = 0; i < buffer_item_len_; ++i) - { - raster_data_[i] = nullptr; - } + // If all batches are processed, return nullptr. return nullptr; } diff --git a/python/cucim/tests/performance/clara/test_read_region_memory_usage.py b/python/cucim/tests/performance/clara/test_read_region_memory_usage.py index aba340eea..5597e0866 100644 --- a/python/cucim/tests/performance/clara/test_read_region_memory_usage.py +++ b/python/cucim/tests/performance/clara/test_read_region_memory_usage.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -95,3 +95,73 @@ def test_read_random_region_cpu_memleak(testimg_tiff_stripe_4096x4096_256): # Memory usage difference should be smaller than (iteration) * 100 bytes assert mem_usage_history[-1] - mem_usage_history[1] < iteration * 100 + + +def test_tiff_iterator(testimg_tiff_stripe_4096x4096_256): + """Verify that the iterator of read_region does not cause a memory leak. + See issue gh-598: https://github.com/rapidsai/cucim/issues/598 + """ + + import numpy as np + import random + import os + import psutil + + slide_path = testimg_tiff_stripe_4096x4096_256 + + level = 0 + size = (300, 300) + batch_size = 64 + num_workers = 16 + + # This may be a non-integer number, to have the last iteration return a + # number of patches < batch_size + num_batches = 4.3 + locations = [ + (random.randint(0, 1000), random.randint(2000, 3000)) + for _ in range(int(batch_size * num_batches)) + ] + print( + f"Number of locations: {len(locations)}, batch size: {batch_size}, " + f"number of workers: {num_workers}" + ) + + def get_total_mem_usage(): + current_process = psutil.Process(os.getpid()) + mem = current_process.memory_info().rss + for child in current_process.children(recursive=True): + mem += child.memory_info().rss + return mem + + with open_image_cucim(slide_path) as slide: + start_mem = None + mem_usage_history = [] + for i in range(101): + gen = slide.read_region( + locations, + size, + level, + batch_size=batch_size, + num_workers=num_workers, + ) + for x in gen: + _ = np.asarray(x) + + if i % 10 == 0: + if start_mem is None: + start_mem = get_total_mem_usage() + print( + f"starting with: {(start_mem) // (1024 * 1024)}" + " MB of memory consumption" + ) + else: + memory_increase = (get_total_mem_usage() - start_mem) // ( + 1024 * 1024 + ) + mem_usage_history.append(memory_increase) + print( + f"mem increase (iteration: {i:3d}): " + f"{memory_increase:4d} MB" + ) + # Memory usage difference should be less than 20MB + assert 
mem_usage_history[-1] - mem_usage_history[1] < 20 diff --git a/python/cucim/tests/unit/clara/test_tiff_read_region.py b/python/cucim/tests/unit/clara/test_tiff_read_region.py index 9fafe431c..5b122fb50 100644 --- a/python/cucim/tests/unit/clara/test_tiff_read_region.py +++ b/python/cucim/tests/unit/clara/test_tiff_read_region.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -196,3 +196,40 @@ def test_cuda_array_interface_support(testimg_tiff_stripe_32x24_16_jpeg): assert array_interface["version"] == 3 assert array_interface["mask"] is None assert array_interface["stream"] == 1 + + +def test_tiff_iterator(testimg_tiff_stripe_4096x4096_256): + """Test that the iterator of read_region works as expected. + See issue gh-592: https://github.com/rapidsai/cucim/issues/592 + """ + + level = 0 + size = (256, 256) + + with open_image_cucim(testimg_tiff_stripe_4096x4096_256) as slide: + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + + with open_image_cucim(testimg_tiff_stripe_4096x4096_256) as slide: + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + + with open_image_cucim(testimg_tiff_stripe_4096x4096_256) as slide: + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region( + [(0, 0), (0, 0), (0, 0)], size, level, num_workers=1 + ) + _ = next(x) + x = 
slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) + x = slide.read_region([(0, 0), (0, 0)], size, level, num_workers=1) + _ = next(x) diff --git a/python/pybind11/cache/cache_py.cpp b/python/pybind11/cache/cache_py.cpp index 6b176e93e..b8f09f8ed 100644 --- a/python/pybind11/cache/cache_py.cpp +++ b/python/pybind11/cache/cache_py.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,13 +53,13 @@ void init_cache(py::module& cache) py::call_guard()) .def_property("free_memory", &ImageCache::free_memory, nullptr, doc::ImageCache::doc_free_memory, py::call_guard()) - .def("record", &py_record, doc::ImageCache::doc_record, py::call_guard(), // + .def("record", &py_record, doc::ImageCache::doc_record, // py::arg("value") = py::none()) .def_property("hit_count", &ImageCache::hit_count, nullptr, doc::ImageCache::doc_hit_count, py::call_guard()) .def_property("miss_count", &ImageCache::miss_count, nullptr, doc::ImageCache::doc_miss_count, py::call_guard()) - .def("reserve", &py_image_cache_reserve, doc::ImageCache::doc_reserve, py::call_guard(), // + .def("reserve", &py_image_cache_reserve, doc::ImageCache::doc_reserve, // py::arg("memory_capacity")); cache.def("preferred_memory_capacity", &py_preferred_memory_capacity, doc::doc_preferred_memory_capacity, // @@ -67,8 +67,7 @@ void init_cache(py::module& cache) py::arg("image_size") = std::nullopt, // py::arg("tile_size") = std::nullopt, // py::arg("patch_size") = std::nullopt, // - py::arg("bytes_per_pixel") = 3, // - py::call_guard()); + py::arg("bytes_per_pixel") = 3); } bool py_record(ImageCache& cache, py::object value) diff --git a/python/pybind11/cucim_py.cpp b/python/pybind11/cucim_py.cpp index 
c3c88bb34..62f041e2d 100644 --- a/python/pybind11/cucim_py.cpp +++ b/python/pybind11/cucim_py.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,7 +107,7 @@ PYBIND11_MODULE(_cucim, m) py::class_>(m, "CuImage", py::dynamic_attr()) // .def(py::init(), doc::CuImage::doc_CuImage, py::call_guard(), // py::arg("path")) // - .def_static("cache", &py_cache, doc::CuImage::doc_cache, py::call_guard(), // + .def_static("cache", &py_cache, doc::CuImage::doc_cache, // py::arg("type") = py::none()) // .def_static("profiler", &py_profiler, doc::CuImage::doc_profiler, py::call_guard()) // .def_property_readonly_static("is_trace_enabled", &py_is_trace_enabled, doc::CuImage::doc_is_trace_enabled, @@ -200,7 +200,7 @@ PYBIND11_MODULE(_cucim, m) }, py::call_guard()); - py::class_>(m, "CuImageIterator") // + py::class_, std::shared_ptr>>(m, "CuImageIterator") // .def(py::init, bool>(), doc::CuImageIterator::doc_CuImageIterator, py::arg("cuimg"), // py::arg("ending") = false, py::call_guard()) @@ -212,8 +212,8 @@ PYBIND11_MODULE(_cucim, m) py::call_guard()) .def( "__iter__", // - [](CuImageIterator& it) { // - return CuImageIterator(it); // + [](CuImageIterator* it) { // + return it; // }, // py::call_guard()) .def("__next__", &py_cuimage_iterator_next, py::call_guard()) @@ -517,11 +517,11 @@ py::object py_read_region(const CuImage& cuimg, auto loader = region_ptr->loader(); if (batch_size > 1 || (loader && loader->size() > 1)) { - auto iter_ptr = region_ptr->begin(); + auto iter_ptr = std::make_shared>(region_ptr->shared_from_this()); py::gil_scoped_acquire scope_guard; - py::object iter = py::cast(iter_ptr); + py::object iter = py::cast(iter_ptr, py::return_value_policy::take_ownership); return iter; } @@ -557,6 +557,10 @@ py::object py_associated_image(const CuImage& cuimg, 
const std::string& name, co py::object py_cuimage_iterator_next(CuImageIterator& it) { bool stop_iteration = (it.index() == it.size()); + if (stop_iteration) + { + throw py::stop_iteration(); + } // Get the next batch of images. ++it; @@ -573,10 +577,6 @@ py::object py_cuimage_iterator_next(CuImageIterator& it) { _set_array_interface(cuimg_obj); } - if (stop_iteration) - { - throw py::stop_iteration(); - } return cuimg_obj; } }