Correctly use dtype when computing shared memory requirements of separable convolution (#409)

closes #408

The first commit here fixes an error in the keyword arguments passed to the shared memory calculation. The second commit broadens the existing fallback so that any other anticipated compilation failure also falls back to the elementwise implementation.
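
To make the first fix concrete: the old signature of _get_smem_shape placed an unused anchor=None parameter between halo_size and image_dtype, while _check_smem_availability passed image_dtype positionally. The dtype therefore bound to the anchor slot, image_dtype silently kept its cp.float32 default, and the shared memory requirement was underestimated for wider dtypes such as float64. A minimal, self-contained sketch of that failure mode, using a toy helper rather than the real cucim internals:

    import numpy as np

    def get_smem_nbytes(halo_size, anchor=None, image_dtype=np.float32):
        # Toy stand-in for _get_smem_shape: the byte count scales with the
        # dtype's itemsize.
        return (32 + 2 * halo_size) * np.dtype(image_dtype).itemsize

    # Buggy call: np.float64 binds positionally to ``anchor``, so the size
    # is still computed for the float32 default.
    buggy = get_smem_nbytes(8, np.float64)

    # Fixed call: keywords bind each value to the intended parameter.
    fixed = get_smem_nbytes(halo_size=8, image_dtype=np.float64)

    assert buggy == 48 * 4  # underestimated: float64 actually needs 48 * 8
    assert fixed == 48 * 8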

Authors:
  - Gregory Lee (https://github.com/grlee77)

Approvers:
  - https://github.com/jakirkham
  - Gigon Bae (https://github.com/gigony)

URL: #409
grlee77 authored Oct 5, 2022
1 parent 0de4b89 commit f047273
Showing 3 changed files with 36 additions and 9 deletions.
24 changes: 17 additions & 7 deletions python/cucim/src/cucim/skimage/_vendored/_ndimage_filters.py
@@ -8,6 +8,14 @@
 from cucim.skimage._vendored import _ndimage_filters_core as _filters_core
 from cucim.skimage._vendored import _ndimage_util as _util
 
+from cucim.skimage.filters._separable_filtering import (ResourceLimitError,
+                                                        _shmem_convolve1d)
+try:
+    from cupy.cuda.compiler import CompileException
+    compile_errors = (ResourceLimitError, CompileException)
+except ImportError:
+    compile_errors = (ResourceLimitError,)
+
 
 def correlate(input, weights, output=None, mode='reflect', cval=0.0, origin=0):
     """Multi-dimensional correlate.
@@ -182,7 +190,9 @@ def _correlate_or_convolve1d(input, weights, axis, output, mode, cval, origin,
                              convolution=False, algorithm=None):
     # Calls fast shared-memory convolution when possible, otherwise falls back
     # to the vendored elementwise _correlate_or_convolve
+    default_algorithm = False
     if algorithm is None:
+        default_algorithm = True
         if input.ndim == 2 and weights.size <= 256:
             algorithm = 'shared_memory'
         else:
@@ -194,8 +204,6 @@ def _correlate_or_convolve1d(input, weights, axis, output, mode, cval, origin,
     if mode == 'wrap':
         mode = 'grid-wrap'
     if algorithm == 'shared_memory':
-        from cucim.skimage.filters._separable_filtering import (
-            ResourceLimitError, _shmem_convolve1d)
         if input.ndim not in [2, 3]:
             raise NotImplementedError(
                 f"shared_memory not implemented for ndim={input.ndim}"
@@ -205,12 +213,14 @@ def _correlate_or_convolve1d(input, weights, axis, output, mode, cval, origin,
                                     mode=mode, cval=cval, origin=origin,
                                     convolution=convolution)
             return out
-        except ResourceLimitError:
+        except compile_errors:
             # fallback to elementwise if inadequate shared memory available
-            warnings.warn(
-                "Inadequate resources for algorithm='shared_memory': "
-                "falling back to the elementwise implementation"
-            )
+            if not default_algorithm:
+                # only warn if 'shared_memory' was explicitly requested
+                warnings.warn(
+                    "Inadequate resources for algorithm='shared_memory': "
+                    "falling back to the elementwise implementation"
+                )
             algorithm = 'elementwise'
     if algorithm == 'elementwise':
         weights, origins = _filters_core._convert_1d_args(
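
The broadened except clause works because Python lets a single handler catch a tuple of exception types: compile_errors covers both cucim's own ResourceLimitError pre-check and CuPy's CompileException, so a kernel that passes the pre-check but still fails to compile now falls back as well. A minimal sketch of the pattern with stand-in launchers, not the cucim internals:

    # Stand-in for cucim's shared-memory pre-check error.
    class ResourceLimitError(RuntimeError):
        pass

    try:
        from cupy.cuda.compiler import CompileException
        compile_errors = (ResourceLimitError, CompileException)
    except ImportError:
        # CuPy is unavailable, so only the pre-check error can occur.
        compile_errors = (ResourceLimitError,)

    def convolve_with_fallback(launch_fast, launch_slow):
        try:
            return launch_fast()
        except compile_errors:
            # Either the pre-check or the actual compilation rejected the
            # fast shared-memory kernel; retry on the elementwise path.
            return launch_slow()
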
9 changes: 7 additions & 2 deletions python/cucim/src/cucim/skimage/filters/_separable_filtering.py
@@ -63,7 +63,7 @@ def _get_constants(ndim, axis, kernel_size, anchor, patch_per_block=None):
     return block, patch_per_block, halo_size
 
 
-def _get_smem_shape(ndim, axis, block, patch_per_block, halo_size, anchor=None,
+def _get_smem_shape(ndim, axis, block, patch_per_block, halo_size,
                     image_dtype=cp.float32):
     bx, by, bz = block
     if ndim == 2:
@@ -128,7 +128,12 @@ def _check_smem_availability(ndim, axis, kernel_size, anchor=None,
         ndim, axis, kernel_size, anchor=anchor, patch_per_block=patch_per_block
     )
     shape, nbytes = _get_smem_shape(
-        ndim, axis, block, patch_per_block, halo_size, image_dtype
+        ndim=ndim,
+        axis=axis,
+        block=block,
+        patch_per_block=patch_per_block,
+        halo_size=halo_size,
+        image_dtype=image_dtype
     )
     props = _get_shmem_limits(device_id=device_id)
     if nbytes > props['shared_block']:
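
The 'shared_block' limit that the availability check compares nbytes against is the device's per-block shared memory capacity. With CuPy this can be queried roughly as follows (a sketch of the idea, not the exact _get_shmem_limits helper):

    import cupy as cp

    # Per-block shared memory capacity of the current device, in bytes
    # (CUDA attribute cudaDevAttrMaxSharedMemoryPerBlock).
    limit = cp.cuda.Device().attributes['MaxSharedMemoryPerBlock']
    print(f"shared memory per block: {limit} bytes")
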
12 changes: 12 additions & 0 deletions python/cucim/src/cucim/skimage/filters/tests/test_gaussian.py
@@ -184,3 +184,15 @@ def test_dog_invalid_sigma2():
         difference_of_gaussians(image, 3, 2)
     with pytest.raises(ValueError):
         difference_of_gaussians(image, (1, 5), (2, 4))
+
+
+@pytest.mark.parametrize(
+    'dtype', [cp.uint8, cp.float16, cp.float32, cp.float64]
+)
+@pytest.mark.parametrize('sigma', range(1, 40, 5))
+def test_shared_mem_check_fix(dtype, sigma):
+    # Verify fix for gh-408 (no compilation errors occur).
+    # Prior to the fix in gh-409, some float64 cases failed.
+    # The exact range that fails depends on the shared memory limit
+    # of the GPU, so we test with a range of sigmas here.
+    gaussian(cp.ones((512, 512), dtype=dtype), sigma=sigma)
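
For reference, on an affected build the gh-408 failure could be triggered by a single call along these lines; the sigma at which compilation fails depends on the GPU's shared memory limit, so the value below is only illustrative:

    import cupy as cp
    from cucim.skimage.filters import gaussian

    # Before this fix, a large-sigma float64 filter could request more
    # shared memory than the device provides, surfacing as a compilation
    # error instead of falling back to the elementwise implementation.
    gaussian(cp.ones((512, 512), dtype=cp.float64), sigma=31)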
