Correctly use dtype when computing shared memory requirements of separable convolution (#409)

closes #408

The first commit here fixes an error in the keyword arguments passed to the shared memory calculation. The second commit broadens the existing fallback so that any other anticipated compilation failure also falls back to the elementwise implementation.
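
To make the first fix concrete: the old signature of _get_smem_shape placed an unused anchor=None parameter between halo_size and image_dtype, while _check_smem_availability passed image_dtype positionally. The dtype therefore bound to the anchor slot, image_dtype silently kept its cp.float32 default, and the shared memory requirement was underestimated for wider dtypes such as float64. A minimal, self-contained sketch of that failure mode, using a toy helper rather than the real cucim internals:

    import numpy as np

    def get_smem_nbytes(halo_size, anchor=None, image_dtype=np.float32):
        # Toy stand-in for _get_smem_shape: the byte count scales with the
        # dtype's itemsize.
        return (32 + 2 * halo_size) * np.dtype(image_dtype).itemsize

    # Buggy call: np.float64 binds positionally to ``anchor``, so the size
    # is still computed for the float32 default.
    buggy = get_smem_nbytes(8, np.float64)

    # Fixed call: keywords bind each value to the intended parameter.
    fixed = get_smem_nbytes(halo_size=8, image_dtype=np.float64)

    assert buggy == 48 * 4  # underestimated: float64 actually needs 48 * 8
    assert fixed == 48 * 8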

Authors:
  - Gregory Lee (https://github.com/grlee77)

Approvers:
  - https://github.com/jakirkham
  - Gigon Bae (https://github.com/gigony)

URL: #409
grlee77 authored Oct 5, 2022
1 parent 0de4b89 commit f047273
Showing 3 changed files with 36 additions and 9 deletions.
24 changes: 17 additions & 7 deletions python/cucim/src/cucim/skimage/_vendored/_ndimage_filters.py
@@ -8,6 +8,14 @@
 from cucim.skimage._vendored import _ndimage_filters_core as _filters_core
 from cucim.skimage._vendored import _ndimage_util as _util
 
+from cucim.skimage.filters._separable_filtering import (ResourceLimitError,
+                                                        _shmem_convolve1d)
+try:
+    from cupy.cuda.compiler import CompileException
+    compile_errors = (ResourceLimitError, CompileException)
+except ImportError:
+    compile_errors = (ResourceLimitError,)
+
 
 def correlate(input, weights, output=None, mode='reflect', cval=0.0, origin=0):
     """Multi-dimensional correlate.
@@ -182,7 +190,9 @@ def _correlate_or_convolve1d(input, weights, axis, output, mode, cval, origin,
                              convolution=False, algorithm=None):
     # Calls fast shared-memory convolution when possible, otherwise falls back
     # to the vendored elementwise _correlate_or_convolve
+    default_algorithm = False
     if algorithm is None:
+        default_algorithm = True
         if input.ndim == 2 and weights.size <= 256:
             algorithm = 'shared_memory'
         else:
@@ -194,8 +204,6 @@ def _correlate_or_convolve1d(input, weights, axis, output, mode, cval, origin,
     if mode == 'wrap':
         mode = 'grid-wrap'
     if algorithm == 'shared_memory':
-        from cucim.skimage.filters._separable_filtering import (
-            ResourceLimitError, _shmem_convolve1d)
         if input.ndim not in [2, 3]:
             raise NotImplementedError(
                 f"shared_memory not implemented for ndim={input.ndim}"
@@ -205,12 +213,14 @@ def _correlate_or_convolve1d(input, weights, axis, output, mode, cval, origin,
                                     mode=mode, cval=cval, origin=origin,
                                     convolution=convolution)
             return out
-        except ResourceLimitError:
+        except compile_errors:
             # fallback to elementwise if inadequate shared memory available
-            warnings.warn(
-                "Inadequate resources for algorithm='shared_memory': "
-                "falling back to the elementwise implementation"
-            )
+            if not default_algorithm:
+                # only warn if 'shared_memory' was explicitly requested
+                warnings.warn(
+                    "Inadequate resources for algorithm='shared_memory': "
+                    "falling back to the elementwise implementation"
+                )
             algorithm = 'elementwise'
     if algorithm == 'elementwise':
         weights, origins = _filters_core._convert_1d_args(
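
The broadened except clause works because Python lets a single handler catch a tuple of exception types: compile_errors covers both cucim's own ResourceLimitError pre-check and CuPy's CompileException, so a kernel that passes the pre-check but still fails to compile now falls back as well. A minimal sketch of the pattern with stand-in launchers, not the cucim internals:

    # Stand-in for cucim's shared-memory pre-check error.
    class ResourceLimitError(RuntimeError):
        pass

    try:
        from cupy.cuda.compiler import CompileException
        compile_errors = (ResourceLimitError, CompileException)
    except ImportError:
        # CuPy is unavailable, so only the pre-check error can occur.
        compile_errors = (ResourceLimitError,)

    def convolve_with_fallback(launch_fast, launch_slow):
        try:
            return launch_fast()
        except compile_errors:
            # Either the pre-check or the actual compilation rejected the
            # fast shared-memory kernel; retry on the elementwise path.
            return launch_slow()
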
9 changes: 7 additions & 2 deletions python/cucim/src/cucim/skimage/filters/_separable_filtering.py
@@ -63,7 +63,7 @@ def _get_constants(ndim, axis, kernel_size, anchor, patch_per_block=None):
     return block, patch_per_block, halo_size
 
 
-def _get_smem_shape(ndim, axis, block, patch_per_block, halo_size, anchor=None,
+def _get_smem_shape(ndim, axis, block, patch_per_block, halo_size,
                     image_dtype=cp.float32):
     bx, by, bz = block
     if ndim == 2:
@@ -128,7 +128,12 @@ def _check_smem_availability(ndim, axis, kernel_size, anchor=None,
         ndim, axis, kernel_size, anchor=anchor, patch_per_block=patch_per_block
     )
     shape, nbytes = _get_smem_shape(
-        ndim, axis, block, patch_per_block, halo_size, image_dtype
+        ndim=ndim,
+        axis=axis,
+        block=block,
+        patch_per_block=patch_per_block,
+        halo_size=halo_size,
+        image_dtype=image_dtype
     )
     props = _get_shmem_limits(device_id=device_id)
     if nbytes > props['shared_block']:
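
The 'shared_block' limit that the availability check compares nbytes against is the device's per-block shared memory capacity. With CuPy this can be queried roughly as follows (a sketch of the idea, not the exact _get_shmem_limits helper):

    import cupy as cp

    # Per-block shared memory capacity of the current device, in bytes
    # (CUDA attribute cudaDevAttrMaxSharedMemoryPerBlock).
    limit = cp.cuda.Device().attributes['MaxSharedMemoryPerBlock']
    print(f"shared memory per block: {limit} bytes")
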
12 changes: 12 additions & 0 deletions python/cucim/src/cucim/skimage/filters/tests/test_gaussian.py
@@ -184,3 +184,15 @@ def test_dog_invalid_sigma2():
         difference_of_gaussians(image, 3, 2)
     with pytest.raises(ValueError):
         difference_of_gaussians(image, (1, 5), (2, 4))
+
+
+@pytest.mark.parametrize(
+    'dtype', [cp.uint8, cp.float16, cp.float32, cp.float64]
+)
+@pytest.mark.parametrize('sigma', range(1, 40, 5))
+def test_shared_mem_check_fix(dtype, sigma):
+    # Verify fix for gh-408 (no compilation errors occur).
+    # Prior to the fix in gh-409, some float64 cases failed.
+    # The exact range that fails depends on the shared memory limit
+    # of the GPU, so we test with a range of sigmas here.
+    gaussian(cp.ones((512, 512), dtype=dtype), sigma=sigma)
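
For reference, on an affected build the gh-408 failure could be triggered by a single call along these lines; the sigma at which compilation fails depends on the GPU's shared memory limit, so the value below is only illustrative:

    import cupy as cp
    from cucim.skimage.filters import gaussian

    # Before this fix, a large-sigma float64 filter could request more
    # shared memory than the device provides, surfacing as a compilation
    # error instead of falling back to the elementwise implementation.
    gaussian(cp.ones((512, 512), dtype=cp.float64), sigma=31)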
