From d738c388e08acfe35e4cc45dec8863238e62ef31 Mon Sep 17 00:00:00 2001 From: James Bowley <12133430+cudawarped@users.noreply.github.com> Date: Sat, 15 Oct 2022 10:03:00 +0300 Subject: [PATCH] Replace all instances of texture references with texture objects using the existing updated cv::cudev::Texture class. Fixes bugs in cv::cuda::demosaicing, cv::cuda::resize and cv::cuda::HoughSegmentDetector. --- modules/cudaimgproc/src/cuda/canny.cu | 217 +-------- modules/cudaimgproc/src/cuda/corners.cu | 55 +-- modules/cudaimgproc/src/cuda/debayer.cu | 48 +- .../cudaimgproc/src/cuda/hough_segments.cu | 25 +- modules/cudaimgproc/src/cuda/mean_shift.cu | 39 +- modules/cudaimgproc/test/test_color.cpp | 18 +- modules/cudaimgproc/test/test_hough.cpp | 64 ++- modules/cudaimgproc/test/test_precomp.hpp | 2 + .../include/opencv2/cudalegacy/NCV.hpp | 4 +- .../opencv2/cudalegacy/NPP_staging.hpp | 10 +- .../cudalegacy/src/cuda/NCVBroxOpticalFlow.cu | 363 ++++----------- .../src/cuda/NCVHaarObjectDetection.cu | 352 ++++---------- modules/cudalegacy/src/cuda/NPP_staging.cu | 428 +++++------------ modules/cudalegacy/src/cuda/bm.cu | 24 +- .../cudalegacy/test/TestHypothesesGrow.cpp | 3 +- modules/cudaobjdetect/src/cuda/hog.cu | 41 +- modules/cudaobjdetect/test/test_objdetect.cpp | 13 +- modules/cudaoptflow/src/cuda/pyrlk.cu | 306 +++---------- modules/cudaoptflow/src/cuda/tvl1flow.cu | 113 +---- modules/cudastereo/src/cuda/stereobm.cu | 39 +- modules/cudawarping/src/cuda/remap.cu | 177 ++++---- modules/cudawarping/src/cuda/resize.cu | 108 ++--- modules/cudawarping/src/cuda/warp.cu | 149 ++---- modules/cudawarping/test/test_precomp.hpp | 2 + modules/cudawarping/test/test_resize.cpp | 54 +++ .../include/opencv2/cudev/ptr2d/texture.hpp | 429 +++++++++--------- modules/xfeatures2d/src/cuda/surf.cu | 140 +++--- modules/xfeatures2d/src/surf.cuda.cpp | 34 +- 28 files changed, 1065 insertions(+), 2192 deletions(-) diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu index 99a4f72a8fe..61ea11ee322 100644 --- a/modules/cudaimgproc/src/cuda/canny.cu +++ b/modules/cudaimgproc/src/cuda/canny.cu @@ -48,6 +48,7 @@ #include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/utility.hpp" #include "opencv2/core/cuda.hpp" +#include using namespace cv::cuda; using namespace cv::cuda::device; @@ -90,47 +91,8 @@ namespace cv { namespace cuda { namespace device namespace canny { - struct SrcTex - { - virtual ~SrcTex() {} - - __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} - - __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0; - - int xoff; - int yoff; - }; - - texture tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); - struct SrcTexRef : SrcTex - { - __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {} - - __device__ __forceinline__ int operator ()(int y, int x) const override - { - return tex2D(tex_src, x + xoff, y + yoff); - } - }; - - struct SrcTexObj : SrcTex - { - __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { } - - __device__ __forceinline__ int operator ()(int y, int x) const override - { - return tex2D(tex_src_object, x + xoff, y + yoff); - } - - cudaTextureObject_t tex_src_object; - }; - - template < - class T, - class Norm, - typename = typename std::enable_if::value>::type - > - __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) + template + __global__ void calcMagnitudeKernel(cv::cudev::TextureOffPtr texSrc, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -138,8 +100,8 @@ namespace canny if (y >= mag.rows || x >= mag.cols) return; - int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1)); - int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1)); + int dxVal = (texSrc(y - 1, x + 1) + 2 * texSrc(y, x + 1) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y, x - 1) + texSrc(y + 1, x - 1)); + int dyVal = (texSrc(y + 1, x - 1) + 2 * texSrc(y + 1, x) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y - 1, x) + texSrc(y - 1, x + 1)); dx(y, x) = dxVal; dy(y, x) = dyVal; @@ -151,63 +113,20 @@ namespace canny { const dim3 block(16, 16); const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); - - bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30); - - if (cc30) + cv::cudev::TextureOff texSrc(srcWhole, yoff, xoff); + if (L2Grad) { - cudaTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = cudaAddressModeClamp; - texDesc.addressMode[1] = cudaAddressModeClamp; - texDesc.addressMode[2] = cudaAddressModeClamp; - - cudaTextureObject_t tex = 0; - createTextureObjectPitch2D(&tex, srcWhole, texDesc); - - SrcTexObj src(xoff, yoff, tex); - - if (L2Grad) - { - L2 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); - } - else - { - L1 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); - } - - cudaSafeCall( cudaGetLastError() ); - - if (stream == NULL) - cudaSafeCall( cudaDeviceSynchronize() ); - else - cudaSafeCall( cudaStreamSynchronize(stream) ); - - cudaSafeCall( cudaDestroyTextureObject(tex) ); + L2 norm; + calcMagnitudeKernel << > > (texSrc, dx, dy, mag, norm); } else { - bindTexture(&tex_src, srcWhole); - SrcTexRef src(xoff, yoff); - - if (L2Grad) - { - L2 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); - } - else - { - L1 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); - } - - cudaSafeCall( cudaGetLastError() ); - - if (stream == NULL) - cudaSafeCall( cudaDeviceSynchronize() ); + L1 norm; + calcMagnitudeKernel << > > (texSrc, dx, dy, mag, norm); } + + if (stream == NULL) + cudaSafeCall(cudaDeviceSynchronize()); } void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) @@ -229,8 +148,7 @@ namespace canny namespace canny { - texture tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); - __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) + __global__ void calcMapKernel(cv::cudev::TexturePtr texMag, const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) { const int CANNY_SHIFT = 15; const int TG22 = (int)(0.4142135623730950488016887242097*(1< tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y)) + if (m > texMag(y, x - 1) && m >= texMag(y, x + 1)) edge_type = 1 + (int)(m > high_thresh); } else if(dyVal > tg67x) { - if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1)) + if (m > texMag(y - 1, x) && m >= texMag(y + 1, x)) edge_type = 1 + (int)(m > high_thresh); } else { - if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1)) - edge_type = 1 + (int)(m > high_thresh); - } - } - - map(y, x) = edge_type; - } - - __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag) - { - const int CANNY_SHIFT = 15; - const int TG22 = (int)(0.4142135623730950488016887242097*(1<= dx.cols - 1 || y == 0 || y >= dx.rows - 1) - return; - - int dxVal = dx(y, x); - int dyVal = dy(y, x); - - const int s = (dxVal ^ dyVal) < 0 ? -1 : 1; - const float m = tex2D(tex_mag, x, y); - - dxVal = ::abs(dxVal); - dyVal = ::abs(dyVal); - - // 0 - the pixel can not belong to an edge - // 1 - the pixel might belong to an edge - // 2 - the pixel does belong to an edge - int edge_type = 0; - - if (m > low_thresh) - { - const int tg22x = dxVal * TG22; - const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); - - dyVal <<= CANNY_SHIFT; - - if (dyVal < tg22x) - { - if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y)) - edge_type = 1 + (int)(m > high_thresh); - } - else if(dyVal > tg67x) - { - if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1)) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1)) + if (m > texMag(y - 1, x - s) && m >= texMag(y + 1, x + s)) edge_type = 1 + (int)(m > high_thresh); } } @@ -338,47 +204,10 @@ namespace canny { const dim3 block(16, 16); const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); - - if (deviceSupports(FEATURE_SET_COMPUTE_30)) - { - // Use the texture object - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = cudaResourceTypePitch2D; - resDesc.res.pitch2D.devPtr = mag.ptr(); - resDesc.res.pitch2D.height = mag.rows; - resDesc.res.pitch2D.width = mag.cols; - resDesc.res.pitch2D.pitchInBytes = mag.step; - resDesc.res.pitch2D.desc = cudaCreateChannelDesc(); - - cudaTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = cudaAddressModeClamp; - texDesc.addressMode[1] = cudaAddressModeClamp; - texDesc.addressMode[2] = cudaAddressModeClamp; - - cudaTextureObject_t tex=0; - cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); - calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh, tex); - cudaSafeCall( cudaGetLastError() ); - - if (stream == NULL) - cudaSafeCall( cudaDeviceSynchronize() ); - else - cudaSafeCall( cudaStreamSynchronize(stream) ); - - cudaSafeCall( cudaDestroyTextureObject(tex) ); - } - else - { - // Use the texture reference - bindTexture(&tex_mag, mag); - calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); - - if (stream == NULL) - cudaSafeCall( cudaDeviceSynchronize() ); - } + cv::cudev::Texture texMag(mag); + calcMapKernel<<>>(texMag, dx, dy, map, low_thresh, high_thresh); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } } diff --git a/modules/cudaimgproc/src/cuda/corners.cu b/modules/cudaimgproc/src/cuda/corners.cu index 92a37e6fde1..2f3452648ca 100644 --- a/modules/cudaimgproc/src/cuda/corners.cu +++ b/modules/cudaimgproc/src/cuda/corners.cu @@ -47,6 +47,7 @@ #include "opencv2/core/cuda/vec_math.hpp" #include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/border_interpolate.hpp" +#include #include "opencv2/opencv_modules.hpp" @@ -58,10 +59,7 @@ namespace cv { namespace cuda { namespace device { /////////////////////////////////////////// Corner Harris ///////////////////////////////////////////////// - texture harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp); - texture harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp); - - __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst) + __global__ void cornerHarris_kernel(cv::cudev::TexturePtr texDx, cv::cudev::TexturePtr texDy, const int block_size, const float k, PtrStepSzf dst) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -81,8 +79,8 @@ namespace cv { namespace cuda { namespace device { for (int j = jbegin; j < jend; ++j) { - float dx = tex2D(harrisDxTex, j, i); - float dy = tex2D(harrisDyTex, j, i); + float dx = texDx(i, j); + float dy = texDy(i, j); a += dx * dx; b += dx * dy; @@ -95,7 +93,7 @@ namespace cv { namespace cuda { namespace device } template - __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col) + __global__ void cornerHarris_kernel(cv::cudev::TexturePtr texDx, cv::cudev::TexturePtr texDy, const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -119,8 +117,8 @@ namespace cv { namespace cuda { namespace device { const int x = border_row.idx_col(j); - float dx = tex2D(harrisDxTex, x, y); - float dy = tex2D(harrisDyTex, x, y); + float dx = texDx(y, x); + float dy = texDy(y, x); a += dx * dx; b += dx * dy; @@ -136,22 +134,20 @@ namespace cv { namespace cuda { namespace device { dim3 block(32, 8); dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y)); - - bindTexture(&harrisDxTex, Dx); - bindTexture(&harrisDyTex, Dy); - + cv::cudev::Texture texDx(Dx); + cv::cudev::Texture texDy(Dy); switch (border_type) { case BORDER_REFLECT101: - cornerHarris_kernel<<>>(block_size, k, dst, BrdRowReflect101(Dx.cols), BrdColReflect101(Dx.rows)); + cornerHarris_kernel<<>>(texDx, texDy, block_size, k, dst, BrdRowReflect101(Dx.cols), BrdColReflect101(Dx.rows)); break; case BORDER_REFLECT: - cornerHarris_kernel<<>>(block_size, k, dst, BrdRowReflect(Dx.cols), BrdColReflect(Dx.rows)); + cornerHarris_kernel<<>>(texDx, texDy, block_size, k, dst, BrdRowReflect(Dx.cols), BrdColReflect(Dx.rows)); break; case BORDER_REPLICATE: - cornerHarris_kernel<<>>(block_size, k, dst); + cornerHarris_kernel<<>>(texDx, texDy, block_size, k, dst); break; } @@ -163,10 +159,7 @@ namespace cv { namespace cuda { namespace device /////////////////////////////////////////// Corner Min Eigen Val ///////////////////////////////////////////////// - texture minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp); - texture minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp); - - __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst) + __global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr texMinEigenValDx, cv::cudev::TexturePtr texMinEigenValDy, const int block_size, PtrStepSzf dst) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -186,8 +179,8 @@ namespace cv { namespace cuda { namespace device { for (int j = jbegin; j < jend; ++j) { - float dx = tex2D(minEigenValDxTex, j, i); - float dy = tex2D(minEigenValDyTex, j, i); + float dx = texMinEigenValDx(i, j); + float dy = texMinEigenValDy(i, j); a += dx * dx; b += dx * dy; @@ -204,7 +197,7 @@ namespace cv { namespace cuda { namespace device template - __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col) + __global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr texMinEigenValDx, cv::cudev::TexturePtr texMinEigenValDy, const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -228,8 +221,8 @@ namespace cv { namespace cuda { namespace device { int x = border_row.idx_col(j); - float dx = tex2D(minEigenValDxTex, x, y); - float dy = tex2D(minEigenValDyTex, x, y); + float dx = texMinEigenValDx(y, x); + float dy = texMinEigenValDy(y, x); a += dx * dx; b += dx * dy; @@ -248,22 +241,20 @@ namespace cv { namespace cuda { namespace device { dim3 block(32, 8); dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y)); - - bindTexture(&minEigenValDxTex, Dx); - bindTexture(&minEigenValDyTex, Dy); - + cv::cudev::Texture texMinEigenValDx(Dx); + cv::cudev::Texture texMinEigenValDy(Dy); switch (border_type) { case BORDER_REFLECT101: - cornerMinEigenVal_kernel<<>>(block_size, dst, BrdRowReflect101(Dx.cols), BrdColReflect101(Dx.rows)); + cornerMinEigenVal_kernel<<>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect101(Dx.cols), BrdColReflect101(Dx.rows)); break; case BORDER_REFLECT: - cornerMinEigenVal_kernel<<>>(block_size, dst, BrdRowReflect(Dx.cols), BrdColReflect(Dx.rows)); + cornerMinEigenVal_kernel<<>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect(Dx.cols), BrdColReflect(Dx.rows)); break; case BORDER_REPLICATE: - cornerMinEigenVal_kernel<<>>(block_size, dst); + cornerMinEigenVal_kernel<<>>(texMinEigenValDx, texMinEigenValDy, block_size, dst); break; } diff --git a/modules/cudaimgproc/src/cuda/debayer.cu b/modules/cudaimgproc/src/cuda/debayer.cu index 0da78139807..1c4ee391421 100644 --- a/modules/cudaimgproc/src/cuda/debayer.cu +++ b/modules/cudaimgproc/src/cuda/debayer.cu @@ -48,6 +48,7 @@ #include "opencv2/core/cuda/limits.hpp" #include "opencv2/core/cuda/color.hpp" #include "opencv2/core/cuda/saturate_cast.hpp" +#include "opencv2/cudev/ptr2d/texture.hpp" namespace cv { namespace cuda { namespace device { @@ -389,10 +390,8 @@ namespace cv { namespace cuda { namespace device // // ported to CUDA - texture sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp); - - template - __global__ void MHCdemosaic(PtrStepSz dst, const int2 sourceOffset, const int2 firstRed) + template + __global__ void MHCdemosaic(PtrStepSz dst, Ptr2D src, const int2 firstRed) { const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/; const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ; @@ -408,8 +407,8 @@ namespace cv { namespace cuda { namespace device return; int2 center; - center.x = x + sourceOffset.x; - center.y = y + sourceOffset.y; + center.x = x; + center.y = y; int4 xCoord; xCoord.x = center.x - 2; @@ -423,25 +422,26 @@ namespace cv { namespace cuda { namespace device yCoord.z = center.y + 1; yCoord.w = center.y + 2; - float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0) + float C = src(center.y, center.x); // ( 0, 0) float4 Dvec; - Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1) - Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1) - Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1) - Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1) + Dvec.x = src(yCoord.y, xCoord.y); // (-1,-1) + Dvec.y = src(yCoord.z, xCoord.y); // (-1, 1) + Dvec.z = src(yCoord.y, xCoord.z); // ( 1,-1) + Dvec.w = src(yCoord.z, xCoord.z); // ( 1, 1) + float4 value; - value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0 - value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0 - value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0 - value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0 + value.x = src(yCoord.x, center.x); // ( 0,-2) A0 + value.y = src(yCoord.y, center.x); // ( 0,-1) B0 + value.z = src(center.y, xCoord.x); // (-2, 0) E0 + value.w = src(center.y, xCoord.y); // (-1, 0) F0 // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1) - value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1 - value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1 - value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1 - value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1 + value.x += src(yCoord.w, center.x); // ( 0, 2) A1 + value.y += src(yCoord.z, center.x); // ( 0, 1) B1 + value.z += src(center.y, xCoord.w); // ( 2, 0) E1 + value.w += src(center.y, xCoord.z); // ( 1, 0) F1 float4 PATTERN; PATTERN.x = kCx * C; @@ -527,9 +527,15 @@ namespace cv { namespace cuda { namespace device const dim3 block(32, 8); const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); - bindTexture(&sourceTex, src); + if (sourceOffset.x || sourceOffset.y) { + cv::cudev::TextureOff texSrc(src, sourceOffset.y, sourceOffset.x); + MHCdemosaic < dst_t, cv::cudev::TextureOffPtr > << > > ((PtrStepSz)dst, texSrc, firstRed); + } + else { + cv::cudev::Texture texSrc(src); + MHCdemosaic < dst_t, cv::cudev::TexturePtr > << > > ((PtrStepSz)dst, texSrc, firstRed); + } - MHCdemosaic<<>>((PtrStepSz)dst, sourceOffset, firstRed); cudaSafeCall( cudaGetLastError() ); if (stream == 0) diff --git a/modules/cudaimgproc/src/cuda/hough_segments.cu b/modules/cudaimgproc/src/cuda/hough_segments.cu index 59eb78f6996..4774636ad77 100644 --- a/modules/cudaimgproc/src/cuda/hough_segments.cu +++ b/modules/cudaimgproc/src/cuda/hough_segments.cu @@ -50,7 +50,8 @@ namespace cv { namespace cuda { namespace device { namespace hough_segments { - __global__ void houghLinesProbabilistic(cv::cudev::Texture src, const PtrStepSzi accum, + template + __global__ void houghLinesProbabilistic(Ptr2D src, const PtrStepSzi accum, int4* out, const int maxSize, const float rho, const float theta, const int lineGap, const int lineLength, @@ -219,15 +220,18 @@ namespace cv { namespace cuda { namespace device const dim3 block(32, 8); const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y)); - cv::cudev::GpuMat_ src_(mask); - cv::cudev::Texture tex(src_, false, cudaFilterModePoint, cudaAddressModeClamp); - - houghLinesProbabilistic<<>>(tex, accum, - out, maxSize, - rho, theta, - lineGap, lineLength, - mask.rows, mask.cols, - counterPtr); + Size wholeSize; + Point ofs; + mask.locateROI(wholeSize, ofs); + if (ofs.x || ofs.y) { + cv::cudev::TextureOff texMask(wholeSize.height, wholeSize.width, mask.datastart, mask.step, ofs.y, ofs.x); + houghLinesProbabilistic> << > > (texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr); + } + else { + cv::cudev::Texture texMask(mask); + houghLinesProbabilistic> << > > (texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr); + } + cudaSafeCall( cudaGetLastError() ); int totalCount; @@ -236,7 +240,6 @@ namespace cv { namespace cuda { namespace device cudaSafeCall( cudaStreamSynchronize(stream) ); totalCount = ::min(totalCount, maxSize); - return totalCount; } } diff --git a/modules/cudaimgproc/src/cuda/mean_shift.cu b/modules/cudaimgproc/src/cuda/mean_shift.cu index 3b3b93f94e4..ef7497be5c8 100644 --- a/modules/cudaimgproc/src/cuda/mean_shift.cu +++ b/modules/cudaimgproc/src/cuda/mean_shift.cu @@ -47,19 +47,16 @@ #include "opencv2/core/cuda/vec_math.hpp" #include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/border_interpolate.hpp" +#include namespace cv { namespace cuda { namespace device { namespace imgproc { - texture tex_meanshift; - - __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, - size_t out_step, int cols, int rows, - int sp, int sr, int maxIter, float eps) + __device__ short2 do_mean_shift(cv::cudev::TexturePtr tex, int x0, int y0, unsigned char* out,size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps) { int isr2 = sr*sr; - uchar4 c = tex2D(tex_meanshift, x0, y0 ); + uchar4 c = tex(y0, x0); // iterate meanshift procedure for( int iter = 0; iter < maxIter; iter++ ) @@ -79,7 +76,7 @@ namespace cv { namespace cuda { namespace device int rowCount = 0; for( int x = minx; x <= maxx; x++ ) { - uchar4 t = tex2D( tex_meanshift, x, y ); + uchar4 t = tex(y, x); int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z); if( norm2 <= isr2 ) @@ -119,13 +116,13 @@ namespace cv { namespace cuda { namespace device return make_short2((short)x0, (short)y0); } - __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps ) + __global__ void meanshift_kernel(cv::cudev::TexturePtr tex, unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps ) { int x0 = blockIdx.x * blockDim.x + threadIdx.x; int y0 = blockIdx.y * blockDim.y + threadIdx.y; if( x0 < cols && y0 < rows ) - do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps); + do_mean_shift(tex, x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps); } void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream) @@ -134,21 +131,15 @@ namespace cv { namespace cuda { namespace device dim3 threads(32, 8, 1); grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) ); - - meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); + cv::cudev::Texture tex(src.rows, src.cols, (uchar4*)src.data, src.step); + meanshift_kernel<<< grid, threads, 0, stream >>>( tex, dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, - unsigned char* outsp, size_t outspstep, - int cols, int rows, - int sp, int sr, int maxIter, float eps) + __global__ void meanshiftproc_kernel(cv::cudev::TexturePtr tex, unsigned char* outr, size_t outrstep, unsigned char* outsp, size_t outspstep, + int cols, int rows,int sp, int sr, int maxIter, float eps) { int x0 = blockIdx.x * blockDim.x + threadIdx.x; int y0 = blockIdx.y * blockDim.y + threadIdx.y; @@ -156,7 +147,7 @@ namespace cv { namespace cuda { namespace device if( x0 < cols && y0 < rows ) { int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short); - *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps); + *(short2*)(outsp + basesp) = do_mean_shift(tex, x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps); } } @@ -166,13 +157,9 @@ namespace cv { namespace cuda { namespace device dim3 threads(32, 8, 1); grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) ); - - meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); + cv::cudev::Texture tex(src.rows, src.cols, (uchar4*)src.data, src.step); + meanshiftproc_kernel<<< grid, threads, 0, stream >>>( tex, dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } diff --git a/modules/cudaimgproc/test/test_color.cpp b/modules/cudaimgproc/test/test_color.cpp index 97be36a1210..1a8ff1fa0cb 100644 --- a/modules/cudaimgproc/test/test_color.cpp +++ b/modules/cudaimgproc/test/test_color.cpp @@ -2294,14 +2294,15 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CvtColor, testing::Combine( /////////////////////////////////////////////////////////////////////////////////////////////////////// // Demosaicing -struct Demosaicing : testing::TestWithParam +struct Demosaicing : testing::TestWithParam> { cv::cuda::DeviceInfo devInfo; + bool useRoi; virtual void SetUp() { - devInfo = GetParam(); - + devInfo = GET_PARAM(0); + useRoi = GET_PARAM(1); cv::cuda::setDevice(devInfo.deviceID()); } @@ -2419,7 +2420,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT) mosaic(img, src, cv::Point(1, 1)); cv::cuda::GpuMat dst; - cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerBG2BGR_MHT); + cv::cuda::demosaicing(loadMat(src,useRoi), dst, cv::cuda::COLOR_BayerBG2BGR_MHT); EXPECT_MAT_SIMILAR(img, dst, 5e-3); } @@ -2433,7 +2434,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT) mosaic(img, src, cv::Point(0, 1)); cv::cuda::GpuMat dst; - cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGB2BGR_MHT); + cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGB2BGR_MHT); EXPECT_MAT_SIMILAR(img, dst, 5e-3); } @@ -2447,7 +2448,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT) mosaic(img, src, cv::Point(0, 0)); cv::cuda::GpuMat dst; - cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerRG2BGR_MHT); + cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerRG2BGR_MHT); EXPECT_MAT_SIMILAR(img, dst, 5e-3); } @@ -2461,12 +2462,11 @@ CUDA_TEST_P(Demosaicing, BayerGR2BGR_MHT) mosaic(img, src, cv::Point(1, 0)); cv::cuda::GpuMat dst; - cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGR2BGR_MHT); - + cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGR2BGR_MHT); EXPECT_MAT_SIMILAR(img, dst, 5e-3); } -INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, ALL_DEVICES); +INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, testing::Combine(ALL_DEVICES, WHOLE_SUBMAT)); /////////////////////////////////////////////////////////////////////////////////////////////////////// // swapChannels diff --git a/modules/cudaimgproc/test/test_hough.cpp b/modules/cudaimgproc/test/test_hough.cpp index e6a05f578f6..023e1c50c7d 100644 --- a/modules/cudaimgproc/test/test_hough.cpp +++ b/modules/cudaimgproc/test/test_hough.cpp @@ -115,8 +115,20 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLines, testing::Combine( /////////////////////////////////////////////////////////////////////////////////////////////////////// // HoughLines Probabilistic -PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi) +PARAM_TEST_CASE(HoughLinesProbabilistic, DeviceInfo, Size, UseRoi) { + cv::cuda::DeviceInfo devInfo; + bool useRoi; + Size size; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + useRoi = GET_PARAM(2); + cv::cuda::setDevice(devInfo.deviceID()); + } + static void generateLines(cv::Mat& img) { img.setTo(cv::Scalar::all(0)); @@ -140,11 +152,6 @@ PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi) CUDA_TEST_P(HoughLinesProbabilistic, Accuracy) { - const cv::cuda::DeviceInfo devInfo = GET_PARAM(0); - cv::cuda::setDevice(devInfo.deviceID()); - const cv::Size size = GET_PARAM(1); - const bool useRoi = GET_PARAM(2); - const float rho = 1.0f; const float theta = (float) (1.0 * CV_PI / 180.0); const int minLineLength = 15; @@ -169,12 +176,55 @@ CUDA_TEST_P(HoughLinesProbabilistic, Accuracy) } +void HoughLinesProbabilisticThread(const Ptr detector, const GpuMat& imgIn, const std::vector& linesOut, Stream& stream) { + for (auto& lines : linesOut) + detector->detect(imgIn, lines, stream); + stream.waitForCompletion(); +} + +CUDA_TEST_P(HoughLinesProbabilistic, Async) +{ + constexpr int nThreads = 5; + constexpr int nIters = 5; + vector streams(nThreads); // async test only + vector imgsIn; + vector> detectors; + vector> linesOut(nThreads); + const float rho = 1.0f; + const float theta = (float)(1.0 * CV_PI / 180.0); + const int minLineLength = 15; + const int maxLineGap = 8; + + cv::Mat src(size, CV_8UC1); + generateLines(src); + + for (int i = 0; i < nThreads; i++) { + imgsIn.push_back(loadMat(src, useRoi)); + detectors.push_back(createHoughSegmentDetector(rho, theta, minLineLength, maxLineGap)); + linesOut.push_back(vector(nIters)); + } + + vector thread(nThreads); + for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(HoughLinesProbabilisticThread, detectors.at(i), std::ref(imgsIn.at(i)), std::ref(linesOut.at(i)), std::ref(streams.at(i))); + for (int i = 0; i < nThreads; i++) thread.at(i).join(); + + for (int i = 0; i < nThreads; i++) { + std::vector linesSegment; + std::vector lines; + for (const auto& line : linesOut.at(i)) { + line.download(linesSegment); + cv::Mat dst(size, CV_8UC1); + drawLines(dst, linesSegment); + ASSERT_MAT_NEAR(src, dst, 0.0); + } + } +} + INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLinesProbabilistic, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, WHOLE_SUBMAT)); - /////////////////////////////////////////////////////////////////////////////////////////////////////// // HoughCircles diff --git a/modules/cudaimgproc/test/test_precomp.hpp b/modules/cudaimgproc/test/test_precomp.hpp index dd94f6f2856..e388fbdaa8a 100644 --- a/modules/cudaimgproc/test/test_precomp.hpp +++ b/modules/cudaimgproc/test/test_precomp.hpp @@ -49,4 +49,6 @@ #include "cvconfig.h" +#include + #endif diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp index d0ec6a42d6e..f03410dfbd0 100644 --- a/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp +++ b/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp @@ -119,9 +119,9 @@ typedef bool NcvBool; typedef long long Ncv64s; #if defined(__APPLE__) && !defined(__CUDACC__) - typedef uint64_t Ncv64u; + typedef uint64 Ncv64u; #else - typedef unsigned long long Ncv64u; + typedef uint64 Ncv64u; #endif typedef int Ncv32s; diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp index 89e7f7cdea3..d9189eb20bb 100644 --- a/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp +++ b/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp @@ -174,7 +174,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState); * \return NCV status code */ CV_EXPORTS -NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, +NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc, NcvSize32u srcSize, Ncv32u nSrcStep, Ncv32f *pDst, @@ -182,7 +182,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, Ncv32u nDstStep, NcvRect32u oROI, NppStBorderType borderType, - const Ncv32f *pKernel, + Ncv32f *pKernel, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier); @@ -208,7 +208,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, * \return NCV status code */ CV_EXPORTS -NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, +NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc, NcvSize32u srcSize, Ncv32u nSrcStep, Ncv32f *pDst, @@ -216,7 +216,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, Ncv32u nDstStep, NcvRect32u oROI, NppStBorderType borderType, - const Ncv32f *pKernel, + Ncv32f *pKernel, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier); @@ -319,7 +319,7 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc, * \return NCV status code */ CV_EXPORTS -NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc, +NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc, NcvSize32u srcSize, Ncv32u nSrcStep, NcvRect32u srcROI, diff --git a/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu b/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu index 01914880248..a7f83c715d0 100644 --- a/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu +++ b/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu @@ -65,9 +65,12 @@ #include "opencv2/cudalegacy/NPP_staging.hpp" #include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp" +#include typedef NCVVectorAlloc FloatVector; +typedef cv::cudev::TexturePtr Ptr2D; +typedef cv::cudev::Texture Texture; ///////////////////////////////////////////////////////////////////////////////////////// // Implementation specific constants @@ -84,39 +87,6 @@ inline int iDivUp(int a, int b) return (a + b - 1)/b; } -///////////////////////////////////////////////////////////////////////////////////////// -// Texture references -///////////////////////////////////////////////////////////////////////////////////////// - -texture tex_coarse; -texture tex_fine; - -texture tex_I1; -texture tex_I0; - -texture tex_Ix; -texture tex_Ixx; -texture tex_Ix0; - -texture tex_Iy; -texture tex_Iyy; -texture tex_Iy0; - -texture tex_Ixy; - -texture tex_u; -texture tex_v; -texture tex_du; -texture tex_dv; -texture tex_numerator_dudv; -texture tex_numerator_u; -texture tex_numerator_v; -texture tex_inv_denominator_u; -texture tex_inv_denominator_v; -texture tex_diffusivity_x; -texture tex_diffusivity_y; - - ///////////////////////////////////////////////////////////////////////////////////////// // SUPPLEMENTARY FUNCTIONS ///////////////////////////////////////////////////////////////////////////////////////// @@ -265,8 +235,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const flo ///\param h number of rows in global memory array ///\param p global memory array pitch in floats /////////////////////////////////////////////////////////////////////////////// -template -__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p) +__forceinline__ __device__ void load_array_element(Ptr2D texSrc, float *smem, int is, int js, int i, int j, int w, int h, int p) { //position within shared memory array const int ijs = js * PSOR_PITCH + is; @@ -276,20 +245,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js, j = max(j, -j-1); j = min(j, h-j+h-1); const int pos = j * p + i; - switch(tex_id){ - case 0: - smem[ijs] = tex1Dfetch(tex_u, pos); - break; - case 1: - smem[ijs] = tex1Dfetch(tex_v, pos); - break; - case 2: - smem[ijs] = tex1Dfetch(tex_du, pos); - break; - case 3: - smem[ijs] = tex1Dfetch(tex_dv, pos); - break; - } + smem[ijs] = texSrc(pos); } /////////////////////////////////////////////////////////////////////////////// @@ -301,49 +257,48 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js, ///\param h number of rows in global memory array ///\param p global memory array pitch in floats /////////////////////////////////////////////////////////////////////////////// -template -__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p) +__forceinline__ __device__ void load_array(Ptr2D texSrc, float *smem, int ig, int jg, int w, int h, int p) { const int i = threadIdx.x + 2; const int j = threadIdx.y + 2; - load_array_element(smem, i, j, ig, jg, w, h, p);//load current pixel + load_array_element(texSrc, smem, i, j, ig, jg, w, h, p);//load current pixel __syncthreads(); if(threadIdx.y < 2) { //load bottom shadow elements - load_array_element(smem, i, j-2, ig, jg-2, w, h, p); + load_array_element(texSrc, smem, i, j-2, ig, jg-2, w, h, p); if(threadIdx.x < 2) { //load bottom right shadow elements - load_array_element(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p); + load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p); //load middle right shadow elements - load_array_element(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); + load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); } else if(threadIdx.x >= PSOR_TILE_WIDTH-2) { //load bottom left shadow elements - load_array_element(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p); + load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p); //load middle left shadow elements - load_array_element(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); + load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); } } else if(threadIdx.y >= PSOR_TILE_HEIGHT-2) { //load upper shadow elements - load_array_element(smem, i, j+2, ig, jg+2, w, h, p); + load_array_element(texSrc, smem, i, j+2, ig, jg+2, w, h, p); if(threadIdx.x < 2) { //load upper right shadow elements - load_array_element(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p); + load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p); //load middle right shadow elements - load_array_element(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); + load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); } else if(threadIdx.x >= PSOR_TILE_WIDTH-2) { //load upper left shadow elements - load_array_element(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p); + load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p); //load middle left shadow elements - load_array_element(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); + load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); } } else @@ -352,12 +307,12 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i if(threadIdx.x < 2) { //load middle right shadow elements - load_array_element(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); + load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); } else if(threadIdx.x >= PSOR_TILE_WIDTH-2) { //load middle left shadow elements - load_array_element(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); + load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); } } __syncthreads(); @@ -382,13 +337,9 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i /// \param alpha (in) alpha in Brox model (flow smoothness) /// \param gamma (in) gamma in Brox model (edge importance) /////////////////////////////////////////////////////////////////////////////// - -__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, - float *denominator_u, float *denominator_v, - float *numerator_dudv, - float *numerator_u, float *numerator_v, - int w, int h, int s, - float alpha, float gamma) +__global__ void prepare_sor_stage_1_tex(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texI0, Ptr2D texI1, Ptr2D texIx, Ptr2D texIxx, Ptr2D texIx0, Ptr2D texIy, Ptr2D texIyy, + Ptr2D texIy0, Ptr2D texIxy, float *diffusivity_x, float *diffusivity_y, float *denominator_u, float *denominator_v, float *numerator_dudv, float *numerator_u, float *numerator_v, + int w, int h, int s, float alpha, float gamma) { __shared__ float u[PSOR_PITCH * PSOR_HEIGHT]; __shared__ float v[PSOR_PITCH * PSOR_HEIGHT]; @@ -408,24 +359,24 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity float x = (float)ig + 0.5f; float y = (float)jg + 0.5f; //load u and v to smem - load_array<0>(u, ig, jg, w, h, s); - load_array<1>(v, ig, jg, w, h, s); - load_array<2>(du, ig, jg, w, h, s); - load_array<3>(dv, ig, jg, w, h, s); + load_array(texU, u, ig, jg, w, h, s); + load_array(texV, v, ig, jg, w, h, s); + load_array(texDu, du, ig, jg, w, h, s); + load_array(texDv, dv, ig, jg, w, h, s); //warped position float wx = (x + u[ijs])/(float)w; float wy = (y + v[ijs])/(float)h; x /= (float)w; y /= (float)h; //compute image derivatives - const float Iz = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y); - const float Ix = tex2D(tex_Ix, wx, wy); - const float Ixz = Ix - tex2D(tex_Ix0, x, y); - const float Ixy = tex2D(tex_Ixy, wx, wy); - const float Ixx = tex2D(tex_Ixx, wx, wy); - const float Iy = tex2D(tex_Iy, wx, wy); - const float Iyz = Iy - tex2D(tex_Iy0, x, y); - const float Iyy = tex2D(tex_Iyy, wx, wy); + const float Iz = texI1(wy, wx) - texI0(y,x); + const float Ix = texIx(wy, wx); + const float Ixz = Ix - texIx0(y, x); + const float Ixy = texIxy(wy, wx); + const float Ixx = texIxx(wy, wx); + const float Iy = texIy(wy, wx); + const float Iyz = Iy - texIy0(y, x); + const float Iyy = texIyy(wy, wx); //compute data term float q0, q1, q2; q0 = Iz + Ix * du[ijs] + Iy * dv[ijs]; @@ -462,8 +413,7 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity ///\param h ///\param s /////////////////////////////////////////////////////////////////////////////// -__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v, - int w, int h, int s) +__global__ void prepare_sor_stage_2(Ptr2D texDiffX, Ptr2D texDiffY, float *inv_denominator_u, float *inv_denominator_v, int w, int h, int s) { __shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)]; __shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)]; @@ -486,8 +436,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin } if(inside) { - sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg); - sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg); + sx[ijs] = texDiffX(ijg); + sy[ijs] = texDiffY(ijg); } else { @@ -498,25 +448,17 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin if(j == PSOR_TILE_HEIGHT-1) { if(jg < h-1 && inside) - { - sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s); - } + sy[up] = texDiffY(ijg + s); else - { sy[up] = 0.0f; - } } int right = ijs + 1; if(threadIdx.x == PSOR_TILE_WIDTH-1) { if(ig < w-1 && inside) - { - sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1); - } + sx[right] = texDiffX(ijg + 1); else - { sx[right] = 0.0f; - } } __syncthreads(); float diffusivity_sum; @@ -534,17 +476,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin // Red-Black SOR ///////////////////////////////////////////////////////////////////////////////////////// -template __global__ void sor_pass(float *new_du, - float *new_dv, - const float *g_inv_denominator_u, - const float *g_inv_denominator_v, - const float *g_numerator_u, - const float *g_numerator_v, - const float *g_numerator_dudv, - float omega, - int width, - int height, - int stride) +template __global__ void sor_pass(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texDiffX, Ptr2D texDiffY, float *new_du, float *new_dv, const float *g_inv_denominator_u, + const float *g_inv_denominator_v, const float *g_numerator_u, const float *g_numerator_v, const float *g_numerator_dudv, float omega, int width, int height, int stride) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; @@ -560,14 +493,14 @@ template __global__ void sor_pass(float *new_du, //load smooth term float s_up, s_left, s_right, s_down; - s_left = tex1Dfetch(tex_diffusivity_x, pos); - s_down = tex1Dfetch(tex_diffusivity_y, pos); + s_left = texDiffX(pos); + s_down = texDiffY(pos); if(i < width-1) - s_right = tex1Dfetch(tex_diffusivity_x, pos_r); + s_right = texDiffX(pos_r); else s_right = 0.0f; //Neumann BC if(j < height-1) - s_up = tex1Dfetch(tex_diffusivity_y, pos_u); + s_up = texDiffY(pos_u); else s_up = 0.0f; //Neumann BC @@ -577,30 +510,29 @@ template __global__ void sor_pass(float *new_du, float du_up, du_left, du_right, du_down, du; float dv_up, dv_left, dv_right, dv_down, dv; - u_left = tex1Dfetch(tex_u, pos_l); - u_right = tex1Dfetch(tex_u, pos_r); - u_down = tex1Dfetch(tex_u, pos_d); - u_up = tex1Dfetch(tex_u, pos_u); - u = tex1Dfetch(tex_u, pos); - - v_left = tex1Dfetch(tex_v, pos_l); - v_right = tex1Dfetch(tex_v, pos_r); - v_down = tex1Dfetch(tex_v, pos_d); - v = tex1Dfetch(tex_v, pos); - v_up = tex1Dfetch(tex_v, pos_u); - - du = tex1Dfetch(tex_du, pos); - du_left = tex1Dfetch(tex_du, pos_l); - du_right = tex1Dfetch(tex_du, pos_r); - du_down = tex1Dfetch(tex_du, pos_d); - du_up = tex1Dfetch(tex_du, pos_u); - - dv = tex1Dfetch(tex_dv, pos); - dv_left = tex1Dfetch(tex_dv, pos_l); - dv_right = tex1Dfetch(tex_dv, pos_r); - dv_down = tex1Dfetch(tex_dv, pos_d); - dv_up = tex1Dfetch(tex_dv, pos_u); - + u_left = texU(pos_l); + u_right = texU(pos_r); + u_down = texU(pos_d); + u_up = texU(pos_u); + u = texU(pos); + + v_left = texV(pos_l); + v_right = texV(pos_r); + v_down = texV(pos_d); + v = texV(pos); + v_up = texV(pos_u); + + du = texDu(pos); + du_left = texDu(pos_l); + du_right = texDu(pos_r); + du_down = texDu(pos_d); + du_up = texDu(pos_u); + + dv = texDv(pos); + dv_left = texDv(pos_l); + dv_right = texDv(pos_r); + dv_down = texDv(pos_d); + dv_up = texDv(pos_u); float numerator_dudv = g_numerator_dudv[pos]; if((i+j)%2 == isBlack) @@ -624,52 +556,6 @@ template __global__ void sor_pass(float *new_du, /////////////////////////////////////////////////////////////////////////////// // utility functions /////////////////////////////////////////////////////////////////////////////// - -void initTexture1D(texture &tex) -{ - tex.addressMode[0] = cudaAddressModeClamp; - tex.filterMode = cudaFilterModePoint; - tex.normalized = false; -} - -void initTexture2D(texture &tex) -{ - tex.addressMode[0] = cudaAddressModeMirror; - tex.addressMode[1] = cudaAddressModeMirror; - tex.filterMode = cudaFilterModeLinear; - tex.normalized = true; -} - -void InitTextures() -{ - initTexture2D(tex_I0); - initTexture2D(tex_I1); - initTexture2D(tex_fine); // for downsampling - initTexture2D(tex_coarse); // for prolongation - - initTexture2D(tex_Ix); - initTexture2D(tex_Ixx); - initTexture2D(tex_Ix0); - - initTexture2D(tex_Iy); - initTexture2D(tex_Iyy); - initTexture2D(tex_Iy0); - - initTexture2D(tex_Ixy); - - initTexture1D(tex_u); - initTexture1D(tex_v); - initTexture1D(tex_du); - initTexture1D(tex_dv); - initTexture1D(tex_diffusivity_x); - initTexture1D(tex_diffusivity_y); - initTexture1D(tex_inv_denominator_u); - initTexture1D(tex_inv_denominator_v); - initTexture1D(tex_numerator_dudv); - initTexture1D(tex_numerator_u); - initTexture1D(tex_numerator_v); -} - namespace { struct ImagePyramid @@ -804,8 +690,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, cudaMemcpyHostToDevice), NCV_CUDA_ERROR); - - InitTextures(); } //prepare image pyramid @@ -909,9 +793,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR); //select images with lowest resolution - size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); FloatVector* ptrU = &u; @@ -941,17 +822,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR); - //texture format descriptor - cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc(); - I0 = *img0Iter; I1 = *img1Iter; ++img0Iter; ++img1Iter; - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR); + Texture texI0(kLevelHeight, kLevelWidth, I0->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror); + Texture texI1(kLevelHeight, kLevelWidth, I1->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror); //compute derivatives dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6)); @@ -991,20 +869,24 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI, nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); + Texture texIx(kLevelHeight, kLevelWidth, Ix.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror); + Texture texIxx(kLevelHeight, kLevelWidth, Ixx.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror); + Texture texIx0(kLevelHeight, kLevelWidth, Ix0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror); + Texture texIy(kLevelHeight, kLevelWidth, Iy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror); + Texture texIyy(kLevelHeight, kLevelWidth, Iyy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror); + Texture texIy0(kLevelHeight, kLevelWidth, Iy0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror); + Texture texIxy(kLevelHeight, kLevelWidth, Ixy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror); + Texture texDiffX(1, kLevelSizeInBytes / sizeof(float), diffusivity_x.ptr(), kLevelSizeInBytes); + Texture texDiffY(1, kLevelSizeInBytes / sizeof(float), diffusivity_y.ptr(), kLevelSizeInBytes); // flow - ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + Texture texU(1, kLevelSizeInBytes / sizeof(float), ptrU->ptr(), kLevelSizeInBytes); + Texture texV(1, kLevelSizeInBytes / sizeof(float), ptrV->ptr(), kLevelSizeInBytes); // flow increments - ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + Texture texDu(1, kLevelSizeInBytes / sizeof(float), du.ptr(), kLevelSizeInBytes); + Texture texDv(1, kLevelSizeInBytes / sizeof(float), dv.ptr(), kLevelSizeInBytes); + Texture texDuNew(1, kLevelSizeInBytes / sizeof(float), du_new.ptr(), kLevelSizeInBytes); + Texture texDvNew(1, kLevelSizeInBytes / sizeof(float), dv_new.ptr(), kLevelSizeInBytes); dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT)); dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT); @@ -1018,89 +900,30 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration) { //compute coefficients - prepare_sor_stage_1_tex<<>> - (diffusivity_x.ptr(), - diffusivity_y.ptr(), - denom_u.ptr(), - denom_v.ptr(), - num_dudv.ptr(), - num_u.ptr(), - num_v.ptr(), - kLevelWidth, - kLevelHeight, - kLevelStride, - alpha, - gamma); + prepare_sor_stage_1_tex<<>> (texU, texV, texDu, texDv, texI0, texI1, texIx, texIxx, texIx0, texIy, texIyy, texIy0, texIxy, + diffusivity_x.ptr(), diffusivity_y.ptr(), denom_u.ptr(), denom_v.ptr(), num_dudv.ptr(), num_u.ptr(), num_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride, alpha, gamma); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - - ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - - ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - - prepare_sor_stage_2<<>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride); + prepare_sor_stage_2<<>>(texDiffX, texDiffY, denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); - // linear system coefficients - ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - - ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - - ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - - ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); //solve linear system for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration) { float omega = 1.99f; - - ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - - sor_pass<0><<>> - (du_new.ptr(), - dv_new.ptr(), - denom_u.ptr(), - denom_v.ptr(), - num_u.ptr(), - num_v.ptr(), - num_dudv.ptr(), - omega, - kLevelWidth, - kLevelHeight, - kLevelStride); + sor_pass<0><<>>(texU, texV, texDu, texDv, texDiffX, texDiffY, du_new.ptr(), dv_new.ptr(), denom_u.ptr(), denom_v.ptr(), + num_u.ptr(), num_v.ptr(), num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - - sor_pass<1><<>> - (du.ptr(), - dv.ptr(), - denom_u.ptr(), - denom_v.ptr(), - num_u.ptr(), - num_v.ptr(), - num_dudv.ptr(), - omega, - kLevelWidth, - kLevelHeight, - kLevelStride); - ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); + sor_pass<1><<>>(texU, texV, texDuNew, texDvNew, texDiffX, texDiffY, du.ptr(), dv.ptr(), denom_u.ptr(), denom_v.ptr(), num_u.ptr(), + num_v.ptr(),num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); }//end of solver loop }// end of inner loop diff --git a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu index 57506173f50..9760bcee523 100644 --- a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu +++ b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu @@ -72,6 +72,7 @@ #include "opencv2/cudalegacy/NCV.hpp" #include "opencv2/cudalegacy/NPP_staging.hpp" #include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp" +#include #include "NCVRuntimeTemplates.hpp" #include "NCVAlg.hpp" @@ -94,24 +95,6 @@ const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64; #define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2) -/** \internal -* Haar features solid array. -*/ -texture texHaarFeatures; - - -/** \internal -* Haar classifiers flattened trees container. -* Two parts: first contains root nodes, second - nodes that are referred by root nodes. -* Drawback: breaks tree locality (might cause more cache misses -* Advantage: No need to introduce additional 32-bit field to index root nodes offsets -*/ -texture texHaarClassifierNodes; - - -texture texIImage; - - __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages) { return d_Stages[iStage]; @@ -119,51 +102,37 @@ __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages) template -__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes) +__device__ HaarClassifierNode128 getClassifierNode(cv::cudev::TexturePtr texHaarClassifierNodes, Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes) { HaarClassifierNode128 tmpNode; if (tbCacheTextureCascade) - { - tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode); - } + tmpNode._ui4 = texHaarClassifierNodes(iNode); else - { tmpNode = d_ClassifierNodes[iNode]; - } return tmpNode; } template -__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features, - Ncv32f *weight, - Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight) +__device__ void getFeature(cv::cudev::TexturePtr texHaarFeatures, Ncv32u iFeature, HaarFeature64* d_Features, Ncv32f* weight, Ncv32u* rectX, Ncv32u* rectY, Ncv32u* rectWidth, Ncv32u* rectHeight) { HaarFeature64 feature; if (tbCacheTextureCascade) - { - feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature); - } + feature._ui2 = texHaarFeatures(iFeature); else - { feature = d_Features[iFeature]; - } feature.getRect(rectX, rectY, rectWidth, rectHeight); *weight = feature.getWeight(); } template -__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg) +__device__ Ncv32u getElemIImg(cv::cudev::TexturePtr texImg, Ncv32u x, Ncv32u *d_IImg) { if (tbCacheTextureIImg) - { - return tex1Dfetch(texIImage, x); - } + return texImg(x); else - { return d_IImg[x]; - } } @@ -203,17 +172,10 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u } -template -__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride, - Ncv32f *d_weights, Ncv32u weightsStride, - HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, - Ncv32u *d_inMask, Ncv32u *d_outMask, - Ncv32u mask1Dlen, Ncv32u mask2Dstride, - NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) +template +__global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr texImg, cv::cudev::TexturePtr texHaarFeatures, cv::cudev::TexturePtr texHaarClassifierNodes, + Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, + Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) { Ncv32u y_offs; Ncv32u x_offs; @@ -299,7 +261,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr { while (bMoreNodesToTraverse) { - HaarClassifierNode128 curNode = getClassifierNode(iNode, d_ClassifierNodes); + HaarClassifierNode128 curNode = getClassifierNode(texHaarClassifierNodes, iNode, d_ClassifierNodes); HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); Ncv32u iFeature = featuresDesc.getFeaturesOffset(); @@ -310,19 +272,17 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr { Ncv32f rectWeight; Ncv32u rectX, rectY, rectWidth, rectHeight; - getFeature - (iFeature + iRect, d_Features, - &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); + getFeature (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); Ncv32u iioffsTR = iioffsTL + rectWidth; Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; Ncv32u iioffsBR = iioffsBL + rectWidth; - Ncv32u rectSum = getElemIImg(iioffsBR, d_IImg) - - getElemIImg(iioffsBL, d_IImg) + - getElemIImg(iioffsTL, d_IImg) - - getElemIImg(iioffsTR, d_IImg); + Ncv32u rectSum = getElemIImg(texImg, iioffsBR, d_IImg) - + getElemIImg(texImg, iioffsBL, d_IImg) + + getElemIImg(texImg, iioffsTL, d_IImg) - + getElemIImg(texImg, iioffsTR, d_IImg); #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); @@ -393,15 +353,10 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr } -template -__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride, - Ncv32f *d_weights, Ncv32u weightsStride, - HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, - Ncv32u *d_inMask, Ncv32u *d_outMask, - Ncv32u mask1Dlen, Ncv32u mask2Dstride, - NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) +template +__global__ void applyHaarClassifierClassifierParallel(cv::cudev::TexturePtr texImg, cv::cudev::TexturePtr texHaarFeatures, cv::cudev::TexturePtr texHaarClassifierNodes, Ncv32u *d_IImg, + Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask, + Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) { Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x; @@ -439,7 +394,7 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm while (bMoreNodesToTraverse) { - HaarClassifierNode128 curNode = getClassifierNode(iNode, d_ClassifierNodes); + HaarClassifierNode128 curNode = getClassifierNode(texHaarClassifierNodes, iNode, d_ClassifierNodes); HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); Ncv32u iFeature = featuresDesc.getFeaturesOffset(); @@ -450,19 +405,17 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm { Ncv32f rectWeight; Ncv32u rectX, rectY, rectWidth, rectHeight; - getFeature - (iFeature + iRect, d_Features, - &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); + getFeature (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); Ncv32u iioffsTR = iioffsTL + rectWidth; Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; Ncv32u iioffsBR = iioffsBL + rectWidth; - Ncv32u rectSum = getElemIImg(iioffsBR, d_IImg) - - getElemIImg(iioffsBL, d_IImg) + - getElemIImg(iioffsTL, d_IImg) - - getElemIImg(iioffsTR, d_IImg); + Ncv32u rectSum = getElemIImg(texImg, iioffsBR, d_IImg) - + getElemIImg(texImg, iioffsBL, d_IImg) + + getElemIImg(texImg, iioffsTL, d_IImg) - + getElemIImg(texImg, iioffsTR, d_IImg); #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); @@ -578,8 +531,9 @@ struct applyHaarClassifierAnchorParallelFunctor { dim3 gridConf, blockConf; cudaStream_t cuStream; - - //Kernel arguments are stored as members; + cv::cudev::TexturePtr texImg; + cv::cudev::TexturePtr texHaarFeatures; + cv::cudev::TexturePtr texHaarClassifierNodes; Ncv32u *d_IImg; Ncv32u IImgStride; Ncv32f *d_weights; @@ -597,32 +551,12 @@ struct applyHaarClassifierAnchorParallelFunctor Ncv32f scaleArea; //Arguments are passed through the constructor - applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, - Ncv32u *_d_IImg, Ncv32u _IImgStride, - Ncv32f *_d_weights, Ncv32u _weightsStride, - HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, - Ncv32u *_d_inMask, Ncv32u *_d_outMask, - Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, - NcvSize32u _anchorsRoi, Ncv32u _startStageInc, - Ncv32u _endStageExc, Ncv32f _scaleArea) : - gridConf(_gridConf), - blockConf(_blockConf), - cuStream(_cuStream), - d_IImg(_d_IImg), - IImgStride(_IImgStride), - d_weights(_d_weights), - weightsStride(_weightsStride), - d_Features(_d_Features), - d_ClassifierNodes(_d_ClassifierNodes), - d_Stages(_d_Stages), - d_inMask(_d_inMask), - d_outMask(_d_outMask), - mask1Dlen(_mask1Dlen), - mask2Dstride(_mask2Dstride), - anchorsRoi(_anchorsRoi), - startStageInc(_startStageInc), - endStageExc(_endStageExc), - scaleArea(_scaleArea) + applyHaarClassifierAnchorParallelFunctor(cv::cudev::TexturePtr texImg_, cv::cudev::TexturePtr texHaarFeatures_, cv::cudev::TexturePtr texHaarClassifierNodes_, dim3 _gridConf, + dim3 _blockConf, cudaStream_t _cuStream, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, + HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi, Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) : + gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_), texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride), + d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features), d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen), + mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi), startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea) {} template @@ -635,43 +569,19 @@ struct applyHaarClassifierAnchorParallelFunctor Loki::TL::TypeAt::Result::value, Loki::TL::TypeAt::Result::value, Loki::TL::TypeAt::Result::value > - <<>> - (d_IImg, IImgStride, - d_weights, weightsStride, - d_Features, d_ClassifierNodes, d_Stages, - d_inMask, d_outMask, - mask1Dlen, mask2Dstride, - anchorsRoi, startStageInc, - endStageExc, scaleArea); + <<>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask, + d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea); } }; -void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, - NcvBool tbCacheTextureIImg, - NcvBool tbCacheTextureCascade, - NcvBool tbReadPixelIndexFromVector, - NcvBool tbDoAtomicCompaction, - - dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, - - Ncv32u *d_IImg, Ncv32u IImgStride, - Ncv32f *d_weights, Ncv32u weightsStride, - HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, - Ncv32u *d_inMask, Ncv32u *d_outMask, - Ncv32u mask1Dlen, Ncv32u mask2Dstride, - NcvSize32u anchorsRoi, Ncv32u startStageInc, - Ncv32u endStageExc, Ncv32f scaleArea) +void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction, + dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, cv::cudev::TexturePtr texImg, cv::cudev::TexturePtr texHaarFeatures, cv::cudev::TexturePtr texHaarClassifierNodes, Ncv32u *d_IImg, + Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask, + Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) { - - applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream, - d_IImg, IImgStride, - d_weights, weightsStride, - d_Features, d_ClassifierNodes, d_Stages, - d_inMask, d_outMask, - mask1Dlen, mask2Dstride, - anchorsRoi, startStageInc, - endStageExc, scaleArea); + applyHaarClassifierAnchorParallelFunctor functor(texImg, texHaarFeatures, texHaarClassifierNodes, gridConf, blockConf, cuStream, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, + d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea); //Second parameter is the number of "dynamic" template parameters NCVRuntimeTemplateBool::KernelCaller @@ -688,8 +598,9 @@ struct applyHaarClassifierClassifierParallelFunctor { dim3 gridConf, blockConf; cudaStream_t cuStream; - - //Kernel arguments are stored as members; + cv::cudev::TexturePtr texImg; + cv::cudev::TexturePtr texHaarFeatures; + cv::cudev::TexturePtr texHaarClassifierNodes; Ncv32u *d_IImg; Ncv32u IImgStride; Ncv32f *d_weights; @@ -707,32 +618,13 @@ struct applyHaarClassifierClassifierParallelFunctor Ncv32f scaleArea; //Arguments are passed through the constructor - applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, - Ncv32u *_d_IImg, Ncv32u _IImgStride, - Ncv32f *_d_weights, Ncv32u _weightsStride, - HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, - Ncv32u *_d_inMask, Ncv32u *_d_outMask, - Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, - NcvSize32u _anchorsRoi, Ncv32u _startStageInc, - Ncv32u _endStageExc, Ncv32f _scaleArea) : - gridConf(_gridConf), - blockConf(_blockConf), - cuStream(_cuStream), - d_IImg(_d_IImg), - IImgStride(_IImgStride), - d_weights(_d_weights), - weightsStride(_weightsStride), - d_Features(_d_Features), - d_ClassifierNodes(_d_ClassifierNodes), - d_Stages(_d_Stages), - d_inMask(_d_inMask), - d_outMask(_d_outMask), - mask1Dlen(_mask1Dlen), - mask2Dstride(_mask2Dstride), - anchorsRoi(_anchorsRoi), - startStageInc(_startStageInc), - endStageExc(_endStageExc), - scaleArea(_scaleArea) + applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, cv::cudev::TexturePtr texImg_, cv::cudev::TexturePtr texHaarFeatures_, + cv::cudev::TexturePtr texHaarClassifierNodes_, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features, + HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi, + Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) : gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_), + texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride), d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features), + d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen), mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi), + startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea) {} template @@ -743,40 +635,19 @@ struct applyHaarClassifierClassifierParallelFunctor Loki::TL::TypeAt::Result::value, Loki::TL::TypeAt::Result::value, Loki::TL::TypeAt::Result::value > - <<>> - (d_IImg, IImgStride, - d_weights, weightsStride, - d_Features, d_ClassifierNodes, d_Stages, - d_inMask, d_outMask, - mask1Dlen, mask2Dstride, - anchorsRoi, startStageInc, - endStageExc, scaleArea); + <<>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask, + d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea); } }; -void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, - NcvBool tbCacheTextureCascade, - NcvBool tbDoAtomicCompaction, - - dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, - - Ncv32u *d_IImg, Ncv32u IImgStride, - Ncv32f *d_weights, Ncv32u weightsStride, - HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, - Ncv32u *d_inMask, Ncv32u *d_outMask, - Ncv32u mask1Dlen, Ncv32u mask2Dstride, - NcvSize32u anchorsRoi, Ncv32u startStageInc, - Ncv32u endStageExc, Ncv32f scaleArea) +void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction, dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, + cv::cudev::TexturePtr texImg, cv::cudev::TexturePtr texHaarFeatures, cv::cudev::TexturePtr texHaarClassifierNodes, Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights, + Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride, + NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) { - applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, - d_IImg, IImgStride, - d_weights, weightsStride, - d_Features, d_ClassifierNodes, d_Stages, - d_inMask, d_outMask, - mask1Dlen, mask2Dstride, - anchorsRoi, startStageInc, - endStageExc, scaleArea); + applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, + d_ClassifierNodes, d_Stages, d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea); //Second parameter is the number of "dynamic" template parameters NCVRuntimeTemplateBool::KernelCaller @@ -1015,31 +886,15 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &integral, NCV_SKIP_COND_BEGIN + cv::cudev::Texture texImg; if (bTexCacheIImg) - { - cudaChannelFormatDesc cfdTexIImage; - cfdTexIImage = cudaCreateChannelDesc(); + texImg = cv::cudev::Texture((anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch(), integral.ptr()); - size_t alignmentOffset; - ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage, - (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR); - ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); - } - - if (bTexCacheCascade) - { - cudaChannelFormatDesc cfdTexHaarFeatures; - cudaChannelFormatDesc cfdTexHaarClassifierNodes; - cfdTexHaarFeatures = cudaCreateChannelDesc(); - cfdTexHaarClassifierNodes = cudaCreateChannelDesc(); - - size_t alignmentOffset; - ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures, - d_HaarFeatures.ptr(), cfdTexHaarFeatures,sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR); - ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes, - d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR); - ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); + cv::cudev::Texture texHaarFeatures; + cv::cudev::Texture texHaarClassifierNodes; + if (bTexCacheCascade) { + texHaarFeatures = cv::cudev::Texture(sizeof(HaarFeature64) * haar.NumFeatures, reinterpret_cast(d_HaarFeatures.ptr())); + texHaarClassifierNodes = cv::cudev::Texture(sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes, reinterpret_cast(d_HaarNodes.ptr())); } Ncv32u stageStartAnchorParallel = 0; @@ -1130,26 +985,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &integral, dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL), anchorsRoi.height); dim3 block1(NUM_THREADS_ANCHORSPARALLEL); - applyHaarClassifierAnchorParallelDynTemplate( - true, //tbInitMaskPositively - bTexCacheIImg, //tbCacheTextureIImg - bTexCacheCascade, //tbCacheTextureCascade - pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector - bDoAtomicCompaction, //tbDoAtomicCompaction - grid1, - block1, - cuStream, - integral.ptr(), integral.stride(), - d_weights.ptr(), d_weights.stride(), - d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), - d_ptrNowData->ptr(), - bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), - 0, - d_pixelMask.stride(), - anchorsRoi, - pixParallelStageStops[pixParallelStageStopsIndex], - pixParallelStageStops[pixParallelStageStopsIndex+1], - scaleAreaPixels); + applyHaarClassifierAnchorParallelDynTemplate( true, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0, bDoAtomicCompaction, grid1, block1, cuStream, + texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), + d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), 0, d_pixelMask.stride(), anchorsRoi, pixParallelStageStops[pixParallelStageStopsIndex], + pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels); ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); if (bDoAtomicCompaction) @@ -1200,26 +1039,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &integral, } dim3 block2(NUM_THREADS_ANCHORSPARALLEL); - applyHaarClassifierAnchorParallelDynTemplate( - false, //tbInitMaskPositively - bTexCacheIImg, //tbCacheTextureIImg - bTexCacheCascade, //tbCacheTextureCascade - pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector - bDoAtomicCompaction, //tbDoAtomicCompaction - grid2, - block2, - cuStream, - integral.ptr(), integral.stride(), - d_weights.ptr(), d_weights.stride(), - d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), - d_ptrNowData->ptr(), - bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), - numDetections, - d_pixelMask.stride(), - anchorsRoi, - pixParallelStageStops[pixParallelStageStopsIndex], - pixParallelStageStops[pixParallelStageStopsIndex+1], - scaleAreaPixels); + applyHaarClassifierAnchorParallelDynTemplate( false, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements, bDoAtomicCompaction, + grid2, block2, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), + d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections, d_pixelMask.stride(), anchorsRoi, + pixParallelStageStops[pixParallelStageStopsIndex], pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels); ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); if (bDoAtomicCompaction) @@ -1263,24 +1086,9 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &integral, } dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL); - applyHaarClassifierClassifierParallelDynTemplate( - bTexCacheIImg, //tbCacheTextureIImg - bTexCacheCascade, //tbCacheTextureCascade - bDoAtomicCompaction, //tbDoAtomicCompaction - grid3, - block3, - cuStream, - integral.ptr(), integral.stride(), - d_weights.ptr(), d_weights.stride(), - d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), - d_ptrNowData->ptr(), - bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), - numDetections, - d_pixelMask.stride(), - anchorsRoi, - stageMiddleSwitch, - stageEndClassifierParallel, - scaleAreaPixels); + applyHaarClassifierClassifierParallelDynTemplate(bTexCacheIImg, bTexCacheCascade, bDoAtomicCompaction, grid3, block3, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), + d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections, + d_pixelMask.stride(), anchorsRoi, stageMiddleSwitch, stageEndClassifierParallel, scaleAreaPixels); ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); if (bDoAtomicCompaction) diff --git a/modules/cudalegacy/src/cuda/NPP_staging.cu b/modules/cudalegacy/src/cuda/NPP_staging.cu index 90880d56cc5..6626526f737 100644 --- a/modules/cudalegacy/src/cuda/NPP_staging.cu +++ b/modules/cudalegacy/src/cuda/NPP_staging.cu @@ -48,12 +48,7 @@ #include "opencv2/cudev.hpp" #include "opencv2/cudalegacy/NPP_staging.hpp" - - -texture tex8u; -texture tex32u; -texture tex64u; - +#include //============================================================================== // @@ -71,7 +66,6 @@ cudaStream_t nppStGetActiveCUDAstream(void) } - cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream) { cudaStream_t tmp = nppStream; @@ -117,25 +111,25 @@ private: template -inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs); +inline __device__ T readElem(cv::cudev::TexturePtr tex8u, T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs); template<> -inline __device__ Ncv8u readElem(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) +inline __device__ Ncv8u readElem(cv::cudev::TexturePtr tex8u, Ncv8u* d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) { - return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs); + return tex8u(texOffs + srcStride * blockIdx.x + curElemOffs); } template<> -inline __device__ Ncv32u readElem(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) +inline __device__ Ncv32u readElem(cv::cudev::TexturePtr tex8u, Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) { return d_src[curElemOffs]; } template<> -inline __device__ Ncv32f readElem(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) +inline __device__ Ncv32f readElem(cv::cudev::TexturePtr tex8u, Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) { return d_src[curElemOffs]; } @@ -160,8 +154,7 @@ inline __device__ Ncv32f readElem(Ncv32f *d_src, Ncv32u texOffs, Ncv32u * \return None */ template -__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride, - T_out *d_II, Ncv32u IIstride) +__global__ void scanRows(cv::cudev::TexturePtr tex8u, T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride, T_out *d_II, Ncv32u IIstride) { //advance pointers to the current line if (sizeof(T_in) != 1) @@ -190,7 +183,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr if (curElemOffs < srcWidth) { //load elements - curElem = readElem(d_src, texOffs, srcStride, curElemOffs); + curElem = readElem(tex8u, d_src, texOffs, srcStride, curElemOffs); } curElemMod = _scanElemOp::scanElemOp(curElem); @@ -224,25 +217,9 @@ template NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride, T_out *d_dst, Ncv32u dstStride, NcvSize32u roi) { - cudaChannelFormatDesc cfdTex; - size_t alignmentOffset = 0; - if (sizeof(T_in) == 1) - { - cfdTex = cudaCreateChannelDesc(); - ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR); - if (alignmentOffset > 0) - { - ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR); - } - } - scanRows - - <<>> - (d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride); - + cv::cudev::Texture tex8u(static_cast(roi.height * srcStride), (Ncv8u*)d_src); + scanRows <<>> (tex8u, d_src, 0, roi.width, srcStride, d_dst, dstStride); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); - return NPPST_SUCCESS; } @@ -585,59 +562,25 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep, const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32; const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8; - -template -__device__ T getElem_Decimate(Ncv32u x, T *d_src); - - -template<> -__device__ Ncv32u getElem_Decimate(Ncv32u x, Ncv32u *d_src) -{ - return tex1Dfetch(tex32u, x); -} - - -template<> -__device__ Ncv32u getElem_Decimate(Ncv32u x, Ncv32u *d_src) -{ - return d_src[x]; -} - - -template<> -__device__ Ncv64u getElem_Decimate(Ncv32u x, Ncv64u *d_src) -{ - uint2 tmp = tex1Dfetch(tex64u, x); - Ncv64u res = (Ncv64u)tmp.y; - res <<= 32; - res |= tmp.x; - return res; -} - - -template<> -__device__ Ncv64u getElem_Decimate(Ncv32u x, Ncv64u *d_src) +template +__global__ void decimate_C1R(T* d_src, Ncv32u srcStep, T* d_dst, Ncv32u dstStep, NcvSize32u dstRoi, Ncv32u scale) { - return d_src[x]; + int curX = blockIdx.x * blockDim.x + threadIdx.x; + int curY = blockIdx.y * blockDim.y + threadIdx.y; + if (curX >= dstRoi.width || curY >= dstRoi.height) return; + d_dst[curY * dstStep + curX] = d_src[(curY * srcStep + curX) * scale]; } - -template -__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep, - NcvSize32u dstRoi, Ncv32u scale) +template +__global__ void decimate_C1R(cv::cudev::TexturePtr texSrc, Ncv32u srcStep, T* d_dst, Ncv32u dstStep, + NcvSize32u dstRoi, Ncv32u scale) { int curX = blockIdx.x * blockDim.x + threadIdx.x; int curY = blockIdx.y * blockDim.y + threadIdx.y; - - if (curX >= dstRoi.width || curY >= dstRoi.height) - { - return; - } - - d_dst[curY * dstStep + curX] = getElem_Decimate((curY * srcStep + curX) * scale, d_src); + if (curX >= dstRoi.width || curY >= dstRoi.height) return; + d_dst[curY * dstStep + curX] = texSrc((curY * srcStep + curX) * scale); } - template static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep, @@ -659,39 +602,12 @@ static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep, dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X, (dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y); dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y); - - if (!readThruTexture) - { - decimate_C1R - - <<>> - (d_src, srcStep, d_dst, dstStep, dstRoi, scale); + if (!readThruTexture) { + decimate_C1R<<>>(d_src, srcStep, d_dst, dstStep, dstRoi, scale); } - else - { - cudaChannelFormatDesc cfdTexSrc; - - if (sizeof(T) == sizeof(Ncv32u)) - { - cfdTexSrc = cudaCreateChannelDesc(); - - size_t alignmentOffset; - ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR); - ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR); - } - else - { - cfdTexSrc = cudaCreateChannelDesc(); - - size_t alignmentOffset; - ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR); - ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR); - } - - decimate_C1R - - <<>> - (d_src, srcStep, d_dst, dstStep, dstRoi, scale); + else { + cv::cudev::Texture texSrc(srcRoi.height * srcStep * sizeof(T), d_src); + decimate_C1R << > > (texSrc, srcStep, d_dst, dstStep, dstRoi, scale); } ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); @@ -753,11 +669,7 @@ static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep, implementNppDecimate(32, u) -implementNppDecimate(32, s) -implementNppDecimate(32, f) implementNppDecimate(64, u) -implementNppDecimate(64, s) -implementNppDecimate(64, f) implementNppDecimateHost(32, u) implementNppDecimateHost(32, s) implementNppDecimateHost(32, f) @@ -776,43 +688,29 @@ implementNppDecimateHost(64, f) const Ncv32u NUM_RECTSTDDEV_THREADS = 128; -template -__device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum) +template +__device__ Ncv32u getElemSum(Ptr2D tex, Ncv32u x, Ncv32u *d_sum) { if (tbCacheTexture) - { - return tex1Dfetch(tex32u, x); - } + return tex(x); else - { return d_sum[x]; - } } -template -__device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum) +template +__device__ Ncv64u getElemSqSum(Ptr2D tex, Ncv32u x, Ncv64u *d_sqsum) { if (tbCacheTexture) - { - uint2 tmp = tex1Dfetch(tex64u, x); - Ncv64u res = (Ncv64u)tmp.y; - res <<= 32; - res |= tmp.x; - return res; - } + return tex(x); else - { return d_sqsum[x]; - } } template -__global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep, - Ncv64u *d_sqsum, Ncv32u sqsumStep, - Ncv32f *d_norm, Ncv32u normStep, - NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea) +__global__ void rectStdDev_32f_C1R(cv::cudev::TexturePtr texSum, cv::cudev::TexturePtr texSumSq, Ncv32u *d_sum, Ncv32u sumStep, Ncv64u *d_sqsum, Ncv32u sqsumStep, + Ncv32f *d_norm, Ncv32u normStep, NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea) { Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x; if (x_offs >= roi.width) @@ -824,17 +722,17 @@ __global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep, Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs; //OPT: try swapping order (could change cache hit/miss ratio) - Ncv32u sum_tl = getElemSum(sum_offset + rect.y * sumStep + rect.x, d_sum); - Ncv32u sum_bl = getElemSum(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum); - Ncv32u sum_tr = getElemSum(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum); - Ncv32u sum_br = getElemSum(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum); + Ncv32u sum_tl = getElemSum(texSum, sum_offset + rect.y * sumStep + rect.x, d_sum); + Ncv32u sum_bl = getElemSum(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum); + Ncv32u sum_tr = getElemSum(texSum, sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum); + Ncv32u sum_br = getElemSum(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum); Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl; Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br; - sqsum_tl = getElemSqSum(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum); - sqsum_bl = getElemSqSum(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum); - sqsum_tr = getElemSqSum(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum); - sqsum_br = getElemSqSum(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum); + sqsum_tl = getElemSqSum(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum); + sqsum_bl = getElemSqSum(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum); + sqsum_tr = getElemSqSum(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum); + sqsum_br = getElemSqSum(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum); Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl; Ncv32f mean = sum_val * invRectArea; @@ -897,31 +795,12 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep, dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height); dim3 block(NUM_RECTSTDDEV_THREADS); + cv::cudev::Texture texSum((roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u), d_sum); + cv::cudev::Texture texSumSq((roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u), d_sqsum); if (!readThruTexture) - { - rectStdDev_32f_C1R - - <<>> - (d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea); - } + rectStdDev_32f_C1R<<>>(texSum, texSumSq, d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea); else - { - cudaChannelFormatDesc cfdTexSrc; - cudaChannelFormatDesc cfdTexSqr; - cfdTexSrc = cudaCreateChannelDesc(); - cfdTexSqr = cudaCreateChannelDesc(); - - size_t alignmentOffset; - ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_sum, cfdTexSrc, (roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u)), NPPST_TEXTURE_BIND_ERROR); - ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR); - ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_sqsum, cfdTexSqr, (roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u)), NPPST_TEXTURE_BIND_ERROR); - ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR); - - rectStdDev_32f_C1R - - <<>> - (NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea); - } + rectStdDev_32f_C1R<<>>(texSum, texSumSq, NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); @@ -1553,40 +1432,24 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen, // //============================================================================== - -texture texSrc; -texture texKernel; - - -__forceinline__ __device__ float getValueMirrorRow(const int rowOffset, - int i, - int w) +__forceinline__ __device__ float getValueMirrorRow(cv::cudev::TexturePtr< Ncv32f> tex, const int rowOffset, int i, int w) { if (i < 0) i = 1 - i; if (i >= w) i = w + w - i - 1; - return tex1Dfetch (texSrc, rowOffset + i); + return tex(rowOffset + i); } -__forceinline__ __device__ float getValueMirrorColumn(const int offset, - const int rowStep, - int j, - int h) +__forceinline__ __device__ float getValueMirrorColumn(cv::cudev::TexturePtr< Ncv32f> tex, const int offset, const int rowStep, int j, int h) { if (j < 0) j = 1 - j; if (j >= h) j = h + h - j - 1; - return tex1Dfetch (texSrc, offset + j * rowStep); + return tex(offset + j * rowStep); } -__global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep, - Ncv32f *pDst, - NcvSize32u dstSize, - Ncv32u dstStep, - NcvRect32u roi, - Ncv32s nKernelSize, - Ncv32s nAnchor, - Ncv32f multiplier) +__global__ void FilterRowBorderMirror_32f_C1R(cv::cudev::TexturePtr texSrc, cv::cudev::TexturePtr texKernel1, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep, + NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier) { // position within ROI const int ix = blockDim.x * blockIdx.x + threadIdx.x; @@ -1606,22 +1469,16 @@ __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep, float sum = 0.0f; for (int m = 0; m < nKernelSize; ++m) { - sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width) - * tex1Dfetch (texKernel, m); + sum += getValueMirrorRow(texSrc, rowOffset, ix + m - p, roi.width) + * texKernel1(m); } pDst[iy * dstStep + ix] = sum * multiplier; } -__global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep, - Ncv32f *pDst, - NcvSize32u dstSize, - Ncv32u dstStep, - NcvRect32u roi, - Ncv32s nKernelSize, - Ncv32s nAnchor, - Ncv32f multiplier) +__global__ void FilterColumnBorderMirror_32f_C1R(cv::cudev::TexturePtr texSrc, cv::cudev::TexturePtr texKernel, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep, + NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier) { const int ix = blockDim.x * blockIdx.x + threadIdx.x; const int iy = blockDim.y * blockIdx.y + threadIdx.y; @@ -1638,15 +1495,15 @@ __global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep, float sum = 0.0f; for (int m = 0; m < nKernelSize; ++m) { - sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height) - * tex1Dfetch (texKernel, m); + sum += getValueMirrorColumn(texSrc, offset, srcStep, iy + m - p, roi.height) + * texKernel(m); } pDst[ix + iy * dstStep] = sum * multiplier; } -NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, +NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc, NcvSize32u srcSize, Ncv32u nSrcStep, Ncv32f *pDst, @@ -1654,7 +1511,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, Ncv32u nDstStep, NcvRect32u oROI, NppStBorderType borderType, - const Ncv32f *pKernel, + Ncv32f *pKernel, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier) @@ -1686,12 +1543,8 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, oROI.height = srcSize.height - oROI.y; } - cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc (); - texSrc.normalized = false; - texKernel.normalized = false; - - cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep); - cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f)); + cv::cudev::Texture texSrc(srcSize.height * nSrcStep, pSrc); + cv::cudev::Texture texKernel(nKernelSize * sizeof(Ncv32f), pKernel); dim3 ctaSize (32, 6); dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x, @@ -1706,8 +1559,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, case nppStBorderWrap: return NPPST_ERROR; case nppStBorderMirror: - FilterRowBorderMirror_32f_C1R <<>> - (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier); + FilterRowBorderMirror_32f_C1R <<>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); break; default: @@ -1718,7 +1570,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, } -NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, +NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc, NcvSize32u srcSize, Ncv32u nSrcStep, Ncv32f *pDst, @@ -1726,7 +1578,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, Ncv32u nDstStep, NcvRect32u oROI, NppStBorderType borderType, - const Ncv32f *pKernel, + Ncv32f *pKernel, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier) @@ -1758,12 +1610,8 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, oROI.height = srcSize.height - oROI.y; } - cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc (); - texSrc.normalized = false; - texKernel.normalized = false; - - cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep); - cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f)); + cv::cudev::Texture texSrc(srcSize.height * nSrcStep, pSrc); + cv::cudev::Texture texKernel(nKernelSize * sizeof(Ncv32f), pKernel); dim3 ctaSize (32, 6); dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x, @@ -1776,8 +1624,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, case nppStBorderWrap: return NPPST_ERROR; case nppStBorderMirror: - FilterColumnBorderMirror_32f_C1R <<>> - (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier); + FilterColumnBorderMirror_32f_C1R <<>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); break; default: @@ -1800,16 +1647,11 @@ inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom) return (num + denom - 1)/denom; } - -texture tex_src1; -texture tex_src0; - - -__global__ void BlendFramesKernel(const float *u, const float *v, // forward flow - const float *ur, const float *vr, // backward flow - const float *o0, const float *o1, // coverage masks - int w, int h, int s, - float theta, float *out) +__global__ void BlendFramesKernel(cv::cudev::TexturePtr texSrc0, cv::cudev::TexturePtr texSrc1, + const float *u, const float *v, // forward flow + const float *ur, const float *vr, // backward flow + const float *o0, const float *o1, // coverage masks + int w, int h, int s, float theta, float *out) { const int ix = threadIdx.x + blockDim.x * blockIdx.x; const int iy = threadIdx.y + blockDim.y * blockIdx.y; @@ -1829,27 +1671,17 @@ __global__ void BlendFramesKernel(const float *u, const float *v, // forward f bool b0 = o0[pos] > 1e-4f; bool b1 = o1[pos] > 1e-4f; - if (b0 && b1) - { - // pixel is visible on both frames - out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) + - tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta; - } - else if (b0) - { - // visible on the first frame only - out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta); - } - else - { - // visible on the second frame only - out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta)); - } + if (b0 && b1) // pixel is visible on both frames + out[pos] = texSrc0(y - _v * theta, x - _u * theta)* (1.0f - theta) + texSrc0(y + _v * (1.0f - theta), x + _u * (1.0f - theta)) * theta; + else if (b0) // visible on the first frame only + out[pos] = texSrc0(y - _v * theta, x - _u * theta); + else // visible on the second frame only + out[pos] = texSrc1(y - _vr * (1.0f - theta), x - _ur * (1.0f - theta)); } -NCVStatus BlendFrames(const Ncv32f *src0, - const Ncv32f *src1, +NCVStatus BlendFrames(Ncv32f *src0, + Ncv32f *src1, const Ncv32f *ufi, const Ncv32f *vfi, const Ncv32f *ubi, @@ -1862,29 +1694,13 @@ NCVStatus BlendFrames(const Ncv32f *src0, Ncv32f theta, Ncv32f *out) { - tex_src1.addressMode[0] = cudaAddressModeClamp; - tex_src1.addressMode[1] = cudaAddressModeClamp; - tex_src1.filterMode = cudaFilterModeLinear; - tex_src1.normalized = false; - - tex_src0.addressMode[0] = cudaAddressModeClamp; - tex_src0.addressMode[1] = cudaAddressModeClamp; - tex_src0.filterMode = cudaFilterModeLinear; - tex_src0.normalized = false; - - cudaChannelFormatDesc desc = cudaCreateChannelDesc (); const Ncv32u pitch = stride * sizeof (float); - ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR); - ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR); - + cv::cudev::Texture texSrc0(height, width, src0, pitch, false, cudaFilterModeLinear); + cv::cudev::Texture texSrc1(height, width, src1, pitch, false, cudaFilterModeLinear); dim3 threads (32, 4); dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y)); - - BlendFramesKernel<<>> - (ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out); - + BlendFramesKernel<<>>(texSrc0, texSrc1, ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); - return NPPST_SUCCESS; } @@ -2255,44 +2071,27 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc, // //============================================================================== - -texture texSrc2D; - - __forceinline__ -__device__ float processLine(int spos, - float xmin, - float xmax, - int ixmin, - int ixmax, - float fxmin, - float cxmax) +__device__ float processLine(cv::cudev::TexturePtr tex, int spos, float xmin, float xmax, int ixmin, int ixmax, float fxmin, float cxmax) { // first element float wsum = 1.0f - xmin + fxmin; - float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin); + float sum = tex( spos) * (1.0f - xmin + fxmin); spos++; for (int ix = ixmin + 1; ix < ixmax; ++ix) { - sum += tex1Dfetch(texSrc, spos); + sum += tex(spos); spos++; wsum += 1.0f; } - sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax); + sum += tex(spos) * (cxmax - xmax); wsum += cxmax - xmax; return sum / wsum; } -__global__ void resizeSuperSample_32f(NcvSize32u srcSize, - Ncv32u srcStep, - NcvRect32u srcROI, - Ncv32f *dst, - NcvSize32u dstSize, - Ncv32u dstStep, - NcvRect32u dstROI, - Ncv32f scaleX, - Ncv32f scaleY) +__global__ void resizeSuperSample_32f(cv::cudev::TexturePtr texSrc, NcvSize32u srcSize, Ncv32u srcStep, NcvRect32u srcROI, Ncv32f *dst, NcvSize32u dstSize, Ncv32u dstStep, + NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY) { // position within dst ROI const int ix = blockIdx.x * blockDim.x + threadIdx.x; @@ -2332,18 +2131,18 @@ __global__ void resizeSuperSample_32f(NcvSize32u srcSize, float wsum = 1.0f - yBegin + floorYBegin; - float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, + float sum = processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, ceilXEnd) * (1.0f - yBegin + floorYBegin); pos += srcStep; for (int iy = iYBegin + 1; iy < iYEnd; ++iy) { - sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, + sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, ceilXEnd); pos += srcStep; wsum += 1.0f; } - sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, + sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, ceilXEnd) * (ceilYEnd - yEnd); wsum += ceilYEnd - yEnd; sum /= wsum; @@ -2372,14 +2171,7 @@ __device__ float bicubicCoeff(float x_) } -__global__ void resizeBicubic(NcvSize32u srcSize, - NcvRect32u srcROI, - NcvSize32u dstSize, - Ncv32u dstStep, - Ncv32f *dst, - NcvRect32u dstROI, - Ncv32f scaleX, - Ncv32f scaleY) +__global__ void resizeBicubic(cv::cudev::TexturePtr texSrc, NcvSize32u srcSize, NcvRect32u srcROI, NcvSize32u dstSize, Ncv32u dstStep, Ncv32f *dst, NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY) { const int ix = blockIdx.x * blockDim.x + threadIdx.x; const int iy = blockIdx.y * blockDim.y + threadIdx.y; @@ -2433,7 +2225,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize, float wx = bicubicCoeff (xDist); float wy = bicubicCoeff (yDist); wx *= wy; - sum += wx * tex2D (texSrc2D, cx * dx, cy * dy); + sum += wx * texSrc(cy * dy, cx * dx); wsum += wx; } } @@ -2441,7 +2233,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize, } -NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc, +NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc, NcvSize32u srcSize, Ncv32u nSrcStep, NcvRect32u srcROI, @@ -2469,33 +2261,17 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc, if (interpolation == nppStSupersample) { - // bind texture - cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep); - // invoke kernel + cv::cudev::Texture texSrc(srcSize.height * nSrcStep, pSrc); dim3 ctaSize (32, 6); - dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x, - (dstROI.height + ctaSize.y - 1) / ctaSize.y); - - resizeSuperSample_32f <<>> - (srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor); + dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x,(dstROI.height + ctaSize.y - 1) / ctaSize.y); + resizeSuperSample_32f <<>> (texSrc, srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor); } else if (interpolation == nppStBicubic) { - texSrc2D.addressMode[0] = cudaAddressModeMirror; - texSrc2D.addressMode[1] = cudaAddressModeMirror; - texSrc2D.normalized = true; - - cudaChannelFormatDesc desc = cudaCreateChannelDesc (); - - cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height, - nSrcStep); - + cv::cudev::Texture texSrc(srcSize.height, srcSize.width, pSrc, nSrcStep, true, cudaFilterModePoint, cudaAddressModeMirror); dim3 ctaSize (32, 6); - dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x, - (dstSize.height + ctaSize.y - 1) / ctaSize.y); - - resizeBicubic <<>> - (srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor); + dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x, (dstSize.height + ctaSize.y - 1) / ctaSize.y); + resizeBicubic <<>> (texSrc, srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor); } else { diff --git a/modules/cudalegacy/src/cuda/bm.cu b/modules/cudalegacy/src/cuda/bm.cu index 1307a8e3275..546f0903b05 100644 --- a/modules/cudalegacy/src/cuda/bm.cu +++ b/modules/cudalegacy/src/cuda/bm.cu @@ -46,29 +46,27 @@ #include "opencv2/core/cuda/limits.hpp" #include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/reduce.hpp" +#include using namespace cv::cuda; using namespace cv::cuda::device; namespace optflowbm { - texture tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp); - texture tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp); - - __device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize) + __device__ int cmpBlocks(cv::cudev::TexturePtr texCurr, cv::cudev::TexturePtr texPrev, int X1, int Y1, int X2, int Y2, int2 blockSize) { int s = 0; for (int y = 0; y < blockSize.y; ++y) { for (int x = 0; x < blockSize.x; ++x) - s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y)); + s += ::abs(texPrev(Y1 + y, X1 + x) -texCurr(Y2 + y, X2 + x)); } return s; } - __global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious, + __global__ void calcOptFlowBM(cv::cudev::TexturePtr texPrev, cv::cudev::TexturePtr texCurr, PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious, const int maxX, const int maxY, const int acceptLevel, const int escapeLevel, const short2* ss, const int ssCount) { @@ -90,7 +88,7 @@ namespace optflowbm int dist = numeric_limits::max(); if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY) - dist = cmpBlocks(X1, Y1, X2, Y2, blockSize); + dist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize); int countMin = 1; int sumx = offX; @@ -111,7 +109,7 @@ namespace optflowbm if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY) { - const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize); + const int tmpDist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize); if (tmpDist < acceptLevel) { sumx = dx; @@ -151,16 +149,12 @@ namespace optflowbm void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious, int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream) { - bindTexture(&tex_prev, prev); - bindTexture(&tex_curr, curr); - + cv::cudev::Texture texPrev(prev); + cv::cudev::Texture texCurr(curr); const dim3 block(32, 8); const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y)); - - calcOptFlowBM<<>>(velx, vely, blockSize, shiftSize, usePrevious, - maxX, maxY, acceptLevel, escapeLevel, ss, ssCount); + calcOptFlowBM<<>>(texPrev, texCurr, velx, vely, blockSize, shiftSize, usePrevious, maxX, maxY, acceptLevel, escapeLevel, ss, ssCount); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } diff --git a/modules/cudalegacy/test/TestHypothesesGrow.cpp b/modules/cudalegacy/test/TestHypothesesGrow.cpp index e7fe4d939df..ad4c3c9df3c 100644 --- a/modules/cudalegacy/test/TestHypothesesGrow.cpp +++ b/modules/cudalegacy/test/TestHypothesesGrow.cpp @@ -100,7 +100,8 @@ bool TestHypothesesGrow::process() NCV_SKIP_COND_BEGIN ncvAssertReturn(this->src.fill(h_vecSrc), false); - memset(h_vecDst.ptr(), 0, h_vecDst.length() * sizeof(NcvRect32u)); + + *h_vecDst.ptr() = {}; NCVVectorReuse h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u)); ncvAssertReturn(h_vecDst_as32u.isMemReused(), false); ncvAssertReturn(this->src.fill(h_vecDst_as32u), false); diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu index 5c12860620a..c7d72bfa9f8 100644 --- a/modules/cudaobjdetect/src/cuda/hog.cu +++ b/modules/cudaobjdetect/src/cuda/hog.cu @@ -46,6 +46,7 @@ #include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/warp_shuffle.hpp" +#include namespace cv { namespace cuda { namespace device { @@ -825,64 +826,44 @@ namespace cv { namespace cuda { namespace device //------------------------------------------------------------------- // Resize - texture resize8UC4_tex; - texture resize8UC1_tex; - - __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz dst, int colOfs) + __global__ void resize_for_hog_kernel(cv::cudev::TexturePtr src, float sx, float sy, PtrStepSz dst) { unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < dst.cols && y < dst.rows) - dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255; + dst.ptr(y)[x] = src(x * sx, y * sy) * 255; } - __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz dst, int colOfs) + __global__ void resize_for_hog_kernel(cv::cudev::TexturePtr src, float sx, float sy, PtrStepSz dst) { unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < dst.cols && y < dst.rows) { - float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy); + uchar4 val = src(x * sx, y * sy); dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255); } } - template - static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex) + template + static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst) { - tex.filterMode = cudaFilterModeLinear; - - size_t texOfs = 0; - int colOfs = 0; - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) ); - - if (texOfs != 0) - { - colOfs = static_cast( texOfs/sizeof(T) ); - cudaSafeCall( cudaUnbindTexture(tex) ); - cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) ); - } - + cv::cudev::Texture tex(src.rows, src.cols, src.data, src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat); dim3 threads(32, 8); dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y)); float sx = static_cast(src.cols) / dst.cols; float sy = static_cast(src.rows) / dst.rows; - resize_for_hog_kernel<<>>(sx, sy, (PtrStepSz)dst, colOfs); + resize_for_hog_kernel<<>>(tex, sx, sy, (PtrStepSz)dst); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - - cudaSafeCall( cudaUnbindTexture(tex) ); } - void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog (src, dst, resize8UC1_tex); } - void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog(src, dst, resize8UC4_tex); } + void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog (src, dst); } + void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog(src, dst); } } // namespace hog }}} // namespace cv { namespace cuda { namespace cudev diff --git a/modules/cudaobjdetect/test/test_objdetect.cpp b/modules/cudaobjdetect/test/test_objdetect.cpp index 4843cc483ef..b12ad37f6dc 100644 --- a/modules/cudaobjdetect/test/test_objdetect.cpp +++ b/modules/cudaobjdetect/test/test_objdetect.cpp @@ -222,7 +222,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES); */ //============== caltech hog tests =====================// -struct CalTech : public ::testing::TestWithParam > +struct CalTech : public ::testing::TestWithParam > { cv::cuda::DeviceInfo devInfo; cv::Mat img; @@ -232,7 +232,13 @@ struct CalTech : public ::testing::TestWithParam("caltech/image_00000009_0.png", "caltech/image_00000032_0.png", "caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png", - "caltech/image_00000527_0.png", "caltech/image_00000574_0.png"))); + "caltech/image_00000527_0.png", "caltech/image_00000574_0.png"), testing::Values(GREYSCALE))); //------------------------variable GPU HOG Tests------------------------// diff --git a/modules/cudaoptflow/src/cuda/pyrlk.cu b/modules/cudaoptflow/src/cuda/pyrlk.cu index ca9759c2e53..da53046eae4 100644 --- a/modules/cudaoptflow/src/cuda/pyrlk.cu +++ b/modules/cudaoptflow/src/cuda/pyrlk.cu @@ -50,8 +50,7 @@ #include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/border_interpolate.hpp" - -#include +#include using namespace cv::cuda; using namespace cv::cuda::device; @@ -64,224 +63,6 @@ namespace pyrlk __constant__ int c_halfWin_y; __constant__ int c_iters; - texture tex_I8U(false, cudaFilterModeLinear, cudaAddressModeClamp); - texture tex_I8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp); - - texture tex_I16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp); - - - texture tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp); - texture tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp); - - texture tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp); - - texture tex_J8U(false, cudaFilterModeLinear, cudaAddressModeClamp); - texture tex_J8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp); - - texture tex_J16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp); - - - texture tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp); - texture tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp); - - - template struct Tex_I - { - static __host__ __forceinline__ void bindTexture_(PtrStepSz::vec_type> I) - { - CV_UNUSED(I); - } - }; - - template <> struct Tex_I<1, uchar> - { - static __device__ __forceinline__ float read(float x, float y) - { - return tex2D(tex_I8U, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& I) - { - bindTexture(&tex_I8U, I); - } - }; - template <> struct Tex_I<1, ushort> - { - static __device__ __forceinline__ float read(float x, float y) - { - return 0.0; - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& I) - { - CV_UNUSED(I); - } - }; - template <> struct Tex_I<1, int> - { - static __device__ __forceinline__ float read(float x, float y) - { - return 0.0; - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& I) - { - CV_UNUSED(I); - } - }; - template <> struct Tex_I<1, float> - { - static __device__ __forceinline__ float read(float x, float y) - { - return tex2D(tex_If, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& I) - { - bindTexture(&tex_If, I); - } - }; - // ****************** 3 channel specializations ************************ - template <> struct Tex_I<3, uchar> - { - static __device__ __forceinline__ float3 read(float x, float y) - { - return make_float3(0,0,0); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz I) - { - CV_UNUSED(I); - } - }; - template <> struct Tex_I<3, ushort> - { - static __device__ __forceinline__ float3 read(float x, float y) - { - return make_float3(0, 0, 0); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz I) - { - CV_UNUSED(I); - } - }; - template <> struct Tex_I<3, int> - { - static __device__ __forceinline__ float3 read(float x, float y) - { - return make_float3(0, 0, 0); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz I) - { - CV_UNUSED(I); - } - }; - template <> struct Tex_I<3, float> - { - static __device__ __forceinline__ float3 read(float x, float y) - { - return make_float3(0, 0, 0); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz I) - { - CV_UNUSED(I); - } - }; - // ****************** 4 channel specializations ************************ - - template <> struct Tex_I<4, uchar> - { - static __device__ __forceinline__ float4 read(float x, float y) - { - return tex2D(tex_I8UC4, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& I) - { - bindTexture(&tex_I8UC4, I); - } - }; - template <> struct Tex_I<4, ushort> - { - static __device__ __forceinline__ float4 read(float x, float y) - { - return tex2D(tex_I16UC4, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& I) - { - bindTexture(&tex_I16UC4, I); - } - }; - template <> struct Tex_I<4, float> - { - static __device__ __forceinline__ float4 read(float x, float y) - { - return tex2D(tex_If4, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& I) - { - bindTexture(&tex_If4, I); - } - }; - // ************* J *************** - template struct Tex_J - { - static __host__ __forceinline__ void bindTexture_(PtrStepSz::vec_type>& J) - { - CV_UNUSED(J); - } - }; - template <> struct Tex_J<1, uchar> - { - static __device__ __forceinline__ float read(float x, float y) - { - return tex2D(tex_J8U, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& J) - { - bindTexture(&tex_J8U, J); - } - }; - template <> struct Tex_J<1, float> - { - static __device__ __forceinline__ float read(float x, float y) - { - return tex2D(tex_Jf, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& J) - { - bindTexture(&tex_Jf, J); - } - }; - // ************* 4 channel specializations *************** - template <> struct Tex_J<4, uchar> - { - static __device__ __forceinline__ float4 read(float x, float y) - { - return tex2D(tex_J8UC4, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& J) - { - bindTexture(&tex_J8UC4, J); - } - }; - template <> struct Tex_J<4, ushort> - { - static __device__ __forceinline__ float4 read(float x, float y) - { - return tex2D(tex_J16UC4, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& J) - { - bindTexture(&tex_J16UC4, J); - } - }; - template <> struct Tex_J<4, float> - { - static __device__ __forceinline__ float4 read(float x, float y) - { - return tex2D(tex_Jf4, x, y); - } - static __host__ __forceinline__ void bindTexture_(PtrStepSz& J) - { - bindTexture(&tex_Jf4, J); - } - }; - __device__ __forceinline__ void accum(float& dst, const float& val) { dst += val; @@ -364,8 +145,8 @@ namespace pyrlk } }; - template - __global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols) + template + __global__ void sparseKernel(const Ptr2D texI, const Ptr2D texJ, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols) { #if __CUDA_ARCH__ <= 110 const int BLOCK_SIZE = 128; @@ -413,15 +194,14 @@ namespace pyrlk float x = prevPt.x + xBase + 0.5f; float y = prevPt.y + yBase + 0.5f; - I_patch[i][j] = Tex_I::read(x, y); + I_patch[i][j] = texI(y, x); // Scharr Deriv + work_type dIdx = 3.0f * texI(y - 1, x + 1) + 10.0f * texI(y, x + 1) + 3.0f * texI(y + 1, x + 1) - + (3.0f * texI(y - 1, x - 1) + 10.0f * texI(y, x - 1) + 3.0f * texI(y + 1, x - 1)); - work_type dIdx = 3.0f * Tex_I::read(x+1, y-1) + 10.0f * Tex_I::read(x+1, y) + 3.0f * Tex_I::read(x+1, y+1) - - (3.0f * Tex_I::read(x-1, y-1) + 10.0f * Tex_I::read(x-1, y) + 3.0f * Tex_I::read(x-1, y+1)); - - work_type dIdy = 3.0f * Tex_I::read(x-1, y+1) + 10.0f * Tex_I::read(x, y+1) + 3.0f * Tex_I::read(x+1, y+1) - - (3.0f * Tex_I::read(x-1, y-1) + 10.0f * Tex_I::read(x, y-1) + 3.0f * Tex_I::read(x+1, y-1)); + work_type dIdy = 3.0f * texI(y + 1, x - 1) + 10.0f * texI(y + 1, x) + 3.0f * texI(y + 1, x + 1) - + (3.0f * texI(y - 1, x - 1) + 10.0f * texI(y - 1, x) + 3.0f * texI(y - 1, x + 1)); dIdx_patch[i][j] = dIdx; dIdy_patch[i][j] = dIdy; @@ -490,7 +270,8 @@ namespace pyrlk for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) { work_type I_val = I_patch[i][j]; - work_type J_val = Tex_J::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f); + + work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f); work_type diff = (J_val - I_val) * 32.0f; @@ -533,7 +314,8 @@ namespace pyrlk for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) { work_type I_val = I_patch[i][j]; - work_type J_val = Tex_J::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f); + + work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f); work_type diff = J_val - I_val; @@ -749,6 +531,27 @@ namespace pyrlk } } // __global__ void sparseKernel_ + // Specialization for non float data, cudaFilterModeLinear only compatible with cudaReadModeNormalizedFloat. + template class TextureLinear : public cv::cudev::Texture::vec_type, typename TypeVec::vec_type> { + public: + typedef typename TypeVec::vec_type elem_type; + typedef typename TypeVec::vec_type ret_type; + __host__ TextureLinear(PtrStepSz src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) : + cv::cudev::Texture(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeNormalizedFloat) + { + } + }; + + // Specialization for float data, cudaReadModeNormalizedFloat only compatible with cudaReadModeElementType. + template class TextureLinear : public cv::cudev::Texture::vec_type, typename TypeVec::vec_type> + { + public: + typedef typename TypeVec::vec_type float_type; + __host__ TextureLinear(PtrStepSz src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) : + cv::cudev::Texture (src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeElementType) + { + } + }; template class sparse_caller { @@ -756,16 +559,16 @@ namespace pyrlk static void call(PtrStepSz::vec_type> I, PtrStepSz::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, int level, dim3 block, cudaStream_t stream) { + typedef typename TypeVec::vec_type dType; + typedef typename TypeVec::vec_type rType; + TextureLinear texI(I); + TextureLinear texJ(J); dim3 grid(ptcount); - CV_UNUSED(I); - CV_UNUSED(J); if (level == 0 && err) - sparseKernel <<>>(prevPts, nextPts, status, err, level, rows, cols); + sparseKernel> << > > (texI, texJ, prevPts, nextPts, status, err, level, rows, cols); else - sparseKernel <<>>(prevPts, nextPts, status, err, level, rows, cols); - + sparseKernel> << > > (texI, texJ, prevPts, nextPts, status, err, level, rows, cols); cudaSafeCall(cudaGetLastError()); - if (stream == 0) cudaSafeCall(cudaDeviceSynchronize()); } @@ -903,8 +706,8 @@ namespace pyrlk }; - template - __global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols) + template + __global__ void denseKernel(const Ptr2D texI, const Ptr2D texJ, PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols) { extern __shared__ int smem[]; @@ -925,15 +728,15 @@ namespace pyrlk float x = xBase - c_halfWin_x + j + 0.5f; float y = yBase - c_halfWin_y + i + 0.5f; - I_patch[i * patchWidth + j] = tex2D(tex_If, x, y); + I_patch[i * patchWidth + j] = texI(y, x); // Scharr Deriv - dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x+1, y-1) + 10 * tex2D(tex_If, x+1, y) + 3 * tex2D(tex_If, x+1, y+1) - - (3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x-1, y) + 3 * tex2D(tex_If, x-1, y+1)); + dIdx_patch[i * patchWidth + j] = 3 * texI(y - 1, x + 1) + 10 * texI(y, x + 1) + 3 * texI(y + 1, x + 1) - + (3 * texI(y - 1, x - 1) + 10 * texI(y, x - 1) + 3 * texI(y + 1, x - 1)); - dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x-1, y+1) + 10 * tex2D(tex_If, x, y+1) + 3 * tex2D(tex_If, x+1, y+1) - - (3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x, y-1) + 3 * tex2D(tex_If, x+1, y-1)); + dIdy_patch[i * patchWidth + j] = 3 * texI(y + 1, x - 1) + 10 * texI(y + 1,x) + 3 * texI(y+ 1, x + 1) - + (3 * texI(y - 1, x - 1) + 10 * texI(y - 1,x) + 3 * texI(y - 1, x + 1)); } } @@ -1004,7 +807,7 @@ namespace pyrlk for (int j = 0; j < c_winSize_x; ++j) { int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; - int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); + int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f); int diff = (J - I) * 32; @@ -1040,7 +843,8 @@ namespace pyrlk for (int j = 0; j < c_winSize_x; ++j) { int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; - int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); + + int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f); errval += ::abs(J - I); } @@ -1109,9 +913,6 @@ namespace pyrlk { sparse_caller::call, sparse_caller::call, sparse_caller::call, sparse_caller::call, sparse_caller::call } }; - Tex_I::bindTexture_(I); - Tex_J::bindTexture_(J); - funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount, level, block, stream); } @@ -1119,9 +920,8 @@ namespace pyrlk { dim3 block(16, 16); dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y)); - Tex_I<1, T>::bindTexture_(I); - Tex_J<1, T>::bindTexture_(J); - + TextureLinear<1, T> texI(I); + TextureLinear<1, T> texJ(J); int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); const int patchWidth = block.x + 2 * halfWin.x; const int patchHeight = block.y + 2 * halfWin.y; @@ -1129,12 +929,12 @@ namespace pyrlk if (err.data) { - denseKernel << > >(u, v, prevU, prevV, err, I.rows, I.cols); + denseKernel> << > >(texI, texJ, u, v, prevU, prevV, err, I.rows, I.cols); cudaSafeCall(cudaGetLastError()); } else { - denseKernel << > >(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols); + denseKernel> << > >(texI, texJ, u, v, prevU, prevV, PtrStepf(), I.rows, I.cols); cudaSafeCall(cudaGetLastError()); } diff --git a/modules/cudaoptflow/src/cuda/tvl1flow.cu b/modules/cudaoptflow/src/cuda/tvl1flow.cu index 7ee7b36e096..cc73d463197 100644 --- a/modules/cudaoptflow/src/cuda/tvl1flow.cu +++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu @@ -46,6 +46,7 @@ #include "opencv2/core/cuda/border_interpolate.hpp" #include "opencv2/core/cuda/limits.hpp" #include "opencv2/core/cuda.hpp" +#include using namespace cv::cuda; using namespace cv::cuda::device; @@ -102,63 +103,8 @@ namespace tvl1flow } } - struct SrcTex - { - virtual ~SrcTex() {} - - __device__ __forceinline__ virtual float I1(float x, float y) const = 0; - __device__ __forceinline__ virtual float I1x(float x, float y) const = 0; - __device__ __forceinline__ virtual float I1y(float x, float y) const = 0; - }; - - texture tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp); - texture tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp); - texture tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp); - struct SrcTexRef : SrcTex - { - __device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE - { - return tex2D(tex_I1, x, y); - } - __device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE - { - return tex2D(tex_I1x, x, y); - } - __device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE - { - return tex2D(tex_I1y, x, y); - } - }; - - struct SrcTexObj : SrcTex - { - __host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_) - : tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {} - - __device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE - { - return tex2D(tex_obj_I1, x, y); - } - __device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE - { - return tex2D(tex_obj_I1x, x, y); - } - __device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE - { - return tex2D(tex_obj_I1y, x, y); - } - - cudaTextureObject_t tex_obj_I1; - cudaTextureObject_t tex_obj_I1x; - cudaTextureObject_t tex_obj_I1y; - }; - - template < - typename T, - typename = typename std::enable_if::value>::type - > __global__ void warpBackwardKernel( - const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2, + const PtrStepSzf I0, const cv::cudev::TexturePtr I1, const cv::cudev::TexturePtr I1x, const cv::cudev::TexturePtr I1y, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho) { const int x = blockIdx.x * blockDim.x + threadIdx.x; @@ -189,11 +135,9 @@ namespace tvl1flow for (int cx = xmin; cx <= xmax; ++cx) { const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy); - - sum += w * src.I1(cx, cy); - sumx += w * src.I1x(cx, cy); - sumy += w * src.I1y(cx, cy); - + sum += w * I1(cy, cx); + sumx += w * I1x(cy, cx); + sumy += w * I1y(cy, cx); wsum += w; } } @@ -224,49 +168,14 @@ namespace tvl1flow PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho, cudaStream_t stream) { + cv::cudev::Texture texI1(I1); + cv::cudev::Texture texI1x(I1x); + cv::cudev::Texture texI1y(I1y); const dim3 block(32, 8); const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y)); - - bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30); - - if (cc30) - { - cudaTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = cudaAddressModeClamp; - texDesc.addressMode[1] = cudaAddressModeClamp; - texDesc.addressMode[2] = cudaAddressModeClamp; - - cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0; - - createTextureObjectPitch2D(&texObj_I1, I1, texDesc); - createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc); - createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc); - - warpBackwardKernel << > > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho); - cudaSafeCall(cudaGetLastError()); - - if (!stream) - cudaSafeCall(cudaDeviceSynchronize()); - else - cudaSafeCall(cudaStreamSynchronize(stream)); - - cudaSafeCall(cudaDestroyTextureObject(texObj_I1)); - cudaSafeCall(cudaDestroyTextureObject(texObj_I1x)); - cudaSafeCall(cudaDestroyTextureObject(texObj_I1y)); - } - else - { - bindTexture(&tex_I1, I1); - bindTexture(&tex_I1x, I1x); - bindTexture(&tex_I1y, I1y); - - warpBackwardKernel << > > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho); - cudaSafeCall(cudaGetLastError()); - - if (!stream) - cudaSafeCall(cudaDeviceSynchronize()); - } + warpBackwardKernel<< > > (I0, texI1, texI1x, texI1y , u1, u2, I1w, I1wx, I1wy, grad, rho); + if (!stream) + cudaSafeCall(cudaDeviceSynchronize()); } } diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index 348556060d1..73df35ff63d 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -43,8 +43,10 @@ #if !defined CUDA_DISABLER #include "opencv2/core/cuda/common.hpp" +#include #include + namespace cv { namespace cuda { namespace device { namespace stereobm @@ -601,13 +603,12 @@ namespace cv { namespace cuda { namespace device /////////////////////////////////// Textureness filtering //////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// - texture texForTF; - - __device__ __forceinline__ float sobel(int x, int y) + __device__ __forceinline__ float sobel(cv::cudev::TexturePtr texSrc, int x, int y) { - float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) + - tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) + - tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1); + float conv = texSrc(y - 1, x - 1) * (-1) + texSrc(y - 1, x + 1) * (1) + + texSrc(y, x - 1) * (-2) + texSrc(y, x + 1) * (2) + + texSrc(y + 1, x - 1) * (-1) + texSrc(y + 1, x + 1) * (1); + return fabs(conv); } @@ -635,7 +636,7 @@ namespace cv { namespace cuda { namespace device #define RpT (2 * ROWSperTHREAD) // got experimentally - __global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold) + __global__ void textureness_kernel(cv::cudev::TexturePtr texSrc, PtrStepSzb disp, int winsz, float threshold) { int winsz2 = winsz/2; int n_dirty_pixels = (winsz2) * 2; @@ -657,9 +658,9 @@ namespace cv { namespace cuda { namespace device for(int i = y - winsz2; i <= y + winsz2; ++i) { - sum += sobel(x - winsz2, i); + sum += sobel(texSrc, x - winsz2, i); if (cols_extra) - sum_extra += sobel(x + blockDim.x - winsz2, i); + sum_extra += sobel(texSrc, x + blockDim.x - winsz2, i); } *cols = sum; if (cols_extra) @@ -675,12 +676,12 @@ namespace cv { namespace cuda { namespace device for(int y = beg_row + 1; y < end_row; ++y) { - sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2); + sum = sum - sobel(texSrc, x - winsz2, y - winsz2 - 1) + sobel(texSrc, x - winsz2, y + winsz2); *cols = sum; if (cols_extra) { - sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2); + sum_extra = sum_extra - sobel(texSrc, x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(texSrc, x + blockDim.x - winsz2, y + winsz2); *cols_extra = sum_extra; } @@ -697,28 +698,16 @@ namespace cv { namespace cuda { namespace device void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream) { avgTexturenessThreshold *= winsz * winsz; - - texForTF.filterMode = cudaFilterModeLinear; - texForTF.addressMode[0] = cudaAddressModeWrap; - texForTF.addressMode[1] = cudaAddressModeWrap; - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) ); - + cv::cudev::Texture tex(input, false, cudaFilterModeLinear, cudaAddressModeWrap, cudaReadModeNormalizedFloat); dim3 threads(128, 1, 1); dim3 grid(1, 1, 1); - grid.x = divUp(input.cols, threads.x); grid.y = divUp(input.rows, RpT); - size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float); - textureness_kernel<<>>(disp, winsz, avgTexturenessThreshold); + textureness_kernel<<>>(tex, disp, winsz, avgTexturenessThreshold); cudaSafeCall( cudaGetLastError() ); - if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); - - cudaSafeCall( cudaUnbindTexture (texForTF) ); } } // namespace stereobm }}} // namespace cv { namespace cuda { namespace cudev diff --git a/modules/cudawarping/src/cuda/remap.cu b/modules/cudawarping/src/cuda/remap.cu index 79f155ddfb9..8aeaef2d4e6 100644 --- a/modules/cudawarping/src/cuda/remap.cu +++ b/modules/cudawarping/src/cuda/remap.cu @@ -48,6 +48,7 @@ #include "opencv2/core/cuda/vec_math.hpp" #include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/filters.hpp" +#include namespace cv { namespace cuda { namespace device { @@ -108,88 +109,96 @@ namespace cv { namespace cuda { namespace device } }; - #define OPENCV_CUDA_IMPLEMENT_REMAP_TEX(type) \ - texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \ - struct tex_remap_ ## type ## _reader \ - { \ - typedef type elem_type; \ - typedef int index_type; \ - int xoff, yoff; \ - tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \ - __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \ - { \ - return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \ - } \ - }; \ - template