From d738c388e08acfe35e4cc45dec8863238e62ef31 Mon Sep 17 00:00:00 2001
From: James Bowley <12133430+cudawarped@users.noreply.github.com>
Date: Sat, 15 Oct 2022 10:03:00 +0300
Subject: [PATCH] Replace all instances of texture references with texture
 objects using the existing updated cv::cudev::Texture class. Fixes bugs in
 cv::cuda::demosaicing, cv::cuda::resize and cv::cuda::HoughSegmentDetector.

---
 modules/cudaimgproc/src/cuda/canny.cu         | 217 +--------
 modules/cudaimgproc/src/cuda/corners.cu       |  55 +--
 modules/cudaimgproc/src/cuda/debayer.cu       |  48 +-
 .../cudaimgproc/src/cuda/hough_segments.cu    |  25 +-
 modules/cudaimgproc/src/cuda/mean_shift.cu    |  39 +-
 modules/cudaimgproc/test/test_color.cpp       |  18 +-
 modules/cudaimgproc/test/test_hough.cpp       |  64 ++-
 modules/cudaimgproc/test/test_precomp.hpp     |   2 +
 .../include/opencv2/cudalegacy/NCV.hpp        |   4 +-
 .../opencv2/cudalegacy/NPP_staging.hpp        |  10 +-
 .../cudalegacy/src/cuda/NCVBroxOpticalFlow.cu | 363 ++++-----------
 .../src/cuda/NCVHaarObjectDetection.cu        | 352 ++++----------
 modules/cudalegacy/src/cuda/NPP_staging.cu    | 428 +++++------------
 modules/cudalegacy/src/cuda/bm.cu             |  24 +-
 .../cudalegacy/test/TestHypothesesGrow.cpp    |   3 +-
 modules/cudaobjdetect/src/cuda/hog.cu         |  41 +-
 modules/cudaobjdetect/test/test_objdetect.cpp |  13 +-
 modules/cudaoptflow/src/cuda/pyrlk.cu         | 306 +++----------
 modules/cudaoptflow/src/cuda/tvl1flow.cu      | 113 +----
 modules/cudastereo/src/cuda/stereobm.cu       |  39 +-
 modules/cudawarping/src/cuda/remap.cu         | 177 ++++----
 modules/cudawarping/src/cuda/resize.cu        | 108 ++---
 modules/cudawarping/src/cuda/warp.cu          | 149 ++----
 modules/cudawarping/test/test_precomp.hpp     |   2 +
 modules/cudawarping/test/test_resize.cpp      |  54 +++
 .../include/opencv2/cudev/ptr2d/texture.hpp   | 429 +++++++++---------
 modules/xfeatures2d/src/cuda/surf.cu          | 140 +++---
 modules/xfeatures2d/src/surf.cuda.cpp         |  34 +-
 28 files changed, 1065 insertions(+), 2192 deletions(-)
diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu
index 99a4f72a8fe..61ea11ee322 100644
--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -48,6 +48,7 @@
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/utility.hpp"
 #include "opencv2/core/cuda.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -90,47 +91,8 @@ namespace cv { namespace cuda { namespace device
 
 namespace canny
 {
-    struct SrcTex
-    {
-        virtual ~SrcTex() {}
-
-        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
-
-        __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0;
-
-        int xoff;
-        int yoff;
-    };
-
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
-    struct SrcTexRef : SrcTex
-    {
-        __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}
-
-        __device__ __forceinline__ int operator ()(int y, int x) const override
-        {
-            return tex2D(tex_src, x + xoff, y + yoff);
-        }
-    };
-
-    struct SrcTexObj : SrcTex
-    {
-        __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }
-
-        __device__ __forceinline__ int operator ()(int y, int x) const override
-        {
-            return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
-        }
-
-        cudaTextureObject_t tex_src_object;
-    };
-
-    template <
-        class T,
-        class Norm,
-        typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
-    >
-    __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    template <class Norm>
+    __global__ void calcMagnitudeKernel(cv::cudev::TextureOffPtr<uchar> texSrc, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -138,8 +100,8 @@ namespace canny
         if (y >= mag.rows || x >= mag.cols)
             return;
 
-        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
-        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
+        int dxVal = (texSrc(y - 1, x + 1) + 2 * texSrc(y, x + 1) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y, x - 1) + texSrc(y + 1, x - 1));
+        int dyVal = (texSrc(y + 1, x - 1) + 2 * texSrc(y + 1, x) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y - 1, x) + texSrc(y - 1, x + 1));
 
         dx(y, x) = dxVal;
         dy(y, x) = dyVal;
@@ -151,63 +113,20 @@ namespace canny
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
-
-        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
-
-        if (cc30)
+        cv::cudev::TextureOff<uchar> texSrc(srcWhole, yoff, xoff);
+        if (L2Grad)
         {
-            cudaTextureDesc texDesc;
-            memset(&texDesc, 0, sizeof(texDesc));
-            texDesc.addressMode[0] = cudaAddressModeClamp;
-            texDesc.addressMode[1] = cudaAddressModeClamp;
-            texDesc.addressMode[2] = cudaAddressModeClamp;
-
-            cudaTextureObject_t tex = 0;
-            createTextureObjectPitch2D(&tex, srcWhole, texDesc);
-
-            SrcTexObj src(xoff, yoff, tex);
-
-            if (L2Grad)
-            {
-                L2 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            else
-            {
-                L1 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
-            else
-                cudaSafeCall( cudaStreamSynchronize(stream) );
-
-            cudaSafeCall( cudaDestroyTextureObject(tex) );
+            L2 norm; // norm type used when L2Grad is set
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
         }
         else
         {
-            bindTexture(&tex_src, srcWhole);
-            SrcTexRef src(xoff, yoff);
-
-            if (L2Grad)
-            {
-                L2 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            else
-            {
-                L1 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
+            L1 norm; // norm type used when L2Grad is not set
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
         }
+        cudaSafeCall( cudaGetLastError() ); // a kernel launch returns no status itself, so query it explicitly
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 
     void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
@@ -229,8 +148,7 @@ namespace canny
 
 namespace canny
 {
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
-    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
+    __global__ void calcMapKernel(cv::cudev::TexturePtr<float> texMag, const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
     {
         const int CANNY_SHIFT = 15;
         const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
@@ -245,7 +163,7 @@ namespace canny
         int dyVal = dy(y, x);
 
         const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
-        const float m = tex2D(tex_mag, x, y);
+        const float m = texMag(y, x);
 
         dxVal = ::abs(dxVal);
         dyVal = ::abs(dyVal);
@@ -264,69 +182,17 @@ namespace canny
 
             if (dyVal < tg22x)
             {
-                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
+                if (m > texMag(y, x - 1) && m >= texMag(y, x + 1))
                     edge_type = 1 + (int)(m > high_thresh);
             }
             else if(dyVal > tg67x)
             {
-                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
+                if (m > texMag(y - 1, x) && m >= texMag(y + 1, x))
                     edge_type = 1 + (int)(m > high_thresh);
             }
             else
             {
-                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-        }
-
-        map(y, x) = edge_type;
-    }
-
-    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
-    {
-        const int CANNY_SHIFT = 15;
-        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
-            return;
-
-        int dxVal = dx(y, x);
-        int dyVal = dy(y, x);
-
-        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
-        const float m = tex2D<float>(tex_mag, x, y);
-
-        dxVal = ::abs(dxVal);
-        dyVal = ::abs(dyVal);
-
-        // 0 - the pixel can not belong to an edge
-        // 1 - the pixel might belong to an edge
-        // 2 - the pixel does belong to an edge
-        int edge_type = 0;
-
-        if (m > low_thresh)
-        {
-            const int tg22x = dxVal * TG22;
-            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
-
-            dyVal <<= CANNY_SHIFT;
-
-            if (dyVal < tg22x)
-            {
-                if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-            else if(dyVal > tg67x)
-            {
-                if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-            else
-            {
-                if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
+                if (m > texMag(y - 1, x - s) && m >= texMag(y + 1, x + s))
                     edge_type = 1 + (int)(m > high_thresh);
             }
         }
@@ -338,47 +204,10 @@ namespace canny
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
-
-        if (deviceSupports(FEATURE_SET_COMPUTE_30))
-        {
-            // Use the texture object
-            cudaResourceDesc resDesc;
-            memset(&resDesc, 0, sizeof(resDesc));
-            resDesc.resType = cudaResourceTypePitch2D;
-            resDesc.res.pitch2D.devPtr = mag.ptr();
-            resDesc.res.pitch2D.height = mag.rows;
-            resDesc.res.pitch2D.width = mag.cols;
-            resDesc.res.pitch2D.pitchInBytes = mag.step;
-            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
-
-            cudaTextureDesc texDesc;
-            memset(&texDesc, 0, sizeof(texDesc));
-            texDesc.addressMode[0] = cudaAddressModeClamp;
-            texDesc.addressMode[1] = cudaAddressModeClamp;
-            texDesc.addressMode[2] = cudaAddressModeClamp;
-
-            cudaTextureObject_t tex=0;
-            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
-            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
-            else
-                cudaSafeCall( cudaStreamSynchronize(stream) );
-
-            cudaSafeCall( cudaDestroyTextureObject(tex) );
-        }
-        else
-        {
-            // Use the texture reference
-            bindTexture(&tex_mag, mag);
-            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
+        cv::cudev::Texture<float> texMag(mag); // texture view of the magnitude image, read by the kernel as texMag(y, x)
+        calcMapKernel<<<grid, block, 0, stream>>>(texMag, dx, dy, map, low_thresh, high_thresh);
+        cudaSafeCall( cudaGetLastError() ); // a kernel launch returns no status itself, so query it explicitly
+        if (stream == NULL) cudaSafeCall( cudaDeviceSynchronize() ); // block only when running on the default stream
     }
 }
 
diff --git a/modules/cudaimgproc/src/cuda/corners.cu b/modules/cudaimgproc/src/cuda/corners.cu
index 92a37e6fde1..2f3452648ca 100644
--- a/modules/cudaimgproc/src/cuda/corners.cu
+++ b/modules/cudaimgproc/src/cuda/corners.cu
@@ -47,6 +47,7 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 #include "opencv2/opencv_modules.hpp"
 
@@ -58,10 +59,7 @@ namespace cv { namespace cuda { namespace device
     {
         /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
 
-        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
+        __global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -81,8 +79,8 @@ namespace cv { namespace cuda { namespace device
                 {
                     for (int j = jbegin; j < jend; ++j)
                     {
-                        float dx = tex2D(harrisDxTex, j, i);
-                        float dy = tex2D(harrisDyTex, j, i);
+                        float dx = texDx(i, j);
+                        float dy = texDy(i, j);
 
                         a += dx * dx;
                         b += dx * dy;
@@ -95,7 +93,7 @@ namespace cv { namespace cuda { namespace device
         }
 
         template <typename BR, typename BC>
-        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
+        __global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -119,8 +117,8 @@ namespace cv { namespace cuda { namespace device
                     {
                         const int x = border_row.idx_col(j);
 
-                        float dx = tex2D(harrisDxTex, x, y);
-                        float dy = tex2D(harrisDyTex, x, y);
+                        float dx = texDx(y, x);
+                        float dy = texDy(y, x);
 
                         a += dx * dx;
                         b += dx * dy;
@@ -136,22 +134,20 @@ namespace cv { namespace cuda { namespace device
         {
             dim3 block(32, 8);
             dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
-
-            bindTexture(&harrisDxTex, Dx);
-            bindTexture(&harrisDyTex, Dy);
-
+            cv::cudev::Texture<float> texDx(Dx);
+            cv::cudev::Texture<float> texDy(Dy);
             switch (border_type)
             {
             case BORDER_REFLECT101:
-                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
+                cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
                 break;
 
             case BORDER_REFLECT:
-                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
+                cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
                 break;
 
             case BORDER_REPLICATE:
-                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
+                cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst);
                 break;
             }
 
@@ -163,10 +159,7 @@ namespace cv { namespace cuda { namespace device
 
         /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
 
-        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
+        __global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -186,8 +179,8 @@ namespace cv { namespace cuda { namespace device
                 {
                     for (int j = jbegin; j < jend; ++j)
                     {
-                        float dx = tex2D(minEigenValDxTex, j, i);
-                        float dy = tex2D(minEigenValDyTex, j, i);
+                        float dx = texMinEigenValDx(i, j);
+                        float dy = texMinEigenValDy(i, j);
 
                         a += dx * dx;
                         b += dx * dy;
@@ -204,7 +197,7 @@ namespace cv { namespace cuda { namespace device
 
 
         template <typename BR, typename BC>
-        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
+        __global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -228,8 +221,8 @@ namespace cv { namespace cuda { namespace device
                     {
                         int x = border_row.idx_col(j);
 
-                        float dx = tex2D(minEigenValDxTex, x, y);
-                        float dy = tex2D(minEigenValDyTex, x, y);
+                        float dx = texMinEigenValDx(y, x);
+                        float dy = texMinEigenValDy(y, x);
 
                         a += dx * dx;
                         b += dx * dy;
@@ -248,22 +241,20 @@ namespace cv { namespace cuda { namespace device
         {
             dim3 block(32, 8);
             dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
-
-            bindTexture(&minEigenValDxTex, Dx);
-            bindTexture(&minEigenValDyTex, Dy);
-
+            cv::cudev::Texture<float> texMinEigenValDx(Dx);
+            cv::cudev::Texture<float> texMinEigenValDy(Dy);
             switch (border_type)
             {
             case BORDER_REFLECT101:
-                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
+                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
                 break;
 
             case BORDER_REFLECT:
-                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
+                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
                 break;
 
             case BORDER_REPLICATE:
-                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
+                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst);
                 break;
             }
 
diff --git a/modules/cudaimgproc/src/cuda/debayer.cu b/modules/cudaimgproc/src/cuda/debayer.cu
index 0da78139807..1c4ee391421 100644
--- a/modules/cudaimgproc/src/cuda/debayer.cu
+++ b/modules/cudaimgproc/src/cuda/debayer.cu
@@ -48,6 +48,7 @@
 #include "opencv2/core/cuda/limits.hpp"
 #include "opencv2/core/cuda/color.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/cudev/ptr2d/texture.hpp"
 
 namespace cv { namespace cuda { namespace device
 {
@@ -389,10 +390,8 @@ namespace cv { namespace cuda { namespace device
     //
     // ported to CUDA
 
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-    template <typename DstType>
-    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
+    template <typename DstType, class Ptr2D>
+    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, Ptr2D src, const int2 firstRed)
     {
         const float   kAx = -1.0f / 8.0f,     kAy = -1.5f / 8.0f,     kAz =  0.5f / 8.0f    /*kAw = -1.0f / 8.0f*/;
         const float   kBx =  2.0f / 8.0f,   /*kBy =  0.0f / 8.0f,*/ /*kBz =  0.0f / 8.0f,*/   kBw =  4.0f / 8.0f  ;
@@ -408,8 +407,8 @@ namespace cv { namespace cuda { namespace device
             return;
 
         int2 center;
-        center.x = x + sourceOffset.x;
-        center.y = y + sourceOffset.y;
+        center.x = x;
+        center.y = y;
 
         int4 xCoord;
         xCoord.x = center.x - 2;
@@ -423,25 +422,26 @@ namespace cv { namespace cuda { namespace device
         yCoord.z = center.y + 1;
         yCoord.w = center.y + 2;
 
-        float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
+        float C = src(center.y, center.x); // ( 0, 0)
 
         float4 Dvec;
-        Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
-        Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
-        Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
-        Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
+        Dvec.x = src(yCoord.y, xCoord.y); // (-1,-1)
+        Dvec.y = src(yCoord.z, xCoord.y); // (-1, 1)
+        Dvec.z = src(yCoord.y, xCoord.z); // ( 1,-1)
+        Dvec.w = src(yCoord.z, xCoord.z); // ( 1, 1)
+
 
         float4 value;
-        value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
-        value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
-        value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
-        value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
+        value.x = src(yCoord.x, center.x); // ( 0,-2) A0
+        value.y = src(yCoord.y, center.x); // ( 0,-1) B0
+        value.z = src(center.y, xCoord.x); // (-2, 0) E0
+        value.w = src(center.y, xCoord.y); // (-1, 0) F0
 
         // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
-        value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
-        value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
-        value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
-        value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
+        value.x += src(yCoord.w, center.x); // ( 0, 2) A1
+        value.y += src(yCoord.z, center.x); // ( 0, 1) B1
+        value.z += src(center.y, xCoord.w); // ( 2, 0) E1
+        value.w += src(center.y, xCoord.z); // ( 1, 0) F1
 
         float4 PATTERN;
         PATTERN.x = kCx * C;
@@ -527,9 +527,15 @@ namespace cv { namespace cuda { namespace device
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
 
-        bindTexture(&sourceTex, src);
+        if (sourceOffset.x || sourceOffset.y) {
+            cv::cudev::TextureOff<uchar> texSrc(src, sourceOffset.y, sourceOffset.x); // offsets go to the texture wrapper (y first, then x); the kernel no longer adds them
+            MHCdemosaic<dst_t, cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
+        }
+        else {
+            cv::cudev::Texture<uchar> texSrc(src); // zero offset: plain texture wrapper
+            MHCdemosaic<dst_t, cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
+        }
 
-        MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
diff --git a/modules/cudaimgproc/src/cuda/hough_segments.cu b/modules/cudaimgproc/src/cuda/hough_segments.cu
index 59eb78f6996..4774636ad77 100644
--- a/modules/cudaimgproc/src/cuda/hough_segments.cu
+++ b/modules/cudaimgproc/src/cuda/hough_segments.cu
@@ -50,7 +50,8 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough_segments
     {
-        __global__ void houghLinesProbabilistic(cv::cudev::Texture<uchar> src, const PtrStepSzi accum,
+        template<class Ptr2D>
+        __global__ void houghLinesProbabilistic(Ptr2D src, const PtrStepSzi accum,
                                                 int4* out, const int maxSize,
                                                 const float rho, const float theta,
                                                 const int lineGap, const int lineLength,
@@ -219,15 +220,18 @@ namespace cv { namespace cuda { namespace device
             const dim3 block(32, 8);
             const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
             
-            cv::cudev::GpuMat_<uchar> src_(mask);
-            cv::cudev::Texture<uchar> tex(src_, false, cudaFilterModePoint, cudaAddressModeClamp);
-
-            houghLinesProbabilistic<<<grid, block, 0, stream>>>(tex, accum,
-                                                     out, maxSize,
-                                                     rho, theta,
-                                                     lineGap, lineLength,
-                                                     mask.rows, mask.cols,
-                                                     counterPtr);
+            Size parentSize;
+            Point roiOrigin;
+            mask.locateROI(parentSize, roiOrigin); // fills in the parent size and the offset of mask within it
+            if (roiOrigin.x || roiOrigin.y) {
+                cv::cudev::TextureOff<uchar> texMask(parentSize.height, parentSize.width, mask.datastart, mask.step, roiOrigin.y, roiOrigin.x);
+                houghLinesProbabilistic<cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
+            }
+            else {
+                cv::cudev::Texture<uchar> texMask(mask); // no ROI offset: texture built directly from mask
+                houghLinesProbabilistic<cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
+            }
+
             cudaSafeCall( cudaGetLastError() );
 
             int totalCount;
@@ -236,7 +240,6 @@ namespace cv { namespace cuda { namespace device
             cudaSafeCall( cudaStreamSynchronize(stream) );
 
             totalCount = ::min(totalCount, maxSize);
-
             return totalCount;
         }
     }
diff --git a/modules/cudaimgproc/src/cuda/mean_shift.cu b/modules/cudaimgproc/src/cuda/mean_shift.cu
index 3b3b93f94e4..ef7497be5c8 100644
--- a/modules/cudaimgproc/src/cuda/mean_shift.cu
+++ b/modules/cudaimgproc/src/cuda/mean_shift.cu
@@ -47,19 +47,16 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 namespace cv { namespace cuda { namespace device
 {
     namespace imgproc
     {
-        texture<uchar4, 2> tex_meanshift;
-
-        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
-                                        size_t out_step, int cols, int rows,
-                                        int sp, int sr, int maxIter, float eps)
+        __device__ short2 do_mean_shift(cv::cudev::TexturePtr<uchar4> tex, int x0, int y0, unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps)
         {
             int isr2 = sr*sr;
-            uchar4 c = tex2D(tex_meanshift, x0, y0 );
+            uchar4 c = tex(y0, x0);
 
             // iterate meanshift procedure
             for( int iter = 0; iter < maxIter; iter++ )
@@ -79,7 +76,7 @@ namespace cv { namespace cuda { namespace device
                     int rowCount = 0;
                     for( int x = minx; x <= maxx; x++ )
                     {
-                        uchar4 t = tex2D( tex_meanshift, x, y );
+                        uchar4 t = tex(y, x);
 
                         int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
                         if( norm2 <= isr2 )
@@ -119,13 +116,13 @@ namespace cv { namespace cuda { namespace device
             return make_short2((short)x0, (short)y0);
         }
 
-        __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
+        __global__ void meanshift_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
         {
             int x0 = blockIdx.x * blockDim.x + threadIdx.x;
             int y0 = blockIdx.y * blockDim.y + threadIdx.y;
 
             if( x0 < cols && y0 < rows )
-                do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
+                do_mean_shift(tex, x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
         }
 
         void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
@@ -134,21 +131,15 @@ namespace cv { namespace cuda { namespace device
             dim3 threads(32, 8, 1);
             grid.x = divUp(src.cols, threads.x);
             grid.y = divUp(src.rows, threads.y);
-
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
-
-            meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
+            cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
+            meanshift_kernel<<< grid, threads, 0, stream >>>( tex, dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
             cudaSafeCall( cudaGetLastError() );
-
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
 
-        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
-                                             unsigned char* outsp, size_t outspstep,
-                                             int cols, int rows,
-                                             int sp, int sr, int maxIter, float eps)
+        __global__ void meanshiftproc_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* outr, size_t outrstep, unsigned char* outsp, size_t outspstep,
+            int cols, int rows,int sp, int sr, int maxIter, float eps)
         {
             int x0 = blockIdx.x * blockDim.x + threadIdx.x;
             int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@@ -156,7 +147,7 @@ namespace cv { namespace cuda { namespace device
             if( x0 < cols && y0 < rows )
             {
                 int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
-                *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
+                *(short2*)(outsp + basesp) = do_mean_shift(tex, x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
             }
         }
 
@@ -166,13 +157,9 @@ namespace cv { namespace cuda { namespace device
             dim3 threads(32, 8, 1);
             grid.x = divUp(src.cols, threads.x);
             grid.y = divUp(src.rows, threads.y);
-
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
-
-            meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
+            cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
+            meanshiftproc_kernel<<< grid, threads, 0, stream >>>( tex, dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
             cudaSafeCall( cudaGetLastError() );
-
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
diff --git a/modules/cudaimgproc/test/test_color.cpp b/modules/cudaimgproc/test/test_color.cpp
index 97be36a1210..1a8ff1fa0cb 100644
--- a/modules/cudaimgproc/test/test_color.cpp
+++ b/modules/cudaimgproc/test/test_color.cpp
@@ -2294,14 +2294,15 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CvtColor, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Demosaicing
 
-struct Demosaicing : testing::TestWithParam<cv::cuda::DeviceInfo>
+struct Demosaicing : testing::TestWithParam<testing::tuple<cv::cuda::DeviceInfo, bool>>
 {
     cv::cuda::DeviceInfo devInfo;
+    bool useRoi;
 
     virtual void SetUp()
     {
-        devInfo = GetParam();
-
+        devInfo = GET_PARAM(0);
+        useRoi = GET_PARAM(1);
         cv::cuda::setDevice(devInfo.deviceID());
     }
 
@@ -2419,7 +2420,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT)
     mosaic(img, src, cv::Point(1, 1));
 
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src,useRoi), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
 
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
@@ -2433,7 +2434,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT)
     mosaic(img, src, cv::Point(0, 1));
 
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
 
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
@@ -2447,7 +2448,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT)
     mosaic(img, src, cv::Point(0, 0));
 
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
 
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
@@ -2461,12 +2462,11 @@ CUDA_TEST_P(Demosaicing, BayerGR2BGR_MHT)
     mosaic(img, src, cv::Point(1, 0));
 
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
-
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
 
-INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, testing::Combine(ALL_DEVICES, WHOLE_SUBMAT));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // swapChannels
diff --git a/modules/cudaimgproc/test/test_hough.cpp b/modules/cudaimgproc/test/test_hough.cpp
index e6a05f578f6..023e1c50c7d 100644
--- a/modules/cudaimgproc/test/test_hough.cpp
+++ b/modules/cudaimgproc/test/test_hough.cpp
@@ -115,8 +115,20 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLines, testing::Combine(
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HoughLines Probabilistic
-PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
+PARAM_TEST_CASE(HoughLinesProbabilistic, DeviceInfo, Size, UseRoi)
 {
+    cv::cuda::DeviceInfo devInfo;
+    bool useRoi;
+    Size size;
+
+    virtual void SetUp() // copies the (DeviceInfo, Size, UseRoi) test parameters into the fixture members
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+        cv::cuda::setDevice(devInfo.deviceID()); // select the parameterised device for this fixture
+    }
+
     static void generateLines(cv::Mat& img)
     {
         img.setTo(cv::Scalar::all(0));
@@ -140,11 +152,6 @@ PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
 
 CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
 {
-    const cv::cuda::DeviceInfo devInfo = GET_PARAM(0);
-    cv::cuda::setDevice(devInfo.deviceID());
-    const cv::Size size = GET_PARAM(1);
-    const bool useRoi = GET_PARAM(2);
-
     const float rho = 1.0f;
     const float theta = (float) (1.0 * CV_PI / 180.0);
     const int minLineLength = 15;
@@ -169,12 +176,55 @@ CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
 
 }
 
+// Worker: calls detect() once per output slot on the given stream, then waits for that stream. Outputs must be mutable so detect() can allocate them.
+void HoughLinesProbabilisticThread(const Ptr<HoughSegmentDetector> detector, const GpuMat& imgIn, std::vector<GpuMat>& linesOut, Stream& stream) {
+    for (auto& lines : linesOut)
+        detector->detect(imgIn, lines, stream);
+    stream.waitForCompletion();
+}
+
+CUDA_TEST_P(HoughLinesProbabilistic, Async)
+{
+    constexpr int nThreads = 5;
+    constexpr int nIters = 5;
+    vector<Stream> streams(nThreads); // async test only
+    vector<GpuMat> imgsIn;
+    vector<Ptr<HoughSegmentDetector>> detectors;
+    vector<vector<GpuMat>> linesOut; // start empty: the loop below appends exactly one nIters-sized vector per thread
+    const float rho = 1.0f;
+    const float theta = (float)(1.0 * CV_PI / 180.0);
+    const int minLineLength = 15;
+    const int maxLineGap = 8;
+
+    cv::Mat src(size, CV_8UC1);
+    generateLines(src);
+    // Give every thread its own input image, detector and nIters output buffers.
+    for (int i = 0; i < nThreads; i++) {
+        imgsIn.push_back(loadMat(src, useRoi));
+        detectors.push_back(createHoughSegmentDetector(rho, theta, minLineLength, maxLineGap));
+        linesOut.push_back(vector<GpuMat>(nIters));
+    }
+    // Start every worker before joining any of them so the detect() calls can run concurrently.
+    vector<std::thread> thread(nThreads);
+    for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(HoughLinesProbabilisticThread, detectors.at(i), std::ref(imgsIn.at(i)), std::ref(linesOut.at(i)), std::ref(streams.at(i)));
+    for (int i = 0; i < nThreads; i++) thread.at(i).join();
+    // Each of the nThreads * nIters results must redraw to exactly the source image.
+    for (int i = 0; i < nThreads; i++) {
+        std::vector<cv::Vec4i> linesSegment;
+        for (const auto& line : linesOut.at(i)) {
+            line.download(linesSegment);
+            cv::Mat dst(size, CV_8UC1);
+            drawLines(dst, linesSegment);
+            ASSERT_MAT_NEAR(src, dst, 0.0);
+        }
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLinesProbabilistic, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     WHOLE_SUBMAT));
 
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HoughCircles
 
diff --git a/modules/cudaimgproc/test/test_precomp.hpp b/modules/cudaimgproc/test/test_precomp.hpp
index dd94f6f2856..e388fbdaa8a 100644
--- a/modules/cudaimgproc/test/test_precomp.hpp
+++ b/modules/cudaimgproc/test/test_precomp.hpp
@@ -49,4 +49,6 @@
 
 #include "cvconfig.h"
 
+#include <thread>
+
 #endif
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp
index d0ec6a42d6e..f03410dfbd0 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp
@@ -119,9 +119,9 @@ typedef               bool NcvBool;
 typedef          long long Ncv64s;
 
 #if defined(__APPLE__) && !defined(__CUDACC__)
-    typedef uint64_t Ncv64u;
+    typedef uint64 Ncv64u;
 #else
-    typedef unsigned long long Ncv64u;
+    typedef uint64 Ncv64u;
 #endif
 
 typedef                int Ncv32s;
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp
index 89e7f7cdea3..d9189eb20bb 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp
@@ -174,7 +174,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
  * \return NCV status code
  */
 CV_EXPORTS
-NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
                                         NcvSize32u srcSize,
                                         Ncv32u nSrcStep,
                                         Ncv32f *pDst,
@@ -182,7 +182,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
                                         Ncv32u nDstStep,
                                         NcvRect32u oROI,
                                         NppStBorderType borderType,
-                                        const Ncv32f *pKernel,
+                                        Ncv32f *pKernel,
                                         Ncv32s nKernelSize,
                                         Ncv32s nAnchor,
                                         Ncv32f multiplier);
@@ -208,7 +208,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
  * \return NCV status code
  */
 CV_EXPORTS
-NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
                                            NcvSize32u srcSize,
                                            Ncv32u nSrcStep,
                                            Ncv32f *pDst,
@@ -216,7 +216,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
                                            Ncv32u nDstStep,
                                            NcvRect32u oROI,
                                            NppStBorderType borderType,
-                                           const Ncv32f *pKernel,
+                                           Ncv32f *pKernel,
                                            Ncv32s nKernelSize,
                                            Ncv32s nAnchor,
                                            Ncv32f multiplier);
@@ -319,7 +319,7 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
  * \return NCV status code
  */
 CV_EXPORTS
-NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
                                NcvSize32u srcSize,
                                Ncv32u nSrcStep,
                                NcvRect32u srcROI,
diff --git a/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu b/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu
index 01914880248..a7f83c715d0 100644
--- a/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu
+++ b/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu
@@ -65,9 +65,12 @@
 
 #include "opencv2/cudalegacy/NPP_staging.hpp"
 #include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 
 typedef NCVVectorAlloc<Ncv32f> FloatVector;
+typedef cv::cudev::TexturePtr<float> Ptr2D;
+typedef cv::cudev::Texture<float> Texture;
 
 /////////////////////////////////////////////////////////////////////////////////////////
 // Implementation specific constants
@@ -84,39 +87,6 @@ inline int iDivUp(int a, int b)
     return (a + b - 1)/b;
 }
 
-/////////////////////////////////////////////////////////////////////////////////////////
-// Texture references
-/////////////////////////////////////////////////////////////////////////////////////////
-
-texture<float, 2, cudaReadModeElementType> tex_coarse;
-texture<float, 2, cudaReadModeElementType> tex_fine;
-
-texture<float, 2, cudaReadModeElementType> tex_I1;
-texture<float, 2, cudaReadModeElementType> tex_I0;
-
-texture<float, 2, cudaReadModeElementType> tex_Ix;
-texture<float, 2, cudaReadModeElementType> tex_Ixx;
-texture<float, 2, cudaReadModeElementType> tex_Ix0;
-
-texture<float, 2, cudaReadModeElementType> tex_Iy;
-texture<float, 2, cudaReadModeElementType> tex_Iyy;
-texture<float, 2, cudaReadModeElementType> tex_Iy0;
-
-texture<float, 2, cudaReadModeElementType> tex_Ixy;
-
-texture<float, 1, cudaReadModeElementType> tex_u;
-texture<float, 1, cudaReadModeElementType> tex_v;
-texture<float, 1, cudaReadModeElementType> tex_du;
-texture<float, 1, cudaReadModeElementType> tex_dv;
-texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
-texture<float, 1, cudaReadModeElementType> tex_numerator_u;
-texture<float, 1, cudaReadModeElementType> tex_numerator_v;
-texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
-texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
-texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
-texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
-
-
 /////////////////////////////////////////////////////////////////////////////////////////
 // SUPPLEMENTARY FUNCTIONS
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -265,8 +235,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const flo
 ///\param h number of rows in global memory array
 ///\param p global memory array pitch in floats
 ///////////////////////////////////////////////////////////////////////////////
-template<int tex_id>
-__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
+__forceinline__ __device__ void load_array_element(Ptr2D texSrc, float *smem, int is, int js, int i, int j, int w, int h, int p)
 {
     //position within shared memory array
     const int ijs = js * PSOR_PITCH + is;
@@ -276,20 +245,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
     j = max(j, -j-1);
     j = min(j, h-j+h-1);
     const int pos = j * p + i;
-    switch(tex_id){
-        case 0:
-            smem[ijs] = tex1Dfetch(tex_u, pos);
-            break;
-        case 1:
-            smem[ijs] = tex1Dfetch(tex_v, pos);
-            break;
-        case 2:
-            smem[ijs] = tex1Dfetch(tex_du, pos);
-            break;
-        case 3:
-            smem[ijs] = tex1Dfetch(tex_dv, pos);
-            break;
-    }
+    smem[ijs] = texSrc(pos);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -301,49 +257,48 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
 ///\param h number of rows in global memory array
 ///\param p global memory array pitch in floats
 ///////////////////////////////////////////////////////////////////////////////
-template<int tex>
-__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
+__forceinline__ __device__ void load_array(Ptr2D texSrc, float *smem, int ig, int jg, int w, int h, int p)
 {
     const int i = threadIdx.x + 2;
     const int j = threadIdx.y + 2;
-    load_array_element<tex>(smem, i, j, ig, jg, w, h, p);//load current pixel
+    load_array_element(texSrc, smem, i, j, ig, jg, w, h, p);//load current pixel
     __syncthreads();
     if(threadIdx.y < 2)
     {
         //load bottom shadow elements
-        load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
+        load_array_element(texSrc, smem, i, j-2, ig, jg-2, w, h, p);
         if(threadIdx.x < 2)
         {
             //load bottom right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
             //load middle right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
         }
         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
         {
             //load bottom left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
             //load middle left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
         }
     }
     else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
     {
         //load upper shadow elements
-        load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
+        load_array_element(texSrc, smem, i, j+2, ig, jg+2, w, h, p);
         if(threadIdx.x < 2)
         {
             //load upper right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
             //load middle right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
         }
         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
         {
             //load upper left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
             //load middle left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
         }
     }
     else
@@ -352,12 +307,12 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
         if(threadIdx.x < 2)
         {
             //load middle right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
         }
         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
         {
             //load middle left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
         }
     }
     __syncthreads();
@@ -382,13 +337,9 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
 /// \param alpha (in) alpha in Brox model (flow smoothness)
 /// \param gamma (in) gamma in Brox model (edge importance)
 ///////////////////////////////////////////////////////////////////////////////
-
-__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
-                                                        float *denominator_u, float *denominator_v,
-                                                        float *numerator_dudv,
-                                                        float *numerator_u, float *numerator_v,
-                                                        int w, int h, int s,
-                                                        float alpha, float gamma)
+__global__ void prepare_sor_stage_1_tex(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texI0, Ptr2D texI1, Ptr2D texIx, Ptr2D texIxx, Ptr2D texIx0, Ptr2D texIy, Ptr2D texIyy,
+    Ptr2D texIy0, Ptr2D texIxy, float *diffusivity_x, float *diffusivity_y, float *denominator_u, float *denominator_v, float *numerator_dudv, float *numerator_u, float *numerator_v,
+    int w, int h, int s, float alpha, float gamma)
 {
     __shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
     __shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
@@ -408,24 +359,24 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
     float x = (float)ig + 0.5f;
     float y = (float)jg + 0.5f;
     //load u  and v to smem
-    load_array<0>(u, ig, jg, w, h, s);
-    load_array<1>(v, ig, jg, w, h, s);
-    load_array<2>(du, ig, jg, w, h, s);
-    load_array<3>(dv, ig, jg, w, h, s);
+    load_array(texU, u, ig, jg, w, h, s);
+    load_array(texV, v, ig, jg, w, h, s);
+    load_array(texDu, du, ig, jg, w, h, s);
+    load_array(texDv, dv, ig, jg, w, h, s);
     //warped position
     float wx = (x + u[ijs])/(float)w;
     float wy = (y + v[ijs])/(float)h;
     x /= (float)w;
     y /= (float)h;
     //compute image derivatives
-    const float Iz  = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y);
-    const float Ix  = tex2D(tex_Ix, wx, wy);
-    const float Ixz = Ix - tex2D(tex_Ix0, x, y);
-    const float Ixy = tex2D(tex_Ixy, wx, wy);
-    const float Ixx = tex2D(tex_Ixx, wx, wy);
-    const float Iy  = tex2D(tex_Iy, wx, wy);
-    const float Iyz = Iy - tex2D(tex_Iy0, x, y);
-    const float Iyy = tex2D(tex_Iyy, wx, wy);
+    const float Iz = texI1(wy, wx) - texI0(y,x);
+    const float Ix  = texIx(wy, wx);
+    const float Ixz = Ix - texIx0(y, x);
+    const float Ixy = texIxy(wy, wx);
+    const float Ixx = texIxx(wy, wx);
+    const float Iy = texIy(wy, wx);
+    const float Iyz = Iy - texIy0(y, x);
+    const float Iyy = texIyy(wy, wx);
     //compute data term
     float q0, q1, q2;
     q0 = Iz  + Ix  * du[ijs] + Iy  * dv[ijs];
@@ -462,8 +413,7 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
 ///\param h
 ///\param s
 ///////////////////////////////////////////////////////////////////////////////
-__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v,
-                                    int w, int h, int s)
+__global__ void prepare_sor_stage_2(Ptr2D texDiffX, Ptr2D texDiffY, float *inv_denominator_u, float *inv_denominator_v, int w, int h, int s)
 {
     __shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
     __shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
@@ -486,8 +436,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
     }
     if(inside)
     {
-        sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg);
-        sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg);
+        sx[ijs] = texDiffX(ijg);
+        sy[ijs] = texDiffY(ijg);
     }
     else
     {
@@ -498,25 +448,17 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
     if(j == PSOR_TILE_HEIGHT-1)
     {
         if(jg < h-1 && inside)
-        {
-            sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
-        }
+            sy[up] = texDiffY(ijg + s);
         else
-        {
             sy[up] = 0.0f;
-        }
     }
     int right = ijs + 1;
     if(threadIdx.x == PSOR_TILE_WIDTH-1)
     {
         if(ig < w-1 && inside)
-        {
-            sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
-        }
+            sx[right] = texDiffX(ijg + 1);
         else
-        {
             sx[right] = 0.0f;
-        }
     }
     __syncthreads();
     float diffusivity_sum;
@@ -534,17 +476,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
 // Red-Black SOR
 /////////////////////////////////////////////////////////////////////////////////////////
 
-template<int isBlack> __global__ void sor_pass(float *new_du,
-                                               float *new_dv,
-                                               const float *g_inv_denominator_u,
-                                               const float *g_inv_denominator_v,
-                                               const float *g_numerator_u,
-                                               const float *g_numerator_v,
-                                               const float *g_numerator_dudv,
-                                               float omega,
-                                               int width,
-                                               int height,
-                                               int stride)
+template<int isBlack> __global__ void sor_pass(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texDiffX, Ptr2D texDiffY, float *new_du, float *new_dv, const float *g_inv_denominator_u,
+    const float *g_inv_denominator_v, const float *g_numerator_u, const float *g_numerator_v, const float *g_numerator_dudv, float omega, int width, int height, int stride)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     int j = blockIdx.y * blockDim.y + threadIdx.y;
@@ -560,14 +493,14 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
 
     //load smooth term
     float s_up, s_left, s_right, s_down;
-    s_left = tex1Dfetch(tex_diffusivity_x, pos);
-    s_down = tex1Dfetch(tex_diffusivity_y, pos);
+    s_left = texDiffX(pos);
+    s_down = texDiffY(pos);
     if(i < width-1)
-        s_right = tex1Dfetch(tex_diffusivity_x, pos_r);
+        s_right = texDiffX(pos_r);
     else
         s_right = 0.0f; //Neumann BC
     if(j < height-1)
-        s_up = tex1Dfetch(tex_diffusivity_y, pos_u);
+        s_up = texDiffY(pos_u);
     else
         s_up = 0.0f; //Neumann BC
 
@@ -577,30 +510,29 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
     float du_up, du_left, du_right, du_down, du;
     float dv_up, dv_left, dv_right, dv_down, dv;
 
-    u_left  = tex1Dfetch(tex_u, pos_l);
-    u_right = tex1Dfetch(tex_u, pos_r);
-    u_down  = tex1Dfetch(tex_u, pos_d);
-    u_up    = tex1Dfetch(tex_u, pos_u);
-    u       = tex1Dfetch(tex_u, pos);
-
-    v_left  = tex1Dfetch(tex_v, pos_l);
-    v_right = tex1Dfetch(tex_v, pos_r);
-    v_down  = tex1Dfetch(tex_v, pos_d);
-    v       = tex1Dfetch(tex_v, pos);
-    v_up    = tex1Dfetch(tex_v, pos_u);
-
-    du       = tex1Dfetch(tex_du, pos);
-    du_left  = tex1Dfetch(tex_du, pos_l);
-    du_right = tex1Dfetch(tex_du, pos_r);
-    du_down  = tex1Dfetch(tex_du, pos_d);
-    du_up    = tex1Dfetch(tex_du, pos_u);
-
-    dv       = tex1Dfetch(tex_dv, pos);
-    dv_left  = tex1Dfetch(tex_dv, pos_l);
-    dv_right = tex1Dfetch(tex_dv, pos_r);
-    dv_down  = tex1Dfetch(tex_dv, pos_d);
-    dv_up    = tex1Dfetch(tex_dv, pos_u);
-
+    u_left = texU(pos_l);
+    u_right = texU(pos_r);
+    u_down = texU(pos_d);
+    u_up = texU(pos_u);
+    u = texU(pos);
+
+    v_left = texV(pos_l);
+    v_right = texV(pos_r);
+    v_down = texV(pos_d);
+    v = texV(pos);
+    v_up = texV(pos_u);
+
+    du =  texDu(pos);
+    du_left = texDu(pos_l);
+    du_right = texDu(pos_r);
+    du_down = texDu(pos_d);
+    du_up = texDu(pos_u);
+
+    dv = texDv(pos);
+    dv_left = texDv(pos_l);
+    dv_right = texDv(pos_r);
+    dv_down = texDv(pos_d);
+    dv_up = texDv(pos_u);
     float numerator_dudv    = g_numerator_dudv[pos];
 
     if((i+j)%2 == isBlack)
@@ -624,52 +556,6 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
 ///////////////////////////////////////////////////////////////////////////////
 // utility functions
 ///////////////////////////////////////////////////////////////////////////////
-
-void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
-{
-    tex.addressMode[0] = cudaAddressModeClamp;
-    tex.filterMode = cudaFilterModePoint;
-    tex.normalized = false;
-}
-
-void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
-{
-    tex.addressMode[0] = cudaAddressModeMirror;
-    tex.addressMode[1] = cudaAddressModeMirror;
-    tex.filterMode = cudaFilterModeLinear;
-    tex.normalized = true;
-}
-
-void InitTextures()
-{
-    initTexture2D(tex_I0);
-    initTexture2D(tex_I1);
-    initTexture2D(tex_fine);      // for downsampling
-    initTexture2D(tex_coarse);    // for prolongation
-
-    initTexture2D(tex_Ix);
-    initTexture2D(tex_Ixx);
-    initTexture2D(tex_Ix0);
-
-    initTexture2D(tex_Iy);
-    initTexture2D(tex_Iyy);
-    initTexture2D(tex_Iy0);
-
-    initTexture2D(tex_Ixy);
-
-    initTexture1D(tex_u);
-    initTexture1D(tex_v);
-    initTexture1D(tex_du);
-    initTexture1D(tex_dv);
-    initTexture1D(tex_diffusivity_x);
-    initTexture1D(tex_diffusivity_y);
-    initTexture1D(tex_inv_denominator_u);
-    initTexture1D(tex_inv_denominator_v);
-    initTexture1D(tex_numerator_dudv);
-    initTexture1D(tex_numerator_u);
-    initTexture1D(tex_numerator_v);
-}
-
 namespace
 {
     struct ImagePyramid
@@ -804,8 +690,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 
         ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
             cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
-
-        InitTextures();
     }
 
     //prepare image pyramid
@@ -909,9 +793,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
         ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
 
         //select images with lowest resolution
-        size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
-        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
-        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
         ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
 
         FloatVector* ptrU = &u;
@@ -941,17 +822,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
             ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
             ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
 
-            //texture format descriptor
-            cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();
-
             I0 = *img0Iter;
             I1 = *img1Iter;
 
             ++img0Iter;
             ++img1Iter;
 
-            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
+            Texture texI0(kLevelHeight, kLevelWidth, I0->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
+            Texture texI1(kLevelHeight, kLevelWidth, I1->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
 
             //compute derivatives
             dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
@@ -991,20 +869,24 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
             ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
                 nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
 
-            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix,  Ix.ptr(),  ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy,  Iy.ptr(),  ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
+            Texture texIx(kLevelHeight, kLevelWidth, Ix.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
+            Texture texIxx(kLevelHeight, kLevelWidth, Ixx.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
+            Texture texIx0(kLevelHeight, kLevelWidth, Ix0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
+            Texture texIy(kLevelHeight, kLevelWidth, Iy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
+            Texture texIyy(kLevelHeight, kLevelWidth, Iyy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
+            Texture texIy0(kLevelHeight, kLevelWidth, Iy0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
+            Texture texIxy(kLevelHeight, kLevelWidth, Ixy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
+            Texture texDiffX(1, kLevelSizeInBytes / sizeof(float), diffusivity_x.ptr(), kLevelSizeInBytes);
+            Texture texDiffY(1, kLevelSizeInBytes / sizeof(float), diffusivity_y.ptr(), kLevelSizeInBytes);
 
             //    flow
-            ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+            Texture texU(1, kLevelSizeInBytes / sizeof(float), ptrU->ptr(), kLevelSizeInBytes);
+            Texture texV(1, kLevelSizeInBytes / sizeof(float), ptrV->ptr(), kLevelSizeInBytes);
             //    flow increments
-            ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+            Texture texDu(1, kLevelSizeInBytes / sizeof(float), du.ptr(), kLevelSizeInBytes);
+            Texture texDv(1, kLevelSizeInBytes / sizeof(float), dv.ptr(), kLevelSizeInBytes);
+            Texture texDuNew(1, kLevelSizeInBytes / sizeof(float), du_new.ptr(), kLevelSizeInBytes);
+            Texture texDvNew(1, kLevelSizeInBytes / sizeof(float), dv_new.ptr(), kLevelSizeInBytes);
 
             dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
             dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
@@ -1018,89 +900,30 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
             for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
             {
                 //compute coefficients
-                prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
-                    (diffusivity_x.ptr(),
-                     diffusivity_y.ptr(),
-                     denom_u.ptr(),
-                     denom_v.ptr(),
-                     num_dudv.ptr(),
-                     num_u.ptr(),
-                     num_v.ptr(),
-                     kLevelWidth,
-                     kLevelHeight,
-                     kLevelStride,
-                     alpha,
-                     gamma);
+                prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>> (texU, texV, texDu, texDv, texI0, texI1, texIx, texIxx, texIx0, texIy, texIyy, texIy0, texIxy,
+                    diffusivity_x.ptr(), diffusivity_y.ptr(), denom_u.ptr(), denom_v.ptr(), num_dudv.ptr(), num_u.ptr(), num_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride, alpha, gamma);
 
                 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
 
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-
-                prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
+                prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(texDiffX, texDiffY, denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
 
                 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
 
-                //    linear system coefficients
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
 
                 //solve linear system
                 for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
                 {
                     float omega = 1.99f;
-
-                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-
-                    sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
-                        (du_new.ptr(),
-                        dv_new.ptr(),
-                        denom_u.ptr(),
-                        denom_v.ptr(),
-                        num_u.ptr(),
-                        num_v.ptr(),
-                        num_dudv.ptr(),
-                        omega,
-                        kLevelWidth,
-                        kLevelHeight,
-                        kLevelStride);
+                    sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDu, texDv, texDiffX, texDiffY, du_new.ptr(), dv_new.ptr(), denom_u.ptr(), denom_v.ptr(),
+                        num_u.ptr(), num_v.ptr(), num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
 
                     ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
 
-                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-
-                    sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
-                        (du.ptr(),
-                        dv.ptr(),
-                        denom_u.ptr(),
-                        denom_v.ptr(),
-                        num_u.ptr(),
-                        num_v.ptr(),
-                        num_dudv.ptr(),
-                        omega,
-                        kLevelWidth,
-                        kLevelHeight,
-                        kLevelStride);
 
-                    ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
+                    sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDuNew, texDvNew, texDiffX, texDiffY, du.ptr(), dv.ptr(), denom_u.ptr(), denom_v.ptr(), num_u.ptr(),
+                        num_v.ptr(),num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
 
-                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                    ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
                 }//end of solver loop
             }// end of inner loop
 
diff --git a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
index 57506173f50..9760bcee523 100644
--- a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
+++ b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
@@ -72,6 +72,7 @@
 #include "opencv2/cudalegacy/NCV.hpp"
 #include "opencv2/cudalegacy/NPP_staging.hpp"
 #include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 #include "NCVRuntimeTemplates.hpp"
 #include "NCVAlg.hpp"
@@ -94,24 +95,6 @@ const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;
 #define NUM_THREADS_CLASSIFIERPARALLEL          (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)
 
 
-/** \internal
-* Haar features solid array.
-*/
-texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;
-
-
-/** \internal
-* Haar classifiers flattened trees container.
-* Two parts: first contains root nodes, second - nodes that are referred by root nodes.
-* Drawback: breaks tree locality (might cause more cache misses
-* Advantage: No need to introduce additional 32-bit field to index root nodes offsets
-*/
-texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;
-
-
-texture<Ncv32u, 1, cudaReadModeElementType> texIImage;
-
-
 __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
 {
     return d_Stages[iStage];
@@ -119,51 +102,37 @@ __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
 
 
 template <NcvBool tbCacheTextureCascade>
-__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
+__device__ HaarClassifierNode128 getClassifierNode(cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes) // Returns node iNode: read through texHaarClassifierNodes when tbCacheTextureCascade is true, otherwise indexed from d_ClassifierNodes.
 {
     HaarClassifierNode128 tmpNode;
     if (tbCacheTextureCascade)
-    {
-        tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
-    }
+        tmpNode._ui4 = texHaarClassifierNodes(iNode); // texture-object read; takes the place of tex1Dfetch on the removed file-scope texture reference
     else
-    {
         tmpNode = d_ClassifierNodes[iNode];
-    }
     return tmpNode;
 }
 
 
 template <NcvBool tbCacheTextureCascade>
-__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
-                           Ncv32f *weight,
-                           Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
+__device__ void getFeature(cv::cudev::TexturePtr<uint2> texHaarFeatures, Ncv32u iFeature, HaarFeature64* d_Features, Ncv32f* weight, Ncv32u* rectX, Ncv32u* rectY, Ncv32u* rectWidth, Ncv32u* rectHeight) // Loads feature iFeature (texture path when tbCacheTextureCascade is true, d_Features otherwise) and writes its rectangle and weight to the output pointers.
 {
     HaarFeature64 feature;
     if (tbCacheTextureCascade)
-    {
-        feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
-    }
+        feature._ui2 = texHaarFeatures(iFeature); // texture-object read; takes the place of tex1Dfetch on the removed file-scope texture reference
     else
-    {
         feature = d_Features[iFeature];
-    }
     feature.getRect(rectX, rectY, rectWidth, rectHeight);
     *weight = feature.getWeight();
 }
 
 
 template <NcvBool tbCacheTextureIImg>
-__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
+__device__ Ncv32u getElemIImg(cv::cudev::TexturePtr<Ncv32u> texImg, Ncv32u x, Ncv32u *d_IImg) // Returns element x: read through texImg when tbCacheTextureIImg is true, otherwise d_IImg[x].
 {
     if (tbCacheTextureIImg)
-    {
-        return tex1Dfetch(texIImage, x);
-    }
+        return texImg(x); // texture-object read; takes the place of tex1Dfetch on the removed file-scope texIImage reference
     else
-    {
         return d_IImg[x];
-    }
 }
 
 
@@ -203,17 +172,10 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
 }
 
 
-template <NcvBool tbInitMaskPositively,
-          NcvBool tbCacheTextureIImg,
-          NcvBool tbCacheTextureCascade,
-          NcvBool tbReadPixelIndexFromVector,
-          NcvBool tbDoAtomicCompaction>
-__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
-                                                  Ncv32f *d_weights, Ncv32u weightsStride,
-                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
-                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
-                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
-                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
+template <NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction>
+__global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes,
+    Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask,
+    Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride,  NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
 {
     Ncv32u y_offs;
     Ncv32u x_offs;
@@ -299,7 +261,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
                 {
                     while (bMoreNodesToTraverse)
                     {
-                        HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
+                        HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
                         HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                         Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                         Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@@ -310,19 +272,17 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
                         {
                             Ncv32f rectWeight;
                             Ncv32u rectX, rectY, rectWidth, rectHeight;
-                            getFeature<tbCacheTextureCascade>
-                                (iFeature + iRect, d_Features,
-                                &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
+                            getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
 
                             Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                             Ncv32u iioffsTR = iioffsTL + rectWidth;
                             Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                             Ncv32u iioffsBR = iioffsBL + rectWidth;
 
-                            Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
-                                             getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
-                                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
-                                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
+                            Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
+                                             getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
+                                             getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
+                                             getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
 
     #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                         curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@@ -393,15 +353,10 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
 }
 
 
-template <NcvBool tbCacheTextureIImg,
-          NcvBool tbCacheTextureCascade,
-          NcvBool tbDoAtomicCompaction>
-__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
-                                                      Ncv32f *d_weights, Ncv32u weightsStride,
-                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
-                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
-                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
-                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
+template <NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction>
+__global__ void applyHaarClassifierClassifierParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
+    Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
+    Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
 {
     Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;
 
@@ -439,7 +394,7 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
 
                 while (bMoreNodesToTraverse)
                 {
-                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
+                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
                     HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                     Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                     Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@@ -450,19 +405,17 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
                     {
                         Ncv32f rectWeight;
                         Ncv32u rectX, rectY, rectWidth, rectHeight;
-                        getFeature<tbCacheTextureCascade>
-                            (iFeature + iRect, d_Features,
-                            &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
+                        getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
 
                         Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                         Ncv32u iioffsTR = iioffsTL + rectWidth;
                         Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                         Ncv32u iioffsBR = iioffsBL + rectWidth;
 
-                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
-                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
-                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
-                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
+                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
+                                         getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
+                                         getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
+                                         getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
 
 #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                         curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@@ -578,8 +531,9 @@ struct applyHaarClassifierAnchorParallelFunctor
 {
     dim3 gridConf, blockConf;
     cudaStream_t cuStream;
-
-    //Kernel arguments are stored as members;
+    cv::cudev::TexturePtr<Ncv32u> texImg;
+    cv::cudev::TexturePtr<uint2> texHaarFeatures;
+    cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
     Ncv32u *d_IImg;
     Ncv32u IImgStride;
     Ncv32f *d_weights;
@@ -597,32 +551,12 @@ struct applyHaarClassifierAnchorParallelFunctor
     Ncv32f scaleArea;
 
     //Arguments are passed through the constructor
-    applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
-                                             Ncv32u *_d_IImg, Ncv32u _IImgStride,
-                                             Ncv32f *_d_weights, Ncv32u _weightsStride,
-                                             HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
-                                             Ncv32u *_d_inMask, Ncv32u *_d_outMask,
-                                             Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
-                                             NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
-                                             Ncv32u _endStageExc, Ncv32f _scaleArea) :
-    gridConf(_gridConf),
-    blockConf(_blockConf),
-    cuStream(_cuStream),
-    d_IImg(_d_IImg),
-    IImgStride(_IImgStride),
-    d_weights(_d_weights),
-    weightsStride(_weightsStride),
-    d_Features(_d_Features),
-    d_ClassifierNodes(_d_ClassifierNodes),
-    d_Stages(_d_Stages),
-    d_inMask(_d_inMask),
-    d_outMask(_d_outMask),
-    mask1Dlen(_mask1Dlen),
-    mask2Dstride(_mask2Dstride),
-    anchorsRoi(_anchorsRoi),
-    startStageInc(_startStageInc),
-    endStageExc(_endStageExc),
-    scaleArea(_scaleArea)
+    applyHaarClassifierAnchorParallelFunctor(cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, dim3 _gridConf,
+        dim3 _blockConf, cudaStream_t _cuStream, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes,
+        HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi, Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) :
+        gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_), texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride),
+        d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features), d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen),
+        mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi), startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
     {}
 
     template<class TList>
@@ -635,43 +569,19 @@ struct applyHaarClassifierAnchorParallelFunctor
             Loki::TL::TypeAt<TList, 2>::Result::value,
             Loki::TL::TypeAt<TList, 3>::Result::value,
             Loki::TL::TypeAt<TList, 4>::Result::value >
-            <<<gridConf, blockConf, 0, cuStream>>>
-            (d_IImg, IImgStride,
-            d_weights, weightsStride,
-            d_Features, d_ClassifierNodes, d_Stages,
-            d_inMask, d_outMask,
-            mask1Dlen, mask2Dstride,
-            anchorsRoi, startStageInc,
-            endStageExc, scaleArea);
+            <<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
+                d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
     }
 };
 
 
-void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
-                                                  NcvBool tbCacheTextureIImg,
-                                                  NcvBool tbCacheTextureCascade,
-                                                  NcvBool tbReadPixelIndexFromVector,
-                                                  NcvBool tbDoAtomicCompaction,
-
-                                                  dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
-
-                                                  Ncv32u *d_IImg, Ncv32u IImgStride,
-                                                  Ncv32f *d_weights, Ncv32u weightsStride,
-                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
-                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
-                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
-                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc,
-                                                  Ncv32u endStageExc, Ncv32f scaleArea)
+void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction,
+    dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
+    Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
+    Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
 {
-
-    applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
-                                                     d_IImg, IImgStride,
-                                                     d_weights, weightsStride,
-                                                     d_Features, d_ClassifierNodes, d_Stages,
-                                                     d_inMask, d_outMask,
-                                                     mask1Dlen, mask2Dstride,
-                                                     anchorsRoi, startStageInc,
-                                                     endStageExc, scaleArea);
+    applyHaarClassifierAnchorParallelFunctor functor(texImg, texHaarFeatures, texHaarClassifierNodes, gridConf, blockConf, cuStream, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages,
+                                                     d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
 
     //Second parameter is the number of "dynamic" template parameters
     NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
@@ -688,8 +598,9 @@ struct applyHaarClassifierClassifierParallelFunctor
 {
     dim3 gridConf, blockConf;
     cudaStream_t cuStream;
-
-    //Kernel arguments are stored as members;
+    cv::cudev::TexturePtr<Ncv32u> texImg;
+    cv::cudev::TexturePtr<uint2> texHaarFeatures;
+    cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
     Ncv32u *d_IImg;
     Ncv32u IImgStride;
     Ncv32f *d_weights;
@@ -707,32 +618,13 @@ struct applyHaarClassifierClassifierParallelFunctor
     Ncv32f scaleArea;
 
     //Arguments are passed through the constructor
-    applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
-                                                 Ncv32u *_d_IImg, Ncv32u _IImgStride,
-                                                 Ncv32f *_d_weights, Ncv32u _weightsStride,
-                                                 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
-                                                 Ncv32u *_d_inMask, Ncv32u *_d_outMask,
-                                                 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
-                                                 NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
-                                                 Ncv32u _endStageExc, Ncv32f _scaleArea) :
-    gridConf(_gridConf),
-    blockConf(_blockConf),
-    cuStream(_cuStream),
-    d_IImg(_d_IImg),
-    IImgStride(_IImgStride),
-    d_weights(_d_weights),
-    weightsStride(_weightsStride),
-    d_Features(_d_Features),
-    d_ClassifierNodes(_d_ClassifierNodes),
-    d_Stages(_d_Stages),
-    d_inMask(_d_inMask),
-    d_outMask(_d_outMask),
-    mask1Dlen(_mask1Dlen),
-    mask2Dstride(_mask2Dstride),
-    anchorsRoi(_anchorsRoi),
-    startStageInc(_startStageInc),
-    endStageExc(_endStageExc),
-    scaleArea(_scaleArea)
+    applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_,
+        cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features,
+        HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi,
+        Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) : gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_),
+        texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride), d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features),
+        d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen), mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi),
+        startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
     {}
 
     template<class TList>
@@ -743,40 +635,19 @@ struct applyHaarClassifierClassifierParallelFunctor
             Loki::TL::TypeAt<TList, 0>::Result::value,
             Loki::TL::TypeAt<TList, 1>::Result::value,
             Loki::TL::TypeAt<TList, 2>::Result::value >
-            <<<gridConf, blockConf, 0, cuStream>>>
-            (d_IImg, IImgStride,
-            d_weights, weightsStride,
-            d_Features, d_ClassifierNodes, d_Stages,
-            d_inMask, d_outMask,
-            mask1Dlen, mask2Dstride,
-            anchorsRoi, startStageInc,
-            endStageExc, scaleArea);
+            <<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
+                d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
     }
 };
 
 
-void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
-                                                      NcvBool tbCacheTextureCascade,
-                                                      NcvBool tbDoAtomicCompaction,
-
-                                                      dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
-
-                                                      Ncv32u *d_IImg, Ncv32u IImgStride,
-                                                      Ncv32f *d_weights, Ncv32u weightsStride,
-                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
-                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
-                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
-                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc,
-                                                      Ncv32u endStageExc, Ncv32f scaleArea)
+void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction, dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
+    cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights,
+    Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride,
+    NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
 {
-    applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
-                                                         d_IImg, IImgStride,
-                                                         d_weights, weightsStride,
-                                                         d_Features, d_ClassifierNodes, d_Stages,
-                                                         d_inMask, d_outMask,
-                                                         mask1Dlen, mask2Dstride,
-                                                         anchorsRoi, startStageInc,
-                                                         endStageExc, scaleArea);
+    applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features,
+        d_ClassifierNodes, d_Stages, d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
 
     //Second parameter is the number of "dynamic" template parameters
     NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
@@ -1015,31 +886,15 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
 
     NCV_SKIP_COND_BEGIN
 
+    cv::cudev::Texture<Ncv32u> texImg;
     if (bTexCacheIImg)
-    {
-        cudaChannelFormatDesc cfdTexIImage;
-        cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();
+        texImg = cv::cudev::Texture<Ncv32u>((anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch(), integral.ptr());
 
-        size_t alignmentOffset;
-        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage,
-            (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR);
-        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
-    }
-
-    if (bTexCacheCascade)
-    {
-        cudaChannelFormatDesc cfdTexHaarFeatures;
-        cudaChannelFormatDesc cfdTexHaarClassifierNodes;
-        cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
-        cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();
-
-        size_t alignmentOffset;
-        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
-            d_HaarFeatures.ptr(), cfdTexHaarFeatures,sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
-        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
-        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
-            d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
-        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
+    cv::cudev::Texture<uint2> texHaarFeatures;
+    cv::cudev::Texture<uint4> texHaarClassifierNodes;
+    if (bTexCacheCascade) {
+        texHaarFeatures = cv::cudev::Texture<uint2>(sizeof(HaarFeature64) * haar.NumFeatures, reinterpret_cast<uint2*>(d_HaarFeatures.ptr()));
+        texHaarClassifierNodes = cv::cudev::Texture<uint4>(sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes, reinterpret_cast<uint4*>(d_HaarNodes.ptr()));
     }
 
     Ncv32u stageStartAnchorParallel = 0;
@@ -1130,26 +985,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
         dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                    anchorsRoi.height);
         dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
-        applyHaarClassifierAnchorParallelDynTemplate(
-            true,                         //tbInitMaskPositively
-            bTexCacheIImg,                //tbCacheTextureIImg
-            bTexCacheCascade,             //tbCacheTextureCascade
-            pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector
-            bDoAtomicCompaction,          //tbDoAtomicCompaction
-            grid1,
-            block1,
-            cuStream,
-            integral.ptr(), integral.stride(),
-            d_weights.ptr(), d_weights.stride(),
-            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
-            d_ptrNowData->ptr(),
-            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
-            0,
-            d_pixelMask.stride(),
-            anchorsRoi,
-            pixParallelStageStops[pixParallelStageStopsIndex],
-            pixParallelStageStops[pixParallelStageStopsIndex+1],
-            scaleAreaPixels);
+        applyHaarClassifierAnchorParallelDynTemplate( true, bTexCacheIImg, bTexCacheCascade,  pixParallelStageStops[pixParallelStageStopsIndex] != 0, bDoAtomicCompaction, grid1, block1, cuStream,
+            texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
+            d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), 0, d_pixelMask.stride(), anchorsRoi, pixParallelStageStops[pixParallelStageStopsIndex],
+            pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
         ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
 
         if (bDoAtomicCompaction)
@@ -1200,26 +1039,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
         }
         dim3 block2(NUM_THREADS_ANCHORSPARALLEL);
 
-        applyHaarClassifierAnchorParallelDynTemplate(
-            false,                        //tbInitMaskPositively
-            bTexCacheIImg,                //tbCacheTextureIImg
-            bTexCacheCascade,             //tbCacheTextureCascade
-            pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector
-            bDoAtomicCompaction,          //tbDoAtomicCompaction
-            grid2,
-            block2,
-            cuStream,
-            integral.ptr(), integral.stride(),
-            d_weights.ptr(), d_weights.stride(),
-            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
-            d_ptrNowData->ptr(),
-            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
-            numDetections,
-            d_pixelMask.stride(),
-            anchorsRoi,
-            pixParallelStageStops[pixParallelStageStopsIndex],
-            pixParallelStageStops[pixParallelStageStopsIndex+1],
-            scaleAreaPixels);
+        applyHaarClassifierAnchorParallelDynTemplate( false, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements, bDoAtomicCompaction,
+            grid2, block2, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(),
+            d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections, d_pixelMask.stride(), anchorsRoi,
+            pixParallelStageStops[pixParallelStageStopsIndex], pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
         ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
 
         if (bDoAtomicCompaction)
@@ -1263,24 +1086,9 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
         }
         dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);
 
-        applyHaarClassifierClassifierParallelDynTemplate(
-            bTexCacheIImg,                //tbCacheTextureIImg
-            bTexCacheCascade,             //tbCacheTextureCascade
-            bDoAtomicCompaction,          //tbDoAtomicCompaction
-            grid3,
-            block3,
-            cuStream,
-            integral.ptr(), integral.stride(),
-            d_weights.ptr(), d_weights.stride(),
-            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
-            d_ptrNowData->ptr(),
-            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
-            numDetections,
-            d_pixelMask.stride(),
-            anchorsRoi,
-            stageMiddleSwitch,
-            stageEndClassifierParallel,
-            scaleAreaPixels);
+        applyHaarClassifierClassifierParallelDynTemplate(bTexCacheIImg, bTexCacheCascade, bDoAtomicCompaction, grid3, block3, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(),
+            d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections,
+            d_pixelMask.stride(), anchorsRoi, stageMiddleSwitch, stageEndClassifierParallel, scaleAreaPixels);
         ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
 
         if (bDoAtomicCompaction)
diff --git a/modules/cudalegacy/src/cuda/NPP_staging.cu b/modules/cudalegacy/src/cuda/NPP_staging.cu
index 90880d56cc5..6626526f737 100644
--- a/modules/cudalegacy/src/cuda/NPP_staging.cu
+++ b/modules/cudalegacy/src/cuda/NPP_staging.cu
@@ -48,12 +48,7 @@
 #include "opencv2/cudev.hpp"
 
 #include "opencv2/cudalegacy/NPP_staging.hpp"
-
-
-texture<Ncv8u,  1, cudaReadModeElementType> tex8u;
-texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
-texture<uint2,  1, cudaReadModeElementType> tex64u;
-
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 //==============================================================================
 //
@@ -71,7 +66,6 @@ cudaStream_t nppStGetActiveCUDAstream(void)
 }
 
 
-
 cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
 {
     cudaStream_t tmp = nppStream;
@@ -117,25 +111,25 @@ private:
 
 
 template<class T>
-inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
+inline __device__ T readElem(cv::cudev::TexturePtr<Ncv8u> tex8u, T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
 
 
 template<>
-inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
+inline __device__ Ncv8u readElem<Ncv8u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv8u* d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) // 8-bit specialization: the value is fetched through tex8u; d_src is not dereferenced here
 {
-    return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs);
+    return tex8u(texOffs + srcStride * blockIdx.x + curElemOffs); // flat index = texOffs + blockIdx.x * srcStride + curElemOffs (scanRows is launched with roi.height blocks, so blockIdx.x presumably selects the row)
 }
 
 
 template<>
-inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
+inline __device__ Ncv32u readElem<Ncv32u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) // Ncv32u specialization: tex8u, texOffs and srcStride are unused here; the value is read directly from d_src
 {
     return d_src[curElemOffs];
 }
 
 
 template<>
-inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
+inline __device__ Ncv32f readElem<Ncv32f>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) // Ncv32f specialization: tex8u, texOffs and srcStride are unused here; the value is read directly from d_src
 {
     return d_src[curElemOffs];
 }
@@ -160,8 +154,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u
 * \return None
 */
 template <class T_in, class T_out, bool tbDoSqr>
-__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride,
-                         T_out *d_II, Ncv32u IIstride)
+__global__ void scanRows(cv::cudev::TexturePtr<Ncv8u> tex8u, T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride, T_out *d_II, Ncv32u IIstride)
 {
     //advance pointers to the current line
     if (sizeof(T_in) != 1)
@@ -190,7 +183,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
         if (curElemOffs < srcWidth)
         {
             //load elements
-            curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs);
+            curElem = readElem<T_in>(tex8u, d_src, texOffs, srcStride, curElemOffs);
         }
         curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
 
@@ -224,25 +217,9 @@ template <bool tbDoSqr, class T_in, class T_out>
 NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
                                 T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
 {
-    cudaChannelFormatDesc cfdTex;
-    size_t alignmentOffset = 0;
-    if (sizeof(T_in) == 1)
-    {
-        cfdTex = cudaCreateChannelDesc<Ncv8u>();
-        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
-        if (alignmentOffset > 0)
-        {
-            ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
-            ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
-        }
-    }
-    scanRows
-        <T_in, T_out, tbDoSqr>
-        <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
-        (d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
-
+    cv::cudev::Texture<Ncv8u> tex8u; if (sizeof(T_in) == 1) tex8u = cv::cudev::Texture<Ncv8u>(static_cast<size_t>(roi.height) * srcStride, (Ncv8u*)d_src); // only readElem<Ncv8u> samples the texture, so wider element types skip creating one (as before this change); roi.height is widened first so the byte count is computed in size_t
+    scanRows <T_in, T_out, tbDoSqr> <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>> (tex8u, d_src, 0, roi.width, srcStride, d_dst, dstStride);
     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
-
     return NPPST_SUCCESS;
 }
 
@@ -585,59 +562,25 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
 const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32;
 const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
 
-
-template<class T, NcvBool tbCacheTexture>
-__device__ T getElem_Decimate(Ncv32u x, T *d_src);
-
-
-template<>
-__device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
-{
-    return tex1Dfetch(tex32u, x);
-}
-
-
-template<>
-__device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
-{
-    return d_src[x];
-}
-
-
-template<>
-__device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
-{
-    uint2 tmp = tex1Dfetch(tex64u, x);
-    Ncv64u res = (Ncv64u)tmp.y;
-    res <<= 32;
-    res |= tmp.x;
-    return res;
-}
-
-
-template<>
-__device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
+template <class T>
+__global__ void decimate_C1R(T* d_src, Ncv32u srcStep, T* d_dst, Ncv32u dstStep, NcvSize32u dstRoi, Ncv32u scale) // direct-read overload: each thread writes one d_dst element, loading its source value straight from d_src
 {
-    return d_src[x];
+    int curX = blockIdx.x * blockDim.x + threadIdx.x; // destination column handled by this thread
+    int curY = blockIdx.y * blockDim.y + threadIdx.y; // destination row handled by this thread
+    if (curX >= dstRoi.width || curY >= dstRoi.height) return; // threads that fall outside dstRoi exit without writing
+    d_dst[curY * dstStep + curX] = d_src[(curY * srcStep + curX) * scale]; // source index is (curY * srcStep + curX) multiplied by scale
 }
 
-
-template <class T, NcvBool tbCacheTexture>
-__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
-                                      NcvSize32u dstRoi, Ncv32u scale)
+template <class T>
+__global__ void decimate_C1R(cv::cudev::TexturePtr<T> texSrc, Ncv32u srcStep, T* d_dst, Ncv32u dstStep, // texture overload: same indexing as the pointer overload, but the source value is fetched through texSrc
+    NcvSize32u dstRoi, Ncv32u scale)
 {
     int curX = blockIdx.x * blockDim.x + threadIdx.x;
     int curY = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (curX >= dstRoi.width || curY >= dstRoi.height)
-    {
-        return;
-    }
-
-    d_dst[curY * dstStep + curX] = getElem_Decimate<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
+    if (curX >= dstRoi.width || curY >= dstRoi.height) return; // threads that fall outside dstRoi exit without writing
+    d_dst[curY * dstStep + curX] = texSrc((curY * srcStep + curX) * scale); // single-index fetch at (curY * srcStep + curX) * scale
 }
 
-
 template <class T>
 static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
                                                 T *d_dst, Ncv32u dstStep,
@@ -659,39 +602,12 @@ static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
     dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X,
               (dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
     dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
-
-    if (!readThruTexture)
-    {
-        decimate_C1R
-            <T, false>
-            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
-            (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
+    if (!readThruTexture) {
+        decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
     }
-    else
-    {
-        cudaChannelFormatDesc cfdTexSrc;
-
-        if (sizeof(T) == sizeof(Ncv32u))
-        {
-            cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
-
-            size_t alignmentOffset;
-            ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
-            ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
-        }
-        else
-        {
-            cfdTexSrc = cudaCreateChannelDesc<uint2>();
-
-            size_t alignmentOffset;
-            ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
-            ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
-        }
-
-        decimate_C1R
-            <T, true>
-            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
-            (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
+    else {
+        cv::cudev::Texture<T> texSrc(srcRoi.height * srcStep * sizeof(T), d_src); // texture over d_src; size argument is srcRoi.height * srcStep elements of T, in bytes
+        decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSrc, srcStep, d_dst, dstStep, dstRoi, scale);
     }
 
     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
@@ -753,11 +669,7 @@ static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
 
 
 implementNppDecimate(32, u)
-implementNppDecimate(32, s)
-implementNppDecimate(32, f)
 implementNppDecimate(64, u)
-implementNppDecimate(64, s)
-implementNppDecimate(64, f)
 implementNppDecimateHost(32, u)
 implementNppDecimateHost(32, s)
 implementNppDecimateHost(32, f)
@@ -776,43 +688,29 @@ implementNppDecimateHost(64, f)
 const Ncv32u NUM_RECTSTDDEV_THREADS = 128;
 
 
-template <NcvBool tbCacheTexture>
-__device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum)
+template <NcvBool tbCacheTexture, class Ptr2D>
+__device__ Ncv32u getElemSum(Ptr2D tex, Ncv32u x, Ncv32u *d_sum) // returns element x of the sum data: through tex when tbCacheTexture is set, otherwise from d_sum
 {
     if (tbCacheTexture)
-    {
-        return tex1Dfetch(tex32u, x);
-    }
+        return tex(x); // single-index fetch through the texture functor; d_sum is not touched on this path
     else
-    {
         return d_sum[x];
-    }
 }
 
 
-template <NcvBool tbCacheTexture>
-__device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum)
+template <NcvBool tbCacheTexture, class Ptr2D>
+__device__ Ncv64u getElemSqSum(Ptr2D tex, Ncv32u x, Ncv64u *d_sqsum) // returns element x of the squared-sum data: through tex when tbCacheTexture is set, otherwise from d_sqsum
 {
     if (tbCacheTexture)
-    {
-        uint2 tmp = tex1Dfetch(tex64u, x);
-        Ncv64u res = (Ncv64u)tmp.y;
-        res <<= 32;
-        res |= tmp.x;
-        return res;
-    }
+        return tex(x); // the manual uint2 -> 64-bit reassembly was removed here; NOTE(review): presumably the texture functor now yields the 64-bit value itself - confirm in texture.hpp
     else
-    {
         return d_sqsum[x];
-    }
 }
 
 
 template <NcvBool tbCacheTexture>
-__global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
-                                   Ncv64u *d_sqsum, Ncv32u sqsumStep,
-                                   Ncv32f *d_norm, Ncv32u normStep,
-                                   NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
+__global__ void rectStdDev_32f_C1R(cv::cudev::TexturePtr<Ncv32u> texSum, cv::cudev::TexturePtr<Ncv64u> texSumSq, Ncv32u *d_sum, Ncv32u sumStep, Ncv64u *d_sqsum, Ncv32u sqsumStep,
+    Ncv32f *d_norm, Ncv32u normStep, NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
 {
     Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x;
     if (x_offs >= roi.width)
@@ -824,17 +722,17 @@ __global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
     Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs;
 
     //OPT: try swapping order (could change cache hit/miss ratio)
-    Ncv32u sum_tl = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x, d_sum);
-    Ncv32u sum_bl = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
-    Ncv32u sum_tr = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
-    Ncv32u sum_br = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
+    Ncv32u sum_tl = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x, d_sum);
+    Ncv32u sum_bl = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
+    Ncv32u sum_tr = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
+    Ncv32u sum_br = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
     Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl;
 
     Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br;
-    sqsum_tl = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
-    sqsum_bl = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
-    sqsum_tr = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
-    sqsum_br = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
+    sqsum_tl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
+    sqsum_bl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
+    sqsum_tr = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
+    sqsum_br = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
     Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl;
 
     Ncv32f mean = sum_val * invRectArea;
@@ -897,31 +795,12 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
     dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height);
     dim3 block(NUM_RECTSTDDEV_THREADS);
 
+    cv::cudev::Texture<Ncv32u> texSum; if (readThruTexture) texSum = cv::cudev::Texture<Ncv32u>((roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u), d_sum); // the <false> launch reads d_sum directly, so the texture is only created on the read-through-texture path (as before this change)
+    cv::cudev::Texture<Ncv64u> texSumSq; if (readThruTexture) texSumSq = cv::cudev::Texture<Ncv64u>((roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u), d_sqsum); // same for the squared-sum texture
     if (!readThruTexture)
-    {
-        rectStdDev_32f_C1R
-            <false>
-            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
-            (d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
-    }
+        rectStdDev_32f_C1R<false><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
     else
-    {
-        cudaChannelFormatDesc cfdTexSrc;
-        cudaChannelFormatDesc cfdTexSqr;
-        cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
-        cfdTexSqr = cudaCreateChannelDesc<uint2>();
-
-        size_t alignmentOffset;
-        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_sum, cfdTexSrc, (roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u)), NPPST_TEXTURE_BIND_ERROR);
-        ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
-        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_sqsum, cfdTexSqr, (roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u)), NPPST_TEXTURE_BIND_ERROR);
-        ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
-
-        rectStdDev_32f_C1R
-            <true>
-            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
-            (NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
-    }
+        rectStdDev_32f_C1R<true><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
 
     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
 
@@ -1553,40 +1432,24 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
 //
 //==============================================================================
 
-
-texture <float, 1, cudaReadModeElementType> texSrc;
-texture <float, 1, cudaReadModeElementType> texKernel;
-
-
-__forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
-                                                   int i,
-                                                   int w)
+__forceinline__ __device__ float getValueMirrorRow(cv::cudev::TexturePtr< Ncv32f> tex, const int rowOffset, int i, int w) // fetches the sample at column i of the row starting at rowOffset; an i outside [0, w) is first reflected: i < 0 becomes 1 - i, i >= w becomes 2*w - i - 1
 {
     if (i < 0) i = 1 - i;
     if (i >= w) i = w + w - i - 1;
-    return tex1Dfetch (texSrc, rowOffset + i);
+    return tex(rowOffset + i); // single-index fetch at the (possibly reflected) position
 }
 
 
-__forceinline__ __device__ float getValueMirrorColumn(const int offset,
-                                                      const int rowStep,
-                                                      int j,
-                                                      int h)
+__forceinline__ __device__ float getValueMirrorColumn(cv::cudev::TexturePtr< Ncv32f> tex, const int offset, const int rowStep, int j, int h) // fetches the sample at row j of the column starting at offset; a j outside [0, h) is first reflected: j < 0 becomes 1 - j, j >= h becomes 2*h - j - 1
 {
     if (j < 0) j = 1 - j;
     if (j >= h) j = h + h - j - 1;
-    return tex1Dfetch (texSrc, offset + j * rowStep);
+    return tex(offset + j * rowStep); // rows are rowStep elements apart in the flat index
 }
 
 
-__global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
-                                              Ncv32f *pDst,
-                                              NcvSize32u dstSize,
-                                              Ncv32u dstStep,
-                                              NcvRect32u roi,
-                                              Ncv32s nKernelSize,
-                                              Ncv32s nAnchor,
-                                              Ncv32f multiplier)
+__global__ void FilterRowBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep, // texSrc: source samples, texKernel: filter taps (named as in FilterColumnBorderMirror_32f_C1R)
+    NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
 {
     // position within ROI
     const int ix = blockDim.x * blockIdx.x + threadIdx.x;
@@ -1606,22 +1469,16 @@ __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
     float sum = 0.0f;
     for (int m = 0; m < nKernelSize; ++m)
     {
-        sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width)
-            * tex1Dfetch (texKernel, m);
+        sum += getValueMirrorRow(texSrc, rowOffset, ix + m - p, roi.width) // mirrored source sample for tap m
+            * texKernel(m); // weight of tap m
     }
 
     pDst[iy * dstStep + ix] = sum * multiplier;
 }
 
 
-__global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
-                                                 Ncv32f *pDst,
-                                                 NcvSize32u dstSize,
-                                                 Ncv32u dstStep,
-                                                 NcvRect32u roi,
-                                                 Ncv32s nKernelSize,
-                                                 Ncv32s nAnchor,
-                                                 Ncv32f multiplier)
+__global__ void FilterColumnBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep,
+    NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
 {
     const int ix = blockDim.x * blockIdx.x + threadIdx.x;
     const int iy = blockDim.y * blockIdx.y + threadIdx.y;
@@ -1638,15 +1495,15 @@ __global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
     float sum = 0.0f;
     for (int m = 0; m < nKernelSize; ++m)
     {
-        sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height)
-            * tex1Dfetch (texKernel, m);
+        sum += getValueMirrorColumn(texSrc, offset, srcStep, iy + m - p, roi.height)
+            * texKernel(m);
     }
 
     pDst[ix + iy * dstStep] = sum * multiplier;
 }
 
 
-NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
                                         NcvSize32u srcSize,
                                         Ncv32u nSrcStep,
                                         Ncv32f *pDst,
@@ -1654,7 +1511,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
                                         Ncv32u nDstStep,
                                         NcvRect32u oROI,
                                         NppStBorderType borderType,
-                                        const Ncv32f *pKernel,
+                                        Ncv32f *pKernel,
                                         Ncv32s nKernelSize,
                                         Ncv32s nAnchor,
                                         Ncv32f multiplier)
@@ -1686,12 +1543,8 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
         oROI.height = srcSize.height - oROI.y;
     }
 
-    cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
-    texSrc.normalized    = false;
-    texKernel.normalized = false;
-
-    cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
-    cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
+    cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
+    cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
 
     dim3 ctaSize (32, 6);
     dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@@ -1706,8 +1559,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
     case nppStBorderWrap:
         return NPPST_ERROR;
     case nppStBorderMirror:
-        FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
-            (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
+        FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
         ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
         break;
     default:
@@ -1718,7 +1570,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
 }
 
 
-NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
                                            NcvSize32u srcSize,
                                            Ncv32u nSrcStep,
                                            Ncv32f *pDst,
@@ -1726,7 +1578,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
                                            Ncv32u nDstStep,
                                            NcvRect32u oROI,
                                            NppStBorderType borderType,
-                                           const Ncv32f *pKernel,
+                                           Ncv32f *pKernel,
                                            Ncv32s nKernelSize,
                                            Ncv32s nAnchor,
                                            Ncv32f multiplier)
@@ -1758,12 +1610,8 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
         oROI.height = srcSize.height - oROI.y;
     }
 
-    cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
-    texSrc.normalized    = false;
-    texKernel.normalized = false;
-
-    cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
-    cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
+    cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
+    cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
 
     dim3 ctaSize (32, 6);
     dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@@ -1776,8 +1624,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
     case nppStBorderWrap:
         return NPPST_ERROR;
     case nppStBorderMirror:
-        FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
-            (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
+        FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
         ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
         break;
     default:
@@ -1800,16 +1647,11 @@ inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom)
     return (num + denom - 1)/denom;
 }
 
-
-texture<float, 2, cudaReadModeElementType> tex_src1;
-texture<float, 2, cudaReadModeElementType> tex_src0;
-
-
-__global__ void BlendFramesKernel(const float *u, const float *v,   // forward flow
-                                  const float *ur, const float *vr, // backward flow
-                                  const float *o0, const float *o1, // coverage masks
-                                  int w, int h, int s,
-                                  float theta, float *out)
+__global__ void BlendFramesKernel(cv::cudev::TexturePtr<Ncv32f> texSrc0, cv::cudev::TexturePtr<Ncv32f> texSrc1,
+    const float *u, const float *v,   // forward flow
+    const float *ur, const float *vr, // backward flow
+    const float *o0, const float *o1, // coverage masks
+    int w, int h, int s, float theta, float *out)
 {
     const int ix = threadIdx.x + blockDim.x * blockIdx.x;
     const int iy = threadIdx.y + blockDim.y * blockIdx.y;
@@ -1829,27 +1671,17 @@ __global__ void BlendFramesKernel(const float *u, const float *v,   // forward f
     bool b0 = o0[pos] > 1e-4f;
     bool b1 = o1[pos] > 1e-4f;
 
-    if (b0 && b1)
-    {
-        // pixel is visible on both frames
-        out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) +
-            tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta;
-    }
-    else if (b0)
-    {
-        // visible on the first frame only
-        out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta);
-    }
-    else
-    {
-        // visible on the second frame only
-        out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta));
-    }
+    if (b0 && b1) // pixel is visible on both frames: blend frame 0 (weight 1 - theta) with frame 1 (weight theta)
+        out[pos] = texSrc0(y - _v * theta, x - _u * theta) * (1.0f - theta) + texSrc1(y + _v * (1.0f - theta), x + _u * (1.0f - theta)) * theta;
+    else if (b0) // visible on the first frame only
+        out[pos] = texSrc0(y - _v * theta, x - _u * theta);
+    else // visible on the second frame only
+        out[pos] = texSrc1(y - _vr * (1.0f - theta), x - _ur * (1.0f - theta));
 }
 
 
-NCVStatus BlendFrames(const Ncv32f *src0,
-                      const Ncv32f *src1,
+NCVStatus BlendFrames(Ncv32f *src0,
+                      Ncv32f *src1,
                       const Ncv32f *ufi,
                       const Ncv32f *vfi,
                       const Ncv32f *ubi,
@@ -1862,29 +1694,13 @@ NCVStatus BlendFrames(const Ncv32f *src0,
                       Ncv32f theta,
                       Ncv32f *out)
 {
-    tex_src1.addressMode[0] = cudaAddressModeClamp;
-    tex_src1.addressMode[1] = cudaAddressModeClamp;
-    tex_src1.filterMode = cudaFilterModeLinear;
-    tex_src1.normalized = false;
-
-    tex_src0.addressMode[0] = cudaAddressModeClamp;
-    tex_src0.addressMode[1] = cudaAddressModeClamp;
-    tex_src0.filterMode = cudaFilterModeLinear;
-    tex_src0.normalized = false;
-
-    cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
     const Ncv32u pitch = stride * sizeof (float);
-    ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
-    ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
-
+    cv::cudev::Texture<Ncv32f> texSrc0(height, width, src0, pitch, false, cudaFilterModeLinear);
+    cv::cudev::Texture<Ncv32f> texSrc1(height, width, src1, pitch, false, cudaFilterModeLinear);
     dim3 threads (32, 4);
     dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y));
-
-    BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
-        (ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
-
+    BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>(texSrc0, texSrc1, ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
-
     return NPPST_SUCCESS;
 }
 
@@ -2255,44 +2071,27 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
 //
 //==============================================================================
 
-
-texture <float, 2, cudaReadModeElementType> texSrc2D;
-
-
 __forceinline__
-__device__ float processLine(int spos,
-                             float xmin,
-                             float xmax,
-                             int ixmin,
-                             int ixmax,
-                             float fxmin,
-                             float cxmax)
+__device__ float processLine(cv::cudev::TexturePtr<Ncv32f> tex, int spos, float xmin, float xmax, int ixmin, int ixmax, float fxmin, float cxmax)
 {
     // first element
     float wsum = 1.0f - xmin + fxmin;
-    float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin);
+    float sum = tex( spos) * (1.0f - xmin + fxmin);
     spos++;
     for (int ix = ixmin + 1; ix < ixmax; ++ix)
     {
-        sum += tex1Dfetch(texSrc, spos);
+        sum += tex(spos);
         spos++;
         wsum += 1.0f;
     }
-    sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax);
+    sum += tex(spos) * (cxmax - xmax);
     wsum += cxmax - xmax;
     return sum / wsum;
 }
 
 
-__global__ void resizeSuperSample_32f(NcvSize32u srcSize,
-                                      Ncv32u srcStep,
-                                      NcvRect32u srcROI,
-                                      Ncv32f *dst,
-                                      NcvSize32u dstSize,
-                                      Ncv32u dstStep,
-                                      NcvRect32u dstROI,
-                                      Ncv32f scaleX,
-                                      Ncv32f scaleY)
+__global__ void resizeSuperSample_32f(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, Ncv32u srcStep, NcvRect32u srcROI, Ncv32f *dst, NcvSize32u dstSize, Ncv32u dstStep,
+    NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
 {
     // position within dst ROI
     const int ix = blockIdx.x * blockDim.x + threadIdx.x;
@@ -2332,18 +2131,18 @@ __global__ void resizeSuperSample_32f(NcvSize32u srcSize,
 
     float wsum = 1.0f - yBegin + floorYBegin;
 
-    float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
+    float sum = processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
         ceilXEnd) * (1.0f - yBegin + floorYBegin);
     pos += srcStep;
     for (int iy = iYBegin + 1; iy < iYEnd; ++iy)
     {
-        sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
+        sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
             ceilXEnd);
         pos += srcStep;
         wsum += 1.0f;
     }
 
-    sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
+    sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
         ceilXEnd) * (ceilYEnd - yEnd);
     wsum += ceilYEnd - yEnd;
     sum /= wsum;
@@ -2372,14 +2171,7 @@ __device__ float bicubicCoeff(float x_)
 }
 
 
-__global__ void resizeBicubic(NcvSize32u srcSize,
-                              NcvRect32u srcROI,
-                              NcvSize32u dstSize,
-                              Ncv32u dstStep,
-                              Ncv32f *dst,
-                              NcvRect32u dstROI,
-                              Ncv32f scaleX,
-                              Ncv32f scaleY)
+__global__ void resizeBicubic(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, NcvRect32u srcROI, NcvSize32u dstSize, Ncv32u dstStep, Ncv32f *dst, NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
 {
     const int ix = blockIdx.x * blockDim.x + threadIdx.x;
     const int iy = blockIdx.y * blockDim.y + threadIdx.y;
@@ -2433,7 +2225,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
             float wx = bicubicCoeff (xDist);
             float wy = bicubicCoeff (yDist);
             wx *= wy;
-            sum += wx * tex2D (texSrc2D, cx * dx, cy * dy);
+            sum += wx * texSrc(cy * dy, cx * dx);
             wsum += wx;
         }
     }
@@ -2441,7 +2233,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
 }
 
 
-NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
                                NcvSize32u srcSize,
                                Ncv32u nSrcStep,
                                NcvRect32u srcROI,
@@ -2469,33 +2261,17 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
 
     if (interpolation == nppStSupersample)
     {
-        // bind texture
-        cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep);
-        // invoke kernel
+        cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
         dim3 ctaSize (32, 6);
-        dim3 gridSize ((dstROI.width  + ctaSize.x - 1) / ctaSize.x,
-            (dstROI.height + ctaSize.y - 1) / ctaSize.y);
-
-        resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
-            (srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
+        dim3 gridSize ((dstROI.width  + ctaSize.x - 1) / ctaSize.x,(dstROI.height + ctaSize.y - 1) / ctaSize.y);
+        resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
     }
     else if (interpolation == nppStBicubic)
     {
-        texSrc2D.addressMode[0] = cudaAddressModeMirror;
-        texSrc2D.addressMode[1] = cudaAddressModeMirror;
-        texSrc2D.normalized = true;
-
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
-
-        cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height,
-            nSrcStep);
-
+        cv::cudev::Texture<float> texSrc(srcSize.height, srcSize.width, pSrc, nSrcStep, true, cudaFilterModePoint, cudaAddressModeMirror);
         dim3 ctaSize (32, 6);
-        dim3 gridSize ((dstSize.width  + ctaSize.x - 1) / ctaSize.x,
-            (dstSize.height + ctaSize.y - 1) / ctaSize.y);
-
-        resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
-            (srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
+        dim3 gridSize ((dstSize.width  + ctaSize.x - 1) / ctaSize.x, (dstSize.height + ctaSize.y - 1) / ctaSize.y);
+        resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
     }
     else
     {
diff --git a/modules/cudalegacy/src/cuda/bm.cu b/modules/cudalegacy/src/cuda/bm.cu
index 1307a8e3275..546f0903b05 100644
--- a/modules/cudalegacy/src/cuda/bm.cu
+++ b/modules/cudalegacy/src/cuda/bm.cu
@@ -46,29 +46,27 @@
 #include "opencv2/core/cuda/limits.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/reduce.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 using namespace cv::cuda;
 using namespace cv::cuda::device;
 
 namespace optflowbm
 {
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-    __device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
+    __device__ int cmpBlocks(cv::cudev::TexturePtr<uchar> texPrev, cv::cudev::TexturePtr<uchar> texCurr, int X1, int Y1, int X2, int Y2, int2 blockSize) // parameter order matches the (texPrev, texCurr) call sites
     {
         int s = 0;
 
         for (int y = 0; y < blockSize.y; ++y)
         {
             for (int x = 0; x < blockSize.x; ++x)
-                s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
+                s += ::abs(texPrev(Y1 + y, X1 + x) -texCurr(Y2 + y, X2 + x));
         }
 
         return s;
     }
 
-    __global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
+    __global__ void calcOptFlowBM(cv::cudev::TexturePtr<uchar> texPrev, cv::cudev::TexturePtr<uchar> texCurr, PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
                                   const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
                                   const short2* ss, const int ssCount)
     {
@@ -90,7 +88,7 @@ namespace optflowbm
         int dist = numeric_limits<int>::max();
 
         if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
-            dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
+            dist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
 
         int countMin = 1;
         int sumx = offX;
@@ -111,7 +109,7 @@ namespace optflowbm
 
                 if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
                 {
-                    const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
+                    const int tmpDist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
                     if (tmpDist < acceptLevel)
                     {
                         sumx = dx;
@@ -151,16 +149,12 @@ namespace optflowbm
     void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
               int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
     {
-        bindTexture(&tex_prev, prev);
-        bindTexture(&tex_curr, curr);
-
+        cv::cudev::Texture<uchar> texPrev(prev);
+        cv::cudev::Texture<uchar> texCurr(curr);
         const dim3 block(32, 8);
         const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
-
-        calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
-                                                  maxX, maxY, acceptLevel,  escapeLevel, ss, ssCount);
+        calcOptFlowBM<<<grid, block, 0, stream>>>(texPrev, texCurr, velx, vely, blockSize, shiftSize, usePrevious, maxX, maxY, acceptLevel,  escapeLevel, ss, ssCount);
         cudaSafeCall( cudaGetLastError() );
-
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
diff --git a/modules/cudalegacy/test/TestHypothesesGrow.cpp b/modules/cudalegacy/test/TestHypothesesGrow.cpp
index e7fe4d939df..ad4c3c9df3c 100644
--- a/modules/cudalegacy/test/TestHypothesesGrow.cpp
+++ b/modules/cudalegacy/test/TestHypothesesGrow.cpp
@@ -100,7 +100,8 @@ bool TestHypothesesGrow::process()
 
     NCV_SKIP_COND_BEGIN
     ncvAssertReturn(this->src.fill(h_vecSrc), false);
-    memset(h_vecDst.ptr(), 0, h_vecDst.length() * sizeof(NcvRect32u));
+    for (size_t i = 0; i < h_vecDst.length(); ++i) // zero every element, as the removed memset did, not just the first
+        h_vecDst.ptr()[i] = {};
     NCVVectorReuse<Ncv32u> h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u));
     ncvAssertReturn(h_vecDst_as32u.isMemReused(), false);
     ncvAssertReturn(this->src.fill(h_vecDst_as32u), false);
diff --git a/modules/cudaobjdetect/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu
index 5c12860620a..c7d72bfa9f8 100644
--- a/modules/cudaobjdetect/src/cuda/hog.cu
+++ b/modules/cudaobjdetect/src/cuda/hog.cu
@@ -46,6 +46,7 @@
 #include "opencv2/core/cuda/reduce.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/warp_shuffle.hpp"
+#include  <opencv2/cudev/ptr2d/texture.hpp>
 
 namespace cv { namespace cuda { namespace device
 {
@@ -825,64 +826,44 @@ namespace cv { namespace cuda { namespace device
         //-------------------------------------------------------------------
         // Resize
 
-        texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
-        texture<uchar,  2, cudaReadModeNormalizedFloat> resize8UC1_tex;
-
-        __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
+        __global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar> src, float sx, float sy, PtrStepSz<uchar> dst)
         {
             unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
             unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
 
             if (x < dst.cols && y < dst.rows)
-                dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
+                dst.ptr(y)[x] = src(y * sy, x * sx) * 255; // TexturePtr is sampled as (row, column)
         }
 
-        __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs)
+        __global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar, uchar4> src, float sx, float sy, PtrStepSz<uchar4> dst)
         {
             unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
             unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
 
             if (x < dst.cols && y < dst.rows)
             {
-                float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
+                uchar4 val = src(y * sy, x * sx); // TexturePtr is sampled as (row, column)
                 dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
             }
         }
 
-        template<class T, class TEX>
-        static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
+        template<class T>
+        static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst)
         {
-            tex.filterMode = cudaFilterModeLinear;
-
-            size_t texOfs = 0;
-            int colOfs = 0;
-
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
-            cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
-
-            if (texOfs != 0)
-            {
-                colOfs = static_cast<int>( texOfs/sizeof(T) );
-                cudaSafeCall( cudaUnbindTexture(tex) );
-                cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
-            }
-
+            cv::cudev::Texture<uchar, T> tex(src.rows, src.cols, src.data, src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat);
             dim3 threads(32, 8);
             dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
 
             float sx = static_cast<float>(src.cols) / dst.cols;
             float sy = static_cast<float>(src.rows) / dst.rows;
 
-            resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs);
+            resize_for_hog_kernel<<<grid, threads>>>(tex, sx, sy, (PtrStepSz<T>)dst);
             cudaSafeCall( cudaGetLastError() );
-
             cudaSafeCall( cudaDeviceSynchronize() );
-
-            cudaSafeCall( cudaUnbindTexture(tex) );
         }
 
-        void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
-        void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
+        void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst); }
+        void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst); }
     } // namespace hog
 }}} // namespace cv { namespace cuda { namespace cudev
 
diff --git a/modules/cudaobjdetect/test/test_objdetect.cpp b/modules/cudaobjdetect/test/test_objdetect.cpp
index 4843cc483ef..b12ad37f6dc 100644
--- a/modules/cudaobjdetect/test/test_objdetect.cpp
+++ b/modules/cudaobjdetect/test/test_objdetect.cpp
@@ -222,7 +222,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
 */
 //============== caltech hog tests =====================//
 
-struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string> >
+struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string, bool> >
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Mat img;
@@ -232,7 +232,13 @@ struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std
         devInfo = GET_PARAM(0);
         cv::cuda::setDevice(devInfo.deviceID());
 
-        img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
+        const bool grayScale = GET_PARAM(2);
+        if(grayScale)
+            img = readImage(GET_PARAM(1), IMREAD_GRAYSCALE);
+        else {
+            Mat imgBgr = readImage(GET_PARAM(1));
+            cv::cvtColor(imgBgr, img, COLOR_BGR2BGRA);
+        }
         ASSERT_FALSE(img.empty());
     }
 };
@@ -263,10 +269,11 @@ CUDA_TEST_P(CalTech, HOG)
 #endif
 }
 
+#define GREYSCALE true, false
 INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES,
     ::testing::Values<std::string>("caltech/image_00000009_0.png", "caltech/image_00000032_0.png",
         "caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png",
-        "caltech/image_00000527_0.png", "caltech/image_00000574_0.png")));
+        "caltech/image_00000527_0.png", "caltech/image_00000574_0.png"), testing::Values(GREYSCALE)));
 
 
 //------------------------variable GPU HOG Tests------------------------//
diff --git a/modules/cudaoptflow/src/cuda/pyrlk.cu b/modules/cudaoptflow/src/cuda/pyrlk.cu
index ca9759c2e53..da53046eae4 100644
--- a/modules/cudaoptflow/src/cuda/pyrlk.cu
+++ b/modules/cudaoptflow/src/cuda/pyrlk.cu
@@ -50,8 +50,7 @@
 #include "opencv2/core/cuda/reduce.hpp"
 #include "opencv2/core/cuda/filters.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
-
-#include <iostream>
+#include  <opencv2/cudev/ptr2d/texture.hpp>
 
 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -64,224 +63,6 @@ namespace pyrlk
     __constant__ int c_halfWin_y;
     __constant__ int c_iters;
 
-    texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
-    texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
-
-    texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
-
-
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
-    texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
-
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-    texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
-    texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
-
-    texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
-
-
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
-    texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
-
-
-    template <int cn, typename T> struct Tex_I
-    {
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T, cn>::vec_type> I)
-        {
-            CV_UNUSED(I);
-        }
-    };
-
-    template <> struct Tex_I<1, uchar>
-    {
-        static __device__ __forceinline__ float read(float x, float y)
-        {
-            return tex2D(tex_I8U, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& I)
-        {
-            bindTexture(&tex_I8U, I);
-        }
-    };
-    template <> struct Tex_I<1, ushort>
-    {
-        static __device__ __forceinline__ float read(float x, float y)
-        {
-            return 0.0;
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort>& I)
-        {
-            CV_UNUSED(I);
-        }
-    };
-    template <> struct Tex_I<1, int>
-    {
-        static __device__ __forceinline__ float read(float x, float y)
-        {
-            return 0.0;
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<int>& I)
-        {
-            CV_UNUSED(I);
-        }
-    };
-    template <> struct Tex_I<1, float>
-    {
-        static __device__ __forceinline__ float read(float x, float y)
-        {
-            return tex2D(tex_If, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& I)
-        {
-            bindTexture(&tex_If, I);
-        }
-    };
-    // ****************** 3 channel specializations ************************
-    template <> struct Tex_I<3, uchar>
-    {
-        static __device__ __forceinline__ float3 read(float x, float y)
-        {
-            return make_float3(0,0,0);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar3> I)
-        {
-            CV_UNUSED(I);
-        }
-    };
-    template <> struct Tex_I<3, ushort>
-    {
-        static __device__ __forceinline__ float3 read(float x, float y)
-        {
-            return make_float3(0, 0, 0);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort3> I)
-        {
-            CV_UNUSED(I);
-        }
-    };
-    template <> struct Tex_I<3, int>
-    {
-        static __device__ __forceinline__ float3 read(float x, float y)
-        {
-            return make_float3(0, 0, 0);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<int3> I)
-        {
-            CV_UNUSED(I);
-        }
-    };
-    template <> struct Tex_I<3, float>
-    {
-        static __device__ __forceinline__ float3 read(float x, float y)
-        {
-            return make_float3(0, 0, 0);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float3> I)
-        {
-            CV_UNUSED(I);
-        }
-    };
-    // ****************** 4 channel specializations ************************
-
-    template <> struct Tex_I<4, uchar>
-    {
-        static __device__ __forceinline__ float4 read(float x, float y)
-        {
-            return tex2D(tex_I8UC4, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& I)
-        {
-            bindTexture(&tex_I8UC4, I);
-        }
-    };
-    template <> struct Tex_I<4, ushort>
-    {
-        static __device__ __forceinline__ float4 read(float x, float y)
-        {
-            return tex2D(tex_I16UC4, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& I)
-        {
-            bindTexture(&tex_I16UC4, I);
-        }
-    };
-    template <> struct Tex_I<4, float>
-    {
-        static __device__ __forceinline__ float4 read(float x, float y)
-        {
-            return tex2D(tex_If4, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& I)
-        {
-            bindTexture(&tex_If4, I);
-        }
-    };
-    // ************* J  ***************
-    template <int cn, typename T> struct Tex_J
-    {
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T,cn>::vec_type>& J)
-        {
-            CV_UNUSED(J);
-        }
-    };
-    template <> struct Tex_J<1, uchar>
-    {
-        static __device__ __forceinline__ float read(float x, float y)
-        {
-            return tex2D(tex_J8U, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& J)
-        {
-            bindTexture(&tex_J8U, J);
-        }
-    };
-    template <> struct Tex_J<1, float>
-    {
-        static __device__ __forceinline__ float read(float x, float y)
-        {
-            return tex2D(tex_Jf, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& J)
-        {
-            bindTexture(&tex_Jf, J);
-        }
-    };
-    // ************* 4 channel specializations ***************
-    template <> struct Tex_J<4, uchar>
-    {
-        static __device__ __forceinline__ float4 read(float x, float y)
-        {
-            return tex2D(tex_J8UC4, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& J)
-        {
-            bindTexture(&tex_J8UC4, J);
-        }
-    };
-    template <> struct Tex_J<4, ushort>
-    {
-        static __device__ __forceinline__ float4 read(float x, float y)
-        {
-            return tex2D(tex_J16UC4, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& J)
-        {
-            bindTexture(&tex_J16UC4, J);
-        }
-    };
-    template <> struct Tex_J<4, float>
-    {
-        static __device__ __forceinline__ float4 read(float x, float y)
-        {
-            return tex2D(tex_Jf4, x, y);
-        }
-        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& J)
-        {
-            bindTexture(&tex_Jf4, J);
-        }
-    };
-
     __device__ __forceinline__ void accum(float& dst, const float& val)
     {
         dst += val;
@@ -364,8 +145,8 @@ namespace pyrlk
         }
     };
 
-    template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T>
-    __global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
+    template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T, class Ptr2D>
+    __global__ void sparseKernel(const Ptr2D texI, const Ptr2D texJ, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
     {
     #if __CUDA_ARCH__ <= 110
         const int BLOCK_SIZE = 128;
@@ -413,15 +194,14 @@ namespace pyrlk
                 float x = prevPt.x + xBase + 0.5f;
                 float y = prevPt.y + yBase + 0.5f;
 
-                I_patch[i][j] = Tex_I<cn, T>::read(x, y);
+                I_patch[i][j] = texI(y, x);
 
                 // Scharr Deriv
+                work_type dIdx = 3.0f * texI(y - 1, x + 1) + 10.0f * texI(y, x + 1) + 3.0f * texI(y + 1, x + 1) -
+                    (3.0f * texI(y - 1, x - 1) + 10.0f * texI(y, x - 1) + 3.0f * texI(y + 1, x - 1));
 
-                work_type dIdx = 3.0f * Tex_I<cn,T>::read(x+1, y-1) + 10.0f * Tex_I<cn, T>::read(x+1, y) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
-                                 (3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x-1, y) + 3.0f * Tex_I<cn,T>::read(x-1, y+1));
-
-                work_type dIdy = 3.0f * Tex_I<cn,T>::read(x-1, y+1) + 10.0f * Tex_I<cn, T>::read(x, y+1) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
-                                (3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x, y-1) + 3.0f * Tex_I<cn,T>::read(x+1, y-1));
+                work_type dIdy = 3.0f * texI(y + 1, x - 1) + 10.0f * texI(y + 1, x) + 3.0f * texI(y + 1, x + 1) -
+                    (3.0f * texI(y - 1, x - 1) + 10.0f * texI(y - 1, x) + 3.0f * texI(y - 1, x + 1));
 
                 dIdx_patch[i][j] = dIdx;
                 dIdy_patch[i][j] = dIdy;
@@ -490,7 +270,8 @@ namespace pyrlk
                 for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
                 {
                     work_type I_val = I_patch[i][j];
-                    work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
+
+                    work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
 
                     work_type diff = (J_val - I_val) * 32.0f;
 
@@ -533,7 +314,8 @@ namespace pyrlk
                 for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
                 {
                     work_type I_val = I_patch[i][j];
-                    work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
+
+                    work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
 
                     work_type diff = J_val - I_val;
 
@@ -749,6 +531,27 @@ namespace pyrlk
         }
     } // __global__ void sparseKernel_
 
+    // Primary template, used for non-float data: cudaFilterModeLinear on integer elements is only compatible with cudaReadModeNormalizedFloat.
+    template<int cn, class T> class TextureLinear : public cv::cudev::Texture<typename TypeVec<T, cn>::vec_type, typename TypeVec<float, cn>::vec_type> {
+    public:
+        typedef typename TypeVec<T, cn>::vec_type elem_type; // cn-channel vector of T, the element type of the source image
+        typedef typename TypeVec<float, cn>::vec_type ret_type; // cn-channel float vector, second template argument of the base Texture
+        __host__ TextureLinear(PtrStepSz<elem_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
+            cv::cudev::Texture<elem_type, ret_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeNormalizedFloat)
+        {
+        }
+    };
+
+    // Specialization for float data: cudaReadModeNormalizedFloat does not apply to float elements, so linear filtering is paired with cudaReadModeElementType.
+    template<int cn> class TextureLinear<cn, float> : public cv::cudev::Texture<typename TypeVec<float, cn>::vec_type, typename TypeVec<float, cn>::vec_type>
+    {
+    public:
+        typedef typename TypeVec<float, cn>::vec_type float_type; // cn-channel float vector, used as both template arguments of the base Texture
+        __host__ TextureLinear(PtrStepSz<float_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
+            cv::cudev::Texture <float_type, float_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeElementType)
+        {
+        }
+    };
 
     template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller
     {
@@ -756,16 +559,16 @@ namespace pyrlk
         static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
             int level, dim3 block, cudaStream_t stream)
         {
+            typedef typename TypeVec<T, cn>::vec_type dType; // cn-channel vector of T: element type of I and J
+            typedef typename TypeVec<float, cn>::vec_type rType; // cn-channel float vector: second argument of the TexturePtr passed to the kernel
+            TextureLinear<cn,T> texI(I); // texture wrapper over I, built with cudaFilterModeLinear; passed to the kernel as a TexturePtr
+            TextureLinear<cn,T> texJ(J); // texture wrapper over J, built with cudaFilterModeLinear; passed to the kernel as a TexturePtr
             dim3 grid(ptcount);
-            CV_UNUSED(I);
-            CV_UNUSED(J);
             if (level == 0 && err)
-                sparseKernel<cn, PATCH_X, PATCH_Y, true, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
+                sparseKernel<cn, PATCH_X, PATCH_Y, true, T, cv::cudev::TexturePtr<dType,rType>> << <grid, block, 0, stream >> > (texI, texJ, prevPts, nextPts, status, err, level, rows, cols); // calcErr = true only at level 0 with an err buffer
             else
-                sparseKernel<cn, PATCH_X, PATCH_Y, false, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
-
+                sparseKernel<cn, PATCH_X, PATCH_Y, false, T, cv::cudev::TexturePtr<dType, rType>> << <grid, block, 0, stream >> > (texI, texJ, prevPts, nextPts, status, err, level, rows, cols); // calcErr = false otherwise
             cudaSafeCall(cudaGetLastError());
-
             if (stream == 0)
                 cudaSafeCall(cudaDeviceSynchronize());
         }
@@ -903,8 +706,8 @@ namespace pyrlk
     };
 
 
-    template <bool calcErr>
-    __global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
+    template <bool calcErr, class Ptr2D>
+    __global__ void denseKernel(const Ptr2D texI, const Ptr2D texJ, PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
     {
         extern __shared__ int smem[];
 
@@ -925,15 +728,15 @@ namespace pyrlk
                 float x = xBase - c_halfWin_x + j + 0.5f;
                 float y = yBase - c_halfWin_y + i + 0.5f;
 
-                I_patch[i * patchWidth + j] = tex2D(tex_If, x, y);
+                I_patch[i * patchWidth + j] = texI(y, x);
 
                 // Scharr Deriv
 
-                dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x+1, y-1) + 10 * tex2D(tex_If, x+1, y) + 3 * tex2D(tex_If, x+1, y+1) -
-                                                (3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x-1, y) + 3 * tex2D(tex_If, x-1, y+1));
+                dIdx_patch[i * patchWidth + j] = 3 * texI(y - 1, x + 1) + 10 * texI(y, x + 1) + 3 * texI(y + 1, x + 1) -
+                    (3 * texI(y - 1, x - 1) + 10 * texI(y, x - 1) + 3 * texI(y + 1, x - 1));
 
-                dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x-1, y+1) + 10 * tex2D(tex_If, x, y+1) + 3 * tex2D(tex_If, x+1, y+1) -
-                                                (3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x, y-1) + 3 * tex2D(tex_If, x+1, y-1));
+                dIdy_patch[i * patchWidth + j] = 3 * texI(y + 1, x - 1) + 10 * texI(y + 1,x) + 3 * texI(y+ 1, x + 1) -
+                    (3 * texI(y - 1, x - 1) + 10 * texI(y - 1,x) + 3 * texI(y - 1, x + 1));
             }
         }
 
@@ -1004,7 +807,7 @@ namespace pyrlk
                 for (int j = 0; j < c_winSize_x; ++j)
                 {
                     int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
-                    int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
+                    int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
 
                     int diff = (J - I) * 32;
 
@@ -1040,7 +843,8 @@ namespace pyrlk
                 for (int j = 0; j < c_winSize_x; ++j)
                 {
                     int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
-                    int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
+
+                    int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
 
                     errval += ::abs(J - I);
                 }
@@ -1109,9 +913,6 @@ namespace pyrlk
                 { sparse_caller<cn, 1, 5,T>::call, sparse_caller<cn, 2, 5,T>::call, sparse_caller<cn, 3, 5,T>::call, sparse_caller<cn, 4, 5,T>::call, sparse_caller<cn, 5, 5,T>::call }
             };
 
-            Tex_I<cn, T>::bindTexture_(I);
-            Tex_J<cn, T>::bindTexture_(J);
-
             funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
                 level, block, stream);
         }
@@ -1119,9 +920,8 @@ namespace pyrlk
         {
             dim3 block(16, 16);
             dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
-            Tex_I<1, T>::bindTexture_(I);
-            Tex_J<1, T>::bindTexture_(J);
-
+            TextureLinear<1, T> texI(I);
+            TextureLinear<1, T> texJ(J);
             int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
             const int patchWidth = block.x + 2 * halfWin.x;
             const int patchHeight = block.y + 2 * halfWin.y;
@@ -1129,12 +929,12 @@ namespace pyrlk
 
             if (err.data)
             {
-                denseKernel<true> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, err, I.rows, I.cols);
+                denseKernel<true, cv::cudev::TexturePtr<T,float>> << <grid, block, smem_size, stream >> >(texI, texJ, u, v, prevU, prevV, err, I.rows, I.cols);
                 cudaSafeCall(cudaGetLastError());
             }
             else
             {
-                denseKernel<false> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
+                denseKernel<false, cv::cudev::TexturePtr<T, float>> << <grid, block, smem_size, stream >> >(texI, texJ, u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
                 cudaSafeCall(cudaGetLastError());
             }
 
diff --git a/modules/cudaoptflow/src/cuda/tvl1flow.cu b/modules/cudaoptflow/src/cuda/tvl1flow.cu
index 7ee7b36e096..cc73d463197 100644
--- a/modules/cudaoptflow/src/cuda/tvl1flow.cu
+++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu
@@ -46,6 +46,7 @@
 #include "opencv2/core/cuda/border_interpolate.hpp"
 #include "opencv2/core/cuda/limits.hpp"
 #include "opencv2/core/cuda.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -102,63 +103,8 @@ namespace tvl1flow
         }
     }
 
-    struct SrcTex
-    {
-        virtual ~SrcTex() {}
-
-        __device__ __forceinline__ virtual float I1(float x, float y) const = 0;
-        __device__ __forceinline__ virtual float I1x(float x, float y) const = 0;
-        __device__ __forceinline__ virtual float I1y(float x, float y) const = 0;
-    };
-
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
-    struct SrcTexRef : SrcTex
-    {
-        __device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
-        {
-            return tex2D(tex_I1, x, y);
-        }
-        __device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
-        {
-            return tex2D(tex_I1x, x, y);
-        }
-        __device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
-        {
-            return tex2D(tex_I1y, x, y);
-        }
-    };
-
-    struct SrcTexObj : SrcTex
-    {
-        __host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_)
-            : tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {}
-
-        __device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
-        {
-            return tex2D<float>(tex_obj_I1, x, y);
-        }
-        __device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
-        {
-            return tex2D<float>(tex_obj_I1x, x, y);
-        }
-        __device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
-        {
-            return tex2D<float>(tex_obj_I1y, x, y);
-        }
-
-        cudaTextureObject_t tex_obj_I1;
-        cudaTextureObject_t tex_obj_I1x;
-        cudaTextureObject_t tex_obj_I1y;
-    };
-
-    template <
-        typename T,
-        typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
-    >
     __global__ void warpBackwardKernel(
-        const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2,
+        const PtrStepSzf I0, const cv::cudev::TexturePtr<float> I1, const cv::cudev::TexturePtr<float> I1x, const cv::cudev::TexturePtr<float> I1y, const PtrStepf u1, const PtrStepf u2,
         PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -189,11 +135,9 @@ namespace tvl1flow
             for (int cx = xmin; cx <= xmax; ++cx)
             {
                 const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
-
-                sum  += w * src.I1(cx, cy);
-                sumx += w * src.I1x(cx, cy);
-                sumy += w * src.I1y(cx, cy);
-
+                sum  += w * I1(cy, cx);
+                sumx += w * I1x(cy, cx);
+                sumy += w * I1y(cy, cx);
                 wsum += w;
             }
         }
@@ -224,49 +168,19 @@ namespace tvl1flow
                       PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
                       cudaStream_t stream)
     {
+        cv::cudev::Texture<float> texI1(I1);
+        cv::cudev::Texture<float> texI1x(I1x);
+        cv::cudev::Texture<float> texI1y(I1y);
         const dim3 block(32, 8);
         const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
-
-        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
-
-        if (cc30)
-        {
-            cudaTextureDesc texDesc;
-            memset(&texDesc, 0, sizeof(texDesc));
-            texDesc.addressMode[0] = cudaAddressModeClamp;
-            texDesc.addressMode[1] = cudaAddressModeClamp;
-            texDesc.addressMode[2] = cudaAddressModeClamp;
-
-            cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0;
-
-            createTextureObjectPitch2D(&texObj_I1, I1, texDesc);
-            createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc);
-            createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc);
-
-            warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho);
-            cudaSafeCall(cudaGetLastError());
-
-            if (!stream)
-                cudaSafeCall(cudaDeviceSynchronize());
-            else
-                cudaSafeCall(cudaStreamSynchronize(stream));
-
-            cudaSafeCall(cudaDestroyTextureObject(texObj_I1));
-            cudaSafeCall(cudaDestroyTextureObject(texObj_I1x));
-            cudaSafeCall(cudaDestroyTextureObject(texObj_I1y));
-        }
-        else
-        {
-            bindTexture(&tex_I1, I1);
-            bindTexture(&tex_I1x, I1x);
-            bindTexture(&tex_I1y, I1y);
-
-            warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho);
-            cudaSafeCall(cudaGetLastError());
-
-            if (!stream)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
+        warpBackwardKernel<< <grid, block, 0, stream >> > (I0, texI1, texI1x, texI1y , u1, u2, I1w, I1wx, I1wy, grad, rho);
+        // A kernel launch returns no status; query it so launch failures are not silently dropped.
+        cudaSafeCall(cudaGetLastError());
+        // NOTE(review): on a non-default stream the kernel may still be in flight when the texture
+        // wrappers above go out of scope; the code this replaces synchronized the stream before
+        // destroying its texture objects. Confirm cv::cudev::Texture keeps this safe.
+        if (!stream)
+            cudaSafeCall(cudaDeviceSynchronize());
     }
 }
 
diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu
index 348556060d1..73df35ff63d 100644
--- a/modules/cudastereo/src/cuda/stereobm.cu
+++ b/modules/cudastereo/src/cuda/stereobm.cu
@@ -43,8 +43,10 @@
 #if !defined CUDA_DISABLER
 
 #include "opencv2/core/cuda/common.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 #include <limits.h>
 
+
 namespace cv { namespace cuda { namespace device
 {
     namespace stereobm
@@ -601,13 +603,12 @@ namespace cv { namespace cuda { namespace device
         /////////////////////////////////// Textureness filtering ////////////////////////////////////////
         //////////////////////////////////////////////////////////////////////////////////////////////////
 
-        texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
-
-        __device__ __forceinline__ float sobel(int x, int y)
+        __device__ __forceinline__ float sobel(cv::cudev::TexturePtr<uchar, float> texSrc, int x, int y) // returns the absolute value of a 3x3 Sobel sum of texSrc around (x, y); texSrc is called as (row, column)
         {
-            float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
-                         tex2D(texForTF, x - 1, y    ) * (-2) + tex2D(texForTF, x + 1, y    ) * (2) +
-                         tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
+            float conv = texSrc(y - 1, x - 1) * (-1) + texSrc(y - 1, x + 1) * (1) + // row y - 1: weights -1 / +1
+                texSrc(y, x - 1) * (-2) + texSrc(y, x + 1) * (2) + // row y: weights -2 / +2
+                texSrc(y + 1, x - 1) * (-1) + texSrc(y + 1, x + 1) * (1); // row y + 1: weights -1 / +1
+
             return fabs(conv);
         }
 
@@ -635,7 +636,7 @@ namespace cv { namespace cuda { namespace device
 
         #define RpT (2 * ROWSperTHREAD)  // got experimentally
 
-        __global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold)
+        __global__ void textureness_kernel(cv::cudev::TexturePtr<uchar,float> texSrc, PtrStepSzb disp, int winsz, float threshold)
         {
             int winsz2 = winsz/2;
             int n_dirty_pixels = (winsz2) * 2;
@@ -657,9 +658,9 @@ namespace cv { namespace cuda { namespace device
 
                 for(int i = y - winsz2; i <= y + winsz2; ++i)
                 {
-                    sum += sobel(x - winsz2, i);
+                    sum += sobel(texSrc, x - winsz2, i);
                     if (cols_extra)
-                        sum_extra += sobel(x + blockDim.x - winsz2, i);
+                        sum_extra += sobel(texSrc, x + blockDim.x - winsz2, i);
                 }
                 *cols = sum;
                 if (cols_extra)
@@ -675,12 +676,12 @@ namespace cv { namespace cuda { namespace device
 
                 for(int y = beg_row + 1; y < end_row; ++y)
                 {
-                    sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
+                    sum = sum - sobel(texSrc, x - winsz2, y - winsz2 - 1) + sobel(texSrc, x - winsz2, y + winsz2);
                     *cols = sum;
 
                     if (cols_extra)
                     {
-                        sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
+                        sum_extra = sum_extra - sobel(texSrc, x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(texSrc, x + blockDim.x - winsz2, y + winsz2);
                         *cols_extra = sum_extra;
                     }
 
@@ -697,28 +698,16 @@ namespace cv { namespace cuda { namespace device
         void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
         {
             avgTexturenessThreshold *= winsz * winsz;
-
-            texForTF.filterMode     = cudaFilterModeLinear;
-            texForTF.addressMode[0] = cudaAddressModeWrap;
-            texForTF.addressMode[1] = cudaAddressModeWrap;
-
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-            cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
-
+            cv::cudev::Texture<unsigned char, float> tex(input, false, cudaFilterModeLinear, cudaAddressModeWrap, cudaReadModeNormalizedFloat); // NOTE(review): the second argument looks like normalizedCoords == false; the CUDA runtime docs say wrap addressing is switched to clamp for unnormalized coordinates - confirm the intended border behaviour
             dim3 threads(128, 1, 1);
             dim3 grid(1, 1, 1);
-
             grid.x = divUp(input.cols, threads.x);
             grid.y = divUp(input.rows, RpT);
-
             size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
-            textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
+            textureness_kernel<<<grid, threads, smem_size, stream>>>(tex, disp, winsz, avgTexturenessThreshold); // the texture is now an explicit kernel argument instead of the file-scope texForTF reference
             cudaSafeCall( cudaGetLastError() );
-
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
-
-            cudaSafeCall( cudaUnbindTexture (texForTF) );
         }
     } // namespace stereobm
 }}} // namespace cv { namespace cuda { namespace cudev
diff --git a/modules/cudawarping/src/cuda/remap.cu b/modules/cudawarping/src/cuda/remap.cu
index 79f155ddfb9..8aeaef2d4e6 100644
--- a/modules/cudawarping/src/cuda/remap.cu
+++ b/modules/cudawarping/src/cuda/remap.cu
@@ -48,6 +48,7 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/filters.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 namespace cv { namespace cuda { namespace device
 {
@@ -108,88 +109,96 @@ namespace cv { namespace cuda { namespace device
             }
         };
 
-        #define OPENCV_CUDA_IMPLEMENT_REMAP_TEX(type) \
-            texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_remap_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                int xoff, yoff; \
-                tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
-                    PtrStepSz< type > dst, const float* borderValue, bool cc20) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(32, cc20 ? 8 : 4); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_remap_ ## type , srcWhole); \
-                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
-                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
-                    Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
-                    remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            }; \
-            template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
-                    PtrStepSz< type > dst, const float*, bool) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_remap_ ## type , srcWhole); \
-                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
-                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
-                    { \
-                        Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
-                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-                    } \
-                    else \
-                    { \
-                        BrdReplicate<type> brd(src.rows, src.cols); \
-                        BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
-                        Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
-                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-                    } \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar)
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar2)
-        OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar4)
-
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(schar)
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char2)
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char4)
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStreamTex // texture-backed remap launcher for an arbitrary border policy B; launches on the default stream and synchronizes the device
+        {
+            static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
+                PtrStepSz< T > dst, const float* borderValue, bool cc20)
+            {
+                typedef typename TypeVec<float, VecTraits< T >::cn>::vec_type work_type; // float vector with the same channel count as T
+                dim3 block(32, cc20 ? 8 : 4);
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+                if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) // same size as srcWhole: read the texture without the (xoff, yoff) shift
+                {
+                    cudev::Texture<T> texSrcWhole(srcWhole);
+                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                    BorderReader< cudev::TexturePtr<T>, B<work_type> > brdSrc(texSrcWhole, brd);
+                    Filter< BorderReader<cudev::TexturePtr<T>, B<work_type> > > filter_src(brdSrc);
+                    remap << <grid, block >> > (filter_src, mapx, mapy, dst);
 
-        OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort)
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort2)
-        OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort4)
+                }
+                else { // src differs in size from srcWhole: TextureOff receives (yoff, xoff), presumably to shift reads into the sub-region - confirm against cudev::TextureOff
+                    cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
+                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                    BorderReader< cudev::TextureOffPtr<T>, B<work_type> > brdSrc(texSrcWhole, brd);
+                    Filter< BorderReader<cudev::TextureOffPtr<T>, B<work_type> > > filter_src(brdSrc);
+                    remap << <grid, block >> > (filter_src, mapx, mapy, dst);
+                }
 
-        OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short)
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short2)
-        OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short4)
+                cudaSafeCall( cudaGetLastError() ); // kernel launches return no status, so query it explicitly
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
 
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int)
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int2)
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int4)
+        template <template <typename> class Filter, typename T> struct RemapDispatcherNonStreamTex<Filter, BrdReplicate, T> // replicate-border variant of the texture-backed remap launcher; borderValue and cc20 are unused
+        {
+            static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
+                PtrStepSz< T > dst, const float*, bool)
+            {
+                dim3 block(32, 8);
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+                if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) // same size as srcWhole: no BorderReader is used, presumably relying on the texture's own clamp addressing - confirm cudev::Texture's default address mode
+                {
+                    cudev::Texture<T> texSrcWhole(srcWhole);
+                    Filter<cudev::TexturePtr<T>> filter_src(texSrcWhole);
+                    remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
+                }
+                else // src differs in size from srcWhole: reads go through BrdReplicate sized to src, on a texture constructed with (yoff, xoff)
+                {
+                    cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
+                    BrdReplicate<T> brd(src.rows, src.cols);
+                    BorderReader< cudev::TextureOffPtr<T>, BrdReplicate<T> > brdSrc(texSrcWhole, brd);
+                    Filter< BorderReader< cudev::TextureOffPtr<T>, BrdReplicate<T> > > filter_src(brdSrc);
+                    remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
+                }
+                cudaSafeCall( cudaGetLastError() ); // kernel launches return no status, so query it explicitly
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
 
-        OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float)
-        //OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float2)
-        OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float4)
 
-        #undef OPENCV_CUDA_IMPLEMENT_REMAP_TEX
+        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar> :
+            RemapDispatcherNonStreamTex<Filter, B, uchar> {}; // the element types below (1- and 4-channel uchar, ushort, short, float) forward to the texture-backed launcher for any border policy B
+        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar4> :
+            RemapDispatcherNonStreamTex<Filter, B, uchar4> {};
+        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort> :
+            RemapDispatcherNonStreamTex<Filter, B, ushort> {};
+        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort4> :
+            RemapDispatcherNonStreamTex<Filter, B, ushort4> {};
+        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short> :
+            RemapDispatcherNonStreamTex<Filter, B, short> {};
+        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short4> :
+            RemapDispatcherNonStreamTex<Filter, B, short4> {};
+        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float> :
+            RemapDispatcherNonStreamTex<Filter, B, float> {};
+        template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float4> :
+            RemapDispatcherNonStreamTex<Filter, B, float4> {};
+
+        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar> :
+            RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar> {}; // same element types, specialized on BrdReplicate. NOTE(review): these look redundant with the generic-B forwards above, which would already reach the BrdReplicate launcher - confirm no other partial specialization requires them
+        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar4> :
+            RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar4> {};
+        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort> :
+            RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort> {};
+        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort4> :
+            RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort4> {};
+        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short> :
+            RemapDispatcherNonStreamTex<Filter, BrdReplicate, short> {};
+        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short4> :
+            RemapDispatcherNonStreamTex<Filter, BrdReplicate, short4> {};
+        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float> :
+            RemapDispatcherNonStreamTex<Filter, BrdReplicate, float> {};
+        template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float4> :
+            RemapDispatcherNonStreamTex<Filter, BrdReplicate, float4> {};
 
         template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
         {
@@ -239,32 +248,18 @@ namespace cv { namespace cuda { namespace device
         }
 
         template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
-        //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
         template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
         template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
-        //template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
         template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
     } // namespace imgproc
diff --git a/modules/cudawarping/src/cuda/resize.cu b/modules/cudawarping/src/cuda/resize.cu
index 7285a474870..1f5e75e2057 100644
--- a/modules/cudawarping/src/cuda/resize.cu
+++ b/modules/cudawarping/src/cuda/resize.cu
@@ -49,6 +49,7 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/filters.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 namespace cv { namespace cuda { namespace device
 {
@@ -105,7 +106,7 @@ namespace cv { namespace cuda { namespace device
         }
     }
 
-    template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
+    template <class Ptr2D, typename T> __global__ void resize(Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
     {
         const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
         const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -130,54 +131,6 @@ namespace cv { namespace cuda { namespace device
         }
     }
 
-    // textures
-
-    template <typename T> struct TextureAccessor;
-
-    #define OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(type) \
-        texture<type, cudaTextureType2D, cudaReadModeElementType> tex_resize_##type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-        template <> struct TextureAccessor<type> \
-        { \
-            typedef type elem_type; \
-            typedef int index_type; \
-            int xoff; \
-            int yoff; \
-            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-            { \
-                return tex2D(tex_resize_##type, x + xoff, y + yoff); \
-            } \
-            __host__ static void bind(const PtrStepSz<type>& mat) \
-            { \
-                bindTexture(&tex_resize_##type, mat); \
-            } \
-        };
-
-    OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar)
-    OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar4)
-
-    OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort)
-    OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort4)
-
-    OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short)
-    OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short4)
-
-    OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float)
-    OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float4)
-
-    #undef OPENCV_CUDA_IMPLEMENT_RESIZE_TEX
-
-    template <typename T>
-    TextureAccessor<T> texAccessor(const PtrStepSz<T>& mat, int yoff, int xoff)
-    {
-        TextureAccessor<T>::bind(mat);
-
-        TextureAccessor<T> t;
-        t.xoff = xoff;
-        t.yoff = yoff;
-
-        return t;
-    }
-
     // callers for nearest interpolation
 
     template <typename T>
@@ -194,14 +147,19 @@ namespace cv { namespace cuda { namespace device
     }
 
     template <typename T>
-    void call_resize_nearest_tex(const PtrStepSz<T>& /*src*/, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
+    void call_resize_nearest_tex(const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-        resize<<<grid, block>>>(texAccessor(srcWhole, yoff, xoff), dst, fy, fx);
+        if (xoff || yoff) { // non-zero offset: pass (yoff, xoff) to the offset texture wrapper built over srcWhole
+            cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
+            resize< cudev::TextureOffPtr<T> ><<<grid, block>>>(texSrcWhole, dst, fy, fx);
+        } // NOTE(review): texSrcWhole is destroyed here, before the cudaDeviceSynchronize() below - confirm cudev::Texture tolerates that
+        else { // no offset: plain texture over srcWhole
+            cudev::Texture<T> texSrcWhole(srcWhole);
+            resize< cudev::TexturePtr<T> ><<<grid, block>>>(texSrcWhole, dst, fy, fx);
+        }
         cudaSafeCall( cudaGetLastError() );
-
         cudaSafeCall( cudaDeviceSynchronize() );
     }
 
@@ -225,27 +183,21 @@ namespace cv { namespace cuda { namespace device
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
         if (srcWhole.data == src.data)
         {
-            TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
-            LinearFilter< TextureAccessor<T> > filteredSrc(texSrc);
-
-            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+            cudev::Texture<T> texSrc(src); // src starts at srcWhole's first element: texture built directly over src
+            LinearFilter< cudev::TexturePtr<T> > filteredSrc(texSrc);
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx); // NOTE(review): texSrc is destroyed at the end of this branch, before the sync below - confirm that is safe
         }
         else
         {
-            TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);
-
+            cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff); // texture over srcWhole, constructed with the (yoff, xoff) offset
             BrdReplicate<T> brd(src.rows, src.cols);
-            BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
-            LinearFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
-
-            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+            BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T> > brdSrc(texSrcWhole, brd); // border handler sized to src (rows, cols), not srcWhole
+            LinearFilter< BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
         }
-
         cudaSafeCall( cudaGetLastError() );
-
         cudaSafeCall( cudaDeviceSynchronize() );
     }
 
@@ -273,27 +225,21 @@ namespace cv { namespace cuda { namespace device
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
         if (srcWhole.data == src.data)
         {
-            TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
-            CubicFilter< TextureAccessor<T> > filteredSrc(texSrc);
-
-            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+            cudev::Texture<T> texSrc(src); // src starts at srcWhole's first element: texture built directly over src
+            CubicFilter< cudev::TexturePtr<T> > filteredSrc(texSrc);
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx); // NOTE(review): texSrc is destroyed at the end of this branch, before the sync below - confirm that is safe
         }
         else
         {
-            TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);
-
+            cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff); // texture over srcWhole, constructed with the (yoff, xoff) offset
             BrdReplicate<T> brd(src.rows, src.cols);
-            BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
-            CubicFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
-
-            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+            BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T> > brdSrc(texSrcWhole, brd); // border handler sized to src (rows, cols), not srcWhole
+            CubicFilter< BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
         }
-
         cudaSafeCall( cudaGetLastError() );
-
         cudaSafeCall( cudaDeviceSynchronize() );
     }
 
@@ -318,7 +264,7 @@ namespace cv { namespace cuda { namespace device
                 if (fx > 1 || fy > 1)
                     call_resize_nearest_glob(src, dst, fy, fx, 0);
                 else
-                    call_resize_nearest_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
+                    call_resize_nearest_tex(srcWhole, yoff, xoff, dst, fy, fx);
             }
         }
     };
@@ -389,7 +335,7 @@ namespace cv { namespace cuda { namespace device
         {
             if (stream)
                 call_resize_cubic_glob(src, dst, fy, fx, stream);
-            else
+            else
                 call_resize_cubic_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
         }
     };
diff --git a/modules/cudawarping/src/cuda/warp.cu b/modules/cudawarping/src/cuda/warp.cu
index 51da3d478f1..9b894f7630b 100644
--- a/modules/cudawarping/src/cuda/warp.cu
+++ b/modules/cudawarping/src/cuda/warp.cu
@@ -48,6 +48,7 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/filters.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 namespace cv { namespace cuda { namespace device
 {
@@ -196,86 +197,48 @@ namespace cv { namespace cuda { namespace device
             }
         };
 
-        #define OPENCV_CUDA_IMPLEMENT_WARP_TEX(type) \
-            texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_warp_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                int xoff, yoff; \
-                tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(32, cc20 ? 8 : 4); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_warp_ ## type , srcWhole); \
-                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
-                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
-                    Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
-                    warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            }; \
-            template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, const float warpMat[Transform::rows*3], bool) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_warp_ ## type , srcWhole); \
-                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
-                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
-                    { \
-                        Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
-                        warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
-                    } \
-                    else \
-                    { \
-                        BrdReplicate<type> brd(src.rows, src.cols); \
-                        BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
-                        Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
-                        warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
-                    } \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar)
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar2)
-        OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar4)
-
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(schar)
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(char2)
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(char4)
-
-        OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort)
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort2)
-        OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort4)
-
-        OPENCV_CUDA_IMPLEMENT_WARP_TEX(short)
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(short2)
-        OPENCV_CUDA_IMPLEMENT_WARP_TEX(short4)
-
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(int)
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(int2)
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(int4)
-
-        OPENCV_CUDA_IMPLEMENT_WARP_TEX(float)
-        //OPENCV_CUDA_IMPLEMENT_WARP_TEX(float2)
-        OPENCV_CUDA_IMPLEMENT_WARP_TEX(float4)
+        // Synchronous (non-stream) warp launch that reads srcWhole through a cudev texture; the offset variant is used when xoff or yoff is non-zero.
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStreamTex
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20)
+            {
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; // float vector with T's channel count
+                dim3 block(32, cc20 ? 8 : 4);
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); // sized to src; shared by both texture variants
+                if (xoff || yoff) {
+                    cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
+                    BorderReader< cudev::TextureOffPtr<T>, B<work_type> > brdSrc(texSrcWhole, brd);
+                    Filter< BorderReader< cudev::TextureOffPtr<T>, B<work_type> > > filter_src(brdSrc);
+                    warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
+                } // NOTE(review): texSrcWhole is destroyed here, before the cudaDeviceSynchronize() below - confirm cudev::Texture tolerates that
+                else {
+                    cudev::Texture<T> texSrcWhole(srcWhole);
+                    BorderReader< cudev::TexturePtr<T>, B<work_type> > brdSrc(texSrcWhole, brd);
+                    Filter< BorderReader< cudev::TexturePtr<T>, B<work_type> > > filter_src(brdSrc);
+                    warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
+                }
+                cudaSafeCall( cudaGetLastError() );
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
 
-        #undef OPENCV_CUDA_IMPLEMENT_WARP_TEX
+        template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar> :
+            WarpDispatcherNonStreamTex<Transform, Filter, B, uchar> {}; // this and the seven specializations after it forward the listed element types to WarpDispatcherNonStreamTex for any border mode B
+        template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar4> :
+            WarpDispatcherNonStreamTex<Transform, Filter, B, uchar4> {};
+        template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort> :
+            WarpDispatcherNonStreamTex<Transform, Filter, B, ushort> {};
+        template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort4> :
+            WarpDispatcherNonStreamTex<Transform, Filter, B, ushort4> {};
+        template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short> :
+            WarpDispatcherNonStreamTex<Transform, Filter, B, short> {};
+        template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short4> :
+            WarpDispatcherNonStreamTex<Transform, Filter, B, short4> {};
+        template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float> :
+            WarpDispatcherNonStreamTex<Transform, Filter, B, float> {};
+        template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float4> :
+            WarpDispatcherNonStreamTex<Transform, Filter, B, float4> {};
 
         template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
         {
@@ -330,32 +293,18 @@ namespace cv { namespace cuda { namespace device
         }
 
         template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
-        //template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
         template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
         template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
-        //template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
         template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
@@ -366,32 +315,18 @@ namespace cv { namespace cuda { namespace device
         }
 
         template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
-        //template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
         template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
         template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
-        //template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
         template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
     } // namespace imgproc
diff --git a/modules/cudawarping/test/test_precomp.hpp b/modules/cudawarping/test/test_precomp.hpp
index 1d80af7229b..acf983b582e 100644
--- a/modules/cudawarping/test/test_precomp.hpp
+++ b/modules/cudawarping/test/test_precomp.hpp
@@ -42,6 +42,8 @@
 #ifndef __OPENCV_TEST_PRECOMP_HPP__
 #define __OPENCV_TEST_PRECOMP_HPP__
 
+#include <thread>
+
 #include "opencv2/ts.hpp"
 #include "opencv2/ts/cuda_test.hpp"
 
diff --git a/modules/cudawarping/test/test_resize.cpp b/modules/cudawarping/test/test_resize.cpp
index 5822f87c037..768ad09f982 100644
--- a/modules/cudawarping/test/test_resize.cpp
+++ b/modules/cudawarping/test/test_resize.cpp
@@ -206,6 +206,60 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeSameAsHost, testing::Combine(
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_AREA)),
     WHOLE_SUBMAT));
 
+PARAM_TEST_CASE(ResizeTextures, cv::cuda::DeviceInfo, Interpolation) // fixture parameterised by (device, interpolation)
+{
+    cv::cuda::DeviceInfo devInfo; // device taken from test parameter 0
+    Interpolation interpolation; // interpolation mode taken from test parameter 1
+    // Reads both test parameters and selects devInfo's device via cv::cuda::setDevice.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        interpolation = GET_PARAM(1);
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+// Thread entry point for the ResizeTextures test: resizes imgIn into every preallocated image of imgsOut (to that image's own size) on stream.
+void ResizeThread(const Interpolation interp, const GpuMat& imgIn, const std::vector<GpuMat>& imgsOut, Stream& stream) {
+    for (size_t idx = 0; idx < imgsOut.size(); ++idx) cv::cuda::resize(imgIn, imgsOut[idx], imgsOut[idx].size(), 0, 0, interp, stream);
+}
+
+CUDA_TEST_P(ResizeTextures, Accuracy)
+{
+    // Several host threads call cv::cuda::resize at the same time; every result must match a resize issued afterwards from this thread.
+    constexpr int nThreads = 5, nIters = 5;
+    const Size szIn(100, 100), szOut(200, 200);
+    vector<Stream> streams(nThreads, cv::cuda::Stream::Null());
+    vector<GpuMat> imgsIn;
+    vector<vector<GpuMat>> imgsOut;
+    for (int t = 0; t < nThreads; ++t) {
+        imgsIn.push_back(GpuMat(szIn, CV_8UC1, t));
+        vector<GpuMat> outputs;
+        for (int k = 0; k < nIters; ++k)
+            outputs.push_back(GpuMat(szOut, CV_8UC1)); // a separately constructed destination per iteration
+        imgsOut.push_back(outputs);
+    }
+
+    vector<std::thread> workers;
+    for (int t = 0; t < nThreads; ++t) workers.push_back(std::thread(ResizeThread, interpolation, std::ref(imgsIn.at(t)), std::ref(imgsOut.at(t)), std::ref(streams.at(t))));
+    for (auto& worker : workers) worker.join();
+
+    // All workers have been joined: compute the reference result per input and compare bit-exactly.
+    for (int t = 0; t < nThreads; ++t) {
+        GpuMat reference;
+        cv::cuda::resize(imgsIn.at(t), reference, szOut, 0, 0, interpolation, streams.at(t));
+        Mat referenceHost; reference.download(referenceHost);
+        for (const auto& result : imgsOut.at(t)) {
+            Mat resultHost; result.download(resultHost);
+            ASSERT_TRUE(cv::norm(resultHost, referenceHost, NORM_INF) == 0);
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeTextures, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)))); // ResizeTextures runs for each ALL_DEVICES entry combined with each listed interpolation
+
 
 }} // namespace
 #endif // HAVE_CUDA
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp
index fdcc66ca2f7..0e9d69160dd 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp
@@ -1,147 +1,159 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma once
-
-#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_HPP
-#define OPENCV_CUDEV_PTR2D_TEXTURE_HPP
-
-#include <cstring>
-#include "../common.hpp"
-#include "glob.hpp"
-#include "gpumat.hpp"
-#include "traits.hpp"
-
-#if CUDART_VERSION >= 5050
-
-namespace
-{
-    template <typename T> struct CvCudevTextureRef
-    {
-        typedef texture<T, cudaTextureType2D, cudaReadModeElementType> TexRef;
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
 
-        static TexRef ref;
+#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
+#define OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
 
-        __host__ static void bind(const cv::cudev::GlobPtrSz<T>& mat,
-                                  bool normalizedCoords = false,
-                                  cudaTextureFilterMode filterMode = cudaFilterModePoint,
-                                  cudaTextureAddressMode addressMode = cudaAddressModeClamp)
-        {
-            ref.normalized = normalizedCoords;
-            ref.filterMode = filterMode;
-            ref.addressMode[0] = addressMode;
-            ref.addressMode[1] = addressMode;
-            ref.addressMode[2] = addressMode;
+#include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
+#include <opencv2/core/cuda_types.hpp>
+#include <opencv2/cudev/common.hpp>
+#include <opencv2/cudev/ptr2d/traits.hpp>
 
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
+/** \file texture.hpp
+*/
 
-            CV_CUDEV_SAFE_CALL( cudaBindTexture2D(0, &ref, mat.data, &desc, mat.cols, mat.rows, mat.step) );
+namespace cv {  namespace cudev {
+
+//! @addtogroup cudev
+//! @{
+
+    /** @brief Lightweight wrapper around a cudaTextureObject_t handle, intended to be passed to nvcc-compiled code.
+    * R is the type returned by a fetch. The two-argument call takes (y, x) and forwards to tex2D(x, y); the one-argument call uses tex1Dfetch.
+    */
+    template<class T, class R = T>
+    struct TexturePtr {
+        typedef R     elem_type, value_type;
+        typedef float index_type;
+        __host__ TexturePtr() {}; // NOTE(review): leaves tex uninitialised - confirm nothing fetches through a default-constructed object
+        __host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
+        __device__ __forceinline__ R operator ()(index_type y, index_type x) const { // 2D fetch; arguments are (y, x), swapped when forwarded
+            return tex2D<R>(tex, x, y);
+        }
+        __device__ __forceinline__ R operator ()(index_type x) const { // 1D fetch; presumably for textures created over linear memory - TODO confirm
+            return tex1Dfetch<R>(tex, x);
         }
+    private:
+        cudaTextureObject_t tex;
+    };
 
-        __host__ static void unbind()
-        {
-            cudaUnbindTexture(ref);
+    // textures are a maximum of 32 bits wide, so a 64 bit value is read as a uint2 (two 32 bit wide values) and reinterpreted as R
+    template <class R>
+    struct TexturePtr<uint64, R> {
+        typedef float index_type;
+        __host__ TexturePtr() {}; // NOTE(review): tex is left uninitialised here
+        __host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
+        __device__ __forceinline__ R operator ()(index_type y, index_type x) const { // 2D fetch; arguments are (y, x)
+            const uint2 retVal = tex2D<uint2>(tex, x, y);
+            return *(reinterpret_cast<const R*>(&retVal)); // assumes R has the same size as uint2 - TODO confirm
+        }
+        __device__ __forceinline__ R operator ()(index_type x) const { // 1D fetch via tex1Dfetch
+            const uint2 retVal = tex1Dfetch<uint2>(tex, x);
+            return *(reinterpret_cast<const R*>(&retVal));
         }
+    private:
+        cudaTextureObject_t tex;
     };
 
-    template <typename T>
-    typename CvCudevTextureRef<T>::TexRef CvCudevTextureRef<T>::ref;
-}
+    /** @brief Texture handle that adds a fixed (xoff, yoff) offset to the coordinates of every 2D fetch; R is the type returned by a fetch. */
+    template<class T, class R = T>
+    struct TextureOffPtr {
+        typedef R     elem_type;
+        typedef float index_type;
+        __host__ TextureOffPtr(const cudaTextureObject_t tex_, const int yoff_, const int xoff_) : tex(tex_), xoff(xoff_), yoff(yoff_) {} // initialisers listed in declaration order (-Wreorder)
+        __device__ __forceinline__ R operator ()(index_type y, index_type x) const { // arguments are (y, x); forwarded to tex2D as (x + xoff, y + yoff)
+            return tex2D<R>(tex, x + xoff, y + yoff);
+        }
+    private:
+        cudaTextureObject_t tex;
+        int xoff = 0, yoff = 0;
+    };
 
-#endif
+    /** @brief non-copyable smart CUDA texture object
+    *
+    * UniqueTexture is a smart non-sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
+    */
+    template<class T, class R = T>
+    class UniqueTexture {
+    public:
+        __host__ UniqueTexture() noexcept : tex(cudaTextureObject_t()) { } // start empty: the destructor compares tex against the null handle, so it must never be indeterminate
+        __host__ UniqueTexture(const UniqueTexture&) = delete; // sole owner of the handle, hence not copyable
+        __host__ UniqueTexture(UniqueTexture&& other) noexcept {
+            tex = other.tex; // take over other's handle ...
+            other.tex = cudaTextureObject_t(); // ... and leave other empty so that its destructor does nothing
+        }
 
-namespace cv { namespace cudev {
+        __host__ UniqueTexture(const int rows, const int cols, T* data, const size_t step, const bool normalizedCoords = false,
+            const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
+            const cudaTextureReadMode readMode = cudaReadModeElementType)
+        {
+            create(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode);
+        }
 
-//! @addtogroup cudev
-//! @{
+        __host__ UniqueTexture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+            const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType)
+        {
+            create(1, static_cast<int>(sizeInBytes/sizeof(T)), data, sizeInBytes, normalizedCoords, filterMode, addressMode, readMode);
+        }
 
-#if CUDART_VERSION >= 5050
+        __host__ ~UniqueTexture() {
+            if (tex != cudaTextureObject_t()) {
+                try {
+                    CV_CUDEV_SAFE_CALL(cudaDestroyTextureObject(tex));
+                }
+                catch (const cv::Exception& ex) {
+                    std::ostringstream os;
+                    os << "Exception caught during CUDA texture object destruction.\n";
+                    os << ex.what();
+                    os << "Exception will be ignored.\n";
+                    CV_LOG_WARNING(0, os.str().c_str());
+                }
+            }
 
-template <typename T> struct TexturePtr
-{
-    typedef T     value_type;
-    typedef float index_type;
+        }
 
-    cudaTextureObject_t texObj;
+        __host__ UniqueTexture& operator=(const UniqueTexture&) = delete;
+        __host__ UniqueTexture& operator=(UniqueTexture&& other) noexcept {
+            // No CV_Assert(other) here: a failed assertion throws, which inside a noexcept function ends in std::terminate. An empty source is accepted.
+            if (&other != this) {
+                UniqueTexture(std::move(*this)); /* the temporary takes over and destroys the current texture object */
+                tex = other.tex; // may be the null handle, in which case *this simply ends up empty
+                other.tex = cudaTextureObject_t();
+            }
+            return *this;
+        }
 
-    __device__ __forceinline__ T operator ()(float y, float x) const
-    {
-    #if CV_CUDEV_ARCH < 300
-        // Use the texture reference
-        return tex2D(CvCudevTextureRef<T>::ref, x, y);
-    #else
-        // Use the texture object
-        return tex2D<T>(texObj, x, y);
-    #endif
-    }
-};
-
-template <typename T> struct Texture : TexturePtr<T>
-{
-    int rows, cols;
-    bool cc30;
-
-    __host__ explicit Texture(const GlobPtrSz<T>& mat,
-                              bool normalizedCoords = false,
-                              cudaTextureFilterMode filterMode = cudaFilterModePoint,
-                              cudaTextureAddressMode addressMode = cudaAddressModeClamp)
-    {
-        cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
+        __host__ cudaTextureObject_t get() const noexcept {
+            CV_Assert(tex);
+            return tex;
+        }
+
+        __host__ explicit operator bool() const noexcept { return tex != cudaTextureObject_t(); }
 
-        rows = mat.rows;
-        cols = mat.cols;
+    private:
 
-        if (cc30)
+        template <class T1>
+        __host__ void create(const int rows, const int cols, T1* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
+            const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
         {
-            // Use the texture object
             cudaResourceDesc texRes;
             std::memset(&texRes, 0, sizeof(texRes));
-            texRes.resType = cudaResourceTypePitch2D;
-            texRes.res.pitch2D.devPtr = mat.data;
-            texRes.res.pitch2D.height = mat.rows;
-            texRes.res.pitch2D.width = mat.cols;
-            texRes.res.pitch2D.pitchInBytes = mat.step;
-            texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+            if (rows == 1) {
+                CV_Assert(rows == 1 && cols*sizeof(T) == step);
+                texRes.resType = cudaResourceTypeLinear;
+                texRes.res.linear.devPtr = data;
+                texRes.res.linear.sizeInBytes = step;
+                texRes.res.linear.desc = cudaCreateChannelDesc<T1>();
+            }
+            else {
+                texRes.resType = cudaResourceTypePitch2D;
+                texRes.res.pitch2D.devPtr = data;
+                texRes.res.pitch2D.height = rows;
+                texRes.res.pitch2D.width = cols;
+                texRes.res.pitch2D.pitchInBytes = step;
+                texRes.res.pitch2D.desc = cudaCreateChannelDesc<T1>();
+            }
 
             cudaTextureDesc texDescr;
             std::memset(&texDescr, 0, sizeof(texDescr));
@@ -150,109 +162,112 @@ template <typename T> struct Texture : TexturePtr<T>
             texDescr.addressMode[0] = addressMode;
             texDescr.addressMode[1] = addressMode;
             texDescr.addressMode[2] = addressMode;
-            texDescr.readMode = cudaReadModeElementType;
+            texDescr.readMode = readMode;
 
-            CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
+            CV_CUDEV_SAFE_CALL(cudaCreateTextureObject(&tex, &texRes, &texDescr, 0));
         }
-        else
+
+        __host__ void create(const int rows, const int cols, uint64* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
+            const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
         {
-            // Use the texture reference
-            CvCudevTextureRef<T>::bind(mat, normalizedCoords, filterMode, addressMode);
+            create<uint2>(rows, cols, (uint2*)data, step, normalizedCoords, filterMode, addressMode, readMode);
         }
-    }
 
-    __host__ ~Texture()
-    {
-        if (cc30)
+    private:
+        cudaTextureObject_t tex;
+    };
+
+    /** @brief sharable smart CUDA texture object
+    *
+    * Texture is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
+    */
+    template<class T, class R = T>
+    class Texture {
+    public:
+        Texture() = default;
+        Texture(const Texture&) = default;
+        Texture(Texture&&) = default;
+
+        __host__ Texture(const int rows_, const int cols_, T* data, const size_t step, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+            const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+            rows(rows_), cols(cols_), texture(std::make_shared<UniqueTexture<T,R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode))
         {
-            // Use the texture object
-            cudaDestroyTextureObject(this->texObj);
         }
-        else
+
+        __host__ Texture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+            const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+            rows(1), cols(static_cast<int>(sizeInBytes/sizeof(T))), texture(std::make_shared<UniqueTexture<T, R>>(sizeInBytes, data, normalizedCoords, filterMode, addressMode, readMode))
         {
-            // Use the texture reference
-            CvCudevTextureRef<T>::unbind();
         }
-    }
-};
 
-template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
-{
-};
+        __host__ Texture(PtrStepSz<T> src, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+            const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+            Texture(src.rows, src.cols, src.data, src.step, normalizedCoords, filterMode, addressMode, readMode)
+        {
+        }
 
-#else
+        Texture& operator=(const Texture&) = default;
+        Texture& operator=(Texture&&) = default;
 
-template <typename T> struct TexturePtr
-{
-    typedef T     value_type;
-    typedef float index_type;
+        // True only when a UniqueTexture is attached and that UniqueTexture itself tests true.
+        __host__ explicit operator bool() const noexcept {
+            const bool attached = static_cast<bool>(texture);
+            return attached && static_cast<bool>(*texture);
+        }
 
-    cudaTextureObject_t texObj;
+        // Builds the TexturePtr handed to device code; wraps a value-initialised handle when no UniqueTexture is attached.
+        __host__ operator TexturePtr<T, R>() const {
+            cudaTextureObject_t handle = cudaTextureObject_t();
+            if (texture) handle = texture->get();
+            return TexturePtr<T, R>(handle);
+        }
 
-    __device__ __forceinline__ T operator ()(float y, float x) const
-    {
-    #if CV_CUDEV_ARCH >= 300
-        // Use the texture object
-        return tex2D<T>(texObj, x, y);
-    #else
-        CV_UNUSED(y);
-        CV_UNUSED(x);
-        return T();
-    #endif
-    }
-};
-
-template <typename T> struct Texture : TexturePtr<T>
-{
-    int rows, cols;
-
-    __host__ explicit Texture(const GlobPtrSz<T>& mat,
-                              bool normalizedCoords = false,
-                              cudaTextureFilterMode filterMode = cudaFilterModePoint,
-                              cudaTextureAddressMode addressMode = cudaAddressModeClamp)
-    {
-        CV_Assert( deviceSupports(FEATURE_SET_COMPUTE_30) );
-
-        rows = mat.rows;
-        cols = mat.cols;
-
-        // Use the texture object
-        cudaResourceDesc texRes;
-        std::memset(&texRes, 0, sizeof(texRes));
-        texRes.resType = cudaResourceTypePitch2D;
-        texRes.res.pitch2D.devPtr = mat.data;
-        texRes.res.pitch2D.height = mat.rows;
-        texRes.res.pitch2D.width = mat.cols;
-        texRes.res.pitch2D.pitchInBytes = mat.step;
-        texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
-
-        cudaTextureDesc texDescr;
-        std::memset(&texDescr, 0, sizeof(texDescr));
-        texDescr.normalizedCoords = normalizedCoords;
-        texDescr.filterMode = filterMode;
-        texDescr.addressMode[0] = addressMode;
-        texDescr.addressMode[1] = addressMode;
-        texDescr.addressMode[2] = addressMode;
-        texDescr.readMode = cudaReadModeElementType;
-
-        CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
-    }
-
-    __host__ ~Texture()
+        int rows = 0;
+        int cols = 0;
+
+    protected:
+        std::shared_ptr<UniqueTexture<T, R>> texture = 0;
+    };
+
+    template <typename T, typename R> struct PtrTraits< Texture<T, R> > : PtrTraitsBase<Texture<T, R>, TexturePtr<T, R> >
     {
-        // Use the texture object
-        cudaDestroyTextureObject(this->texObj);
-    }
-};
+    };
 
-template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
-{
-};
 
-#endif
+    /** @brief sharable smart CUDA texture object with offset
+    * TextureOff is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
+    */
+    template<class T, class R = T>
+    class TextureOff {
+    public:
+        TextureOff(const TextureOff&) = default;
+        TextureOff(TextureOff&&) = default;
+
+        // Creates a texture over the rows x cols image at data (step is passed on as the row pitch in bytes) and stores the (yoff_, xoff_) fetch offset.
+        __host__ TextureOff(const int rows, const int cols, T* data, const size_t step, const int yoff_ = 0, const int xoff_ = 0, const bool normalizedCoords = false,
+            const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
+            const cudaTextureReadMode readMode = cudaReadModeElementType) :
+            xoff(xoff_), yoff(yoff_), texture(std::make_shared<UniqueTexture<T, R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode)) // declaration order (-Wreorder)
+        { }
 
-//! @}
+        __host__ TextureOff(PtrStepSz<T> src, const int yoff = 0, const int xoff = 0, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+            const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+            TextureOff(src.rows, src.cols, src.data, src.step, yoff, xoff, normalizedCoords, filterMode, addressMode, readMode)
+        {
+        }
+
+        TextureOff& operator=(const TextureOff&) = default;
+        TextureOff& operator=(TextureOff&&) = default;
 
+        __host__ operator TextureOffPtr<T, R>() const { // a moved-from object (null texture) yields a null handle, as Texture does, instead of dereferencing a null pointer
+            return TextureOffPtr<T, R>(texture ? texture->get() : cudaTextureObject_t(), yoff, xoff);
+        }
+
+    private:
+        int xoff = 0;
+        int yoff = 0;
+        std::shared_ptr<UniqueTexture<T, R>> texture = 0;
+    };
 }}
 
 #endif
diff --git a/modules/xfeatures2d/src/cuda/surf.cu b/modules/xfeatures2d/src/cuda/surf.cu
index 2ebd6ff0833..007a1f60d0e 100644
--- a/modules/xfeatures2d/src/cuda/surf.cu
+++ b/modules/xfeatures2d/src/cuda/surf.cu
@@ -51,6 +51,7 @@
 #include "opencv2/core/cuda/utility.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/filters.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 namespace cv { namespace cuda { namespace device
 {
@@ -59,23 +60,19 @@ namespace cv { namespace cuda { namespace device
         void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
         void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
 
-        void bindImgTex(PtrStepSzb img);
-        size_t bindSumTex(PtrStepSz<unsigned int> sum);
-        size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
-
-        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
+        void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
             int octave, int nOctaveLayer);
 
-        void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+        void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
             int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
 
         void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
             float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
             unsigned int* featureCounter);
 
-        void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
+        void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
 
-        void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
+        void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
     }
 }}}
 
@@ -121,34 +118,8 @@ namespace cv { namespace cuda { namespace device
             cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
         }
 
-        ////////////////////////////////////////////////////////////////////////
-        // Integral image texture
-
-        texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-        texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-        texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-        void bindImgTex(PtrStepSzb img)
-        {
-            bindTexture(&imgTex, img);
-        }
-
-        size_t bindSumTex(PtrStepSz<uint> sum)
-        {
-            size_t offset;
-            cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
-            cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
-            return offset / sizeof(uint);
-        }
-        size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
-        {
-            size_t offset;
-            cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
-            cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
-            return offset / sizeof(uint);
-        }
 
-        template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
+        template <int N> __device__ float icvCalcHaarPatternSum(cudev::TexturePtr<unsigned int> texSum, const float src[][5], int oldSize, int newSize, int y, int x)
         {
         #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
             typedef double real_t;
@@ -169,10 +140,10 @@ namespace cv { namespace cuda { namespace device
                 int dy2 = __float2int_rn(ratio * src[k][3]);
 
                 real_t t = 0;
-                t += tex2D(sumTex, x + dx1, y + dy1);
-                t -= tex2D(sumTex, x + dx1, y + dy2);
-                t -= tex2D(sumTex, x + dx2, y + dy1);
-                t += tex2D(sumTex, x + dx2, y + dy2);
+                t += texSum(y + dy1, x + dx1);
+                t -= texSum(y + dy2, x + dx1);
+                t -= texSum(y + dy1, x + dx2);
+                t += texSum(y + dy2, x + dx2);
 
                 d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
             }
@@ -201,7 +172,7 @@ namespace cv { namespace cuda { namespace device
             return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
         }
 
-        __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
+        __global__ void icvCalcLayerDetAndTrace(cudev::TexturePtr<unsigned int> texSum, PtrStepf det, PtrStepf trace)
         {
             // Determine the indices
             const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
@@ -222,29 +193,29 @@ namespace cv { namespace cuda { namespace device
 
             if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
             {
-                const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
-                const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
-                const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
+                const float dx  = icvCalcHaarPatternSum<3>(texSum, c_DX , 9, size, (i << c_octave), (j << c_octave));
+                const float dy  = icvCalcHaarPatternSum<3>(texSum, c_DY , 9, size, (i << c_octave), (j << c_octave));
+                const float dxy = icvCalcHaarPatternSum<4>(texSum, c_DXY, 9, size, (i << c_octave), (j << c_octave));
 
                 det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
                 trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
             }
         }
 
-        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
+        void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
             int octave, int nOctaveLayers)
         {
             const int min_size = calcSize(octave, 0);
             const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
             const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
-
+            cudev::Texture<unsigned int> texSum(sum);
             dim3 threads(16, 16);
 
             dim3 grid;
             grid.x = divUp(max_samples_j, threads.x);
             grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
 
-            icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
+            icvCalcLayerDetAndTrace<<<grid, threads>>>(texSum, det, trace);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -255,10 +226,14 @@ namespace cv { namespace cuda { namespace device
 
         __constant__ float c_DM[5] = {0, 0, 9, 9, 1};
 
-        struct WithMask
+        template<bool useMask = true>
+        struct Mask
         {
-            static __device__ bool check(int sum_i, int sum_j, int size)
+            __host__ Mask(){};
+            __host__ Mask(cudev::TexturePtr<unsigned int> tex_): tex(tex_) {};
+            __device__ bool check(int sum_i, int sum_j, int size)
             {
+                if (!useMask) return true;
                 float ratio = (float)size / 9.0f;
 
                 float d = 0;
@@ -269,19 +244,20 @@ namespace cv { namespace cuda { namespace device
                 int dy2 = __float2int_rn(ratio * c_DM[3]);
 
                 float t = 0;
-                t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
-                t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
-                t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
-                t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
+                t += tex(sum_i + dy1, sum_j + dx1);
+                t -= tex(sum_i + dy2, sum_j + dx1);
+                t -= tex(sum_i + dy1, sum_j + dx2);
+                t += tex(sum_i + dy2, sum_j + dx2);
 
                 d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
 
                 return (d >= 0.5f);
             }
+            cudev::TexturePtr<unsigned int> tex;
         };
 
-        template <typename Mask>
-        __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
+        template<class T>
+        __global__ void icvFindMaximaInLayer(T mask, const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
             unsigned int* maxCounter)
         {
             #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
@@ -323,7 +299,7 @@ namespace cv { namespace cuda { namespace device
                     const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
                     const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
 
-                    if (Mask::check(sum_i, sum_j, size))
+                    if (mask.check(sum_i, sum_j, size))
                     {
                         // Check to see if we have a max (in its 26 neighbours)
                         const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
@@ -374,7 +350,7 @@ namespace cv { namespace cuda { namespace device
             #endif
         }
 
-        void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+        void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
             int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
         {
             const int layer_rows = img_rows >> octave;
@@ -390,10 +366,15 @@ namespace cv { namespace cuda { namespace device
 
             const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
 
-            if (use_mask)
-                icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
-            else
-                icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
+            if (use_mask) {
+                cudev::Texture<unsigned int> texMaskSum(maskSum);
+                icvFindMaximaInLayer<<<grid, threads, smem_size>>>(Mask<true>(texMaskSum), det, trace, maxPosBuffer, maxCounter);
+                cudaSafeCall( cudaGetLastError() );
+                // The launch is asynchronous and texMaskSum goes out of scope at the closing brace: wait for the kernel first.
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+            else
+                icvFindMaximaInLayer<<<grid, threads, smem_size>>>(Mask<false>(), det, trace, maxPosBuffer, maxCounter);
 
             cudaSafeCall( cudaGetLastError() );
 
@@ -539,7 +520,7 @@ namespace cv { namespace cuda { namespace device
         __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
         __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
 
-        __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
+        __global__ void icvCalcOrientation(cudev::TexturePtr<unsigned int> texSum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
         {
             __shared__ float s_X[128];
             __shared__ float s_Y[128];
@@ -576,8 +557,8 @@ namespace cv { namespace cuda { namespace device
                 if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
                     x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
                 {
-                    X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
-                    Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
+                    X = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NX, 4, grad_wav_size, y, x);
+                    Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NY, 4, grad_wav_size, y, x);
 
                     angle = atan2f(Y, X);
                     if (angle < 0)
@@ -676,8 +657,9 @@ namespace cv { namespace cuda { namespace device
         #undef ORI_WIN
         #undef ORI_SAMPLES
 
-        void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
+        void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
         {
+            cudev::Texture<unsigned int> texSum(sum);
             dim3 threads;
             threads.x = 32;
             threads.y = 4;
@@ -685,7 +667,7 @@ namespace cv { namespace cuda { namespace device
             dim3 grid;
             grid.x = nFeatures;
 
-            icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
+            icvCalcOrientation<<<grid, threads>>>(texSum, featureX, featureY, featureSize, featureDir);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -724,12 +706,14 @@ namespace cv { namespace cuda { namespace device
         {
             typedef uchar elem_type;
 
+            __device__ WinReader(cudev::TexturePtr<uchar> tex_) : tex(tex_) {};
+
             __device__ __forceinline__ uchar operator ()(int i, int j) const
             {
                 float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
                 float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
 
-                return tex2D(imgTex, pixel_x, pixel_y);
+                return tex(pixel_y, pixel_x);
             }
 
             float centerX;
@@ -739,19 +723,17 @@ namespace cv { namespace cuda { namespace device
             float sin_dir;
             int width;
             int height;
+            cudev::TexturePtr<uchar> tex;
         };
 
-        __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
-                                   float& dx, float& dy);
-
-        __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
+        __device__ void calc_dx_dy(cudev::TexturePtr<uchar> tex, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
                                    float& dx, float& dy)
         {
             __shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
 
             dx = dy = 0.0f;
 
-            WinReader win;
+            WinReader win(tex);
 
             win.centerX = featureX[blockIdx.x];
             win.centerY = featureY[blockIdx.x];
@@ -813,14 +795,14 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        __global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
+        __global__ void compute_descriptors_64(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
         {
             __shared__ float smem[32 * 16];
 
             float* sRow = smem + threadIdx.y * 32;
 
             float dx, dy;
-            calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
+            calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
 
             float dxabs = ::fabsf(dx);
             float dyabs = ::fabsf(dy);
@@ -839,14 +821,14 @@ namespace cv { namespace cuda { namespace device
                 *descriptors_block = make_float4(dx, dy, dxabs, dyabs);
         }
 
-        __global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
+        __global__ void compute_descriptors_128(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
         {
             __shared__ float smem[32 * 16];
 
             float* sRow = smem + threadIdx.y * 32;
 
             float dx, dy;
-            calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
+            calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
 
             float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
 
@@ -925,13 +907,13 @@ namespace cv { namespace cuda { namespace device
             descriptor_base[threadIdx.x] = val / s_len;
         }
 
-        void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
+        void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
         {
             // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
-
+            cudev::Texture<unsigned char> texImg(img);
             if (descriptors.cols == 64)
             {
-                compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
+                compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
                 cudaSafeCall( cudaGetLastError() );
 
                 cudaSafeCall( cudaDeviceSynchronize() );
@@ -943,7 +925,7 @@ namespace cv { namespace cuda { namespace device
             }
             else
             {
-                compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
+                compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
                 cudaSafeCall( cudaGetLastError() );
 
                 cudaSafeCall( cudaDeviceSynchronize() );
diff --git a/modules/xfeatures2d/src/surf.cuda.cpp b/modules/xfeatures2d/src/surf.cuda.cpp
index 7864a166a37..b64016b013f 100644
--- a/modules/xfeatures2d/src/surf.cuda.cpp
+++ b/modules/xfeatures2d/src/surf.cuda.cpp
@@ -94,23 +94,19 @@ namespace cv { namespace cuda { namespace device
         void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
         void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
 
-        void bindImgTex(PtrStepSzb img);
-        size_t bindSumTex(PtrStepSz<unsigned int> sum);
-        size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
-
-        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
+        void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
             int octave, int nOctaveLayer);
 
-        void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+        void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
             int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
 
         void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
             float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
             unsigned int* featureCounter);
 
-        void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
+        void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
 
-        void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
+        void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
     }
 }}}
 
@@ -138,10 +134,7 @@ namespace
     class SURF_CUDA_Invoker
     {
     public:
-        SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img, const GpuMat& mask) :
-            surf_(surf),
-            img_cols(img.cols), img_rows(img.rows),
-            use_mask(!mask.empty())
+        SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img_, const GpuMat& mask) : surf_(surf), img(img_), img_cols(img_.cols), img_rows(img_.rows), use_mask(!mask.empty())
         {
             CV_Assert(!img.empty() && img.type() == CV_8UC1);
             CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@@ -167,16 +160,12 @@ namespace
 
             loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
 
-            bindImgTex(img);
-
             cuda::integral(img, surf_.sum);
-            sumOffset = bindSumTex(surf_.sum);
 
             if (use_mask)
             {
                 cuda::min(mask, 1.0, surf_.mask1);
                 cuda::integral(surf_.mask1, surf_.maskSum);
-                maskOffset = bindMaskSumTex(surf_.maskSum);
             }
         }
 
@@ -195,9 +184,9 @@ namespace
                 const int layer_cols = img_cols >> octave;
                 loadOctaveConstants(octave, layer_rows, layer_cols);
 
-                icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
+                icvCalcLayerDetAndTrace_gpu(surf_.sum, surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
 
-                icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
+                icvFindMaximaInLayer_gpu(surf_.maskSum, surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
                     img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
 
                 unsigned int maxCounter;
@@ -230,7 +219,7 @@ namespace
             const int nFeatures = keypoints.cols;
             if (nFeatures > 0)
             {
-                icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
+                icvCalcOrientation_gpu(surf_.sum, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
                     keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
             }
         }
@@ -241,7 +230,7 @@ namespace
             if (nFeatures > 0)
             {
                 ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
-                compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
+                compute_descriptors_gpu(img, descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
                     keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
             }
         }
@@ -252,6 +241,8 @@ namespace
 
         SURF_CUDA& surf_;
 
+        GpuMat img;
+
         int img_cols, img_rows;
 
         bool use_mask;
@@ -259,9 +250,6 @@ namespace
         int maxCandidates;
         int maxFeatures;
 
-        size_t maskOffset;
-        size_t sumOffset;
-
         GpuMat counters;
     };
 }