From ca2f2b4259ca076e265f86e7bfc0e524ec7d41ee Mon Sep 17 00:00:00 2001
From: wujunkai166
Date: Wed, 22 Jun 2016 15:42:54 +0800
Subject: [PATCH] Introduce a virtual buffer pointer mapping mechanism for OCL
 memory objects.

We introduce a virtual buffer pointer mapping mechanism to unify the CUDA
and OCL code paths into one style. This lets us remove a lot of duplicated
code from the layer implementations. (Usage sketches of the new ClState API
follow the patch text below.)
---
 include/caffe/blob.hpp | 6 -
 include/caffe/common.hpp | 6 +
 include/caffe/greentea/greentea_im2col.hpp | 64 -
 .../greentea/greentea_math_functions.hpp | 163 --
 include/caffe/layer.hpp | 37 +-
 include/caffe/layers/base_conv_layer.hpp | 83 -
 include/caffe/util/cl_state.hpp | 61 +
 include/caffe/util/device_alternate.hpp | 2 +
 include/caffe/util/math_functions.hpp | 7 +-
 src/caffe/blob.cpp | 121 +-
 src/caffe/common.cpp | 4 +
 src/caffe/greentea/greentea_im2col.cpp | 360 +--
 .../greentea/greentea_math_functions.cpp | 2511 +++++++++++------
 src/caffe/greentea/libdnn.cpp | 65 +-
 src/caffe/layers/absval_layer.cu | 31 +-
 src/caffe/layers/base_conv_layer.cpp | 148 +-
 src/caffe/layers/base_data_layer.cu | 50 +-
 src/caffe/layers/batch_norm_layer.cu | 507 +---
 src/caffe/layers/batch_reindex_layer.cu | 35 +-
 src/caffe/layers/bias_layer.cu | 49 +-
 src/caffe/layers/bnll_layer.cu | 22 +-
 src/caffe/layers/concat_layer.cu | 17 +-
 src/caffe/layers/contrastive_loss_layer.cu | 62 +-
 src/caffe/layers/conv_layer.cu | 4 +-
 src/caffe/layers/conv_layer_fft.cu | 26 +-
 src/caffe/layers/conv_layer_spatial.cu | 59 +-
 src/caffe/layers/crop_layer.cu | 30 +-
 src/caffe/layers/deconv_layer.cu | 4 +-
 src/caffe/layers/dropout_layer.cu | 39 +-
 src/caffe/layers/eltwise_layer.cu | 109 +-
 src/caffe/layers/elu_layer.cu | 23 +-
 src/caffe/layers/embed_layer.cu | 44 +-
 src/caffe/layers/euclidean_loss_layer.cu | 57 +-
 src/caffe/layers/exp_layer.cu | 56 +-
 src/caffe/layers/filter_layer.cu | 81 +-
 src/caffe/layers/im2col_layer.cu | 143 +-
 src/caffe/layers/inner_product_layer.cu | 87 +-
 src/caffe/layers/log_layer.cu | 103 +-
 src/caffe/layers/lrn_layer.cu | 38 +-
 src/caffe/layers/lstm_unit_layer.cu | 56 +-
 src/caffe/layers/mergecrop_layer.cu | 56 +-
 src/caffe/layers/mvn_layer.cu | 252 +-
 src/caffe/layers/pooling_layer.cu | 228 +-
 src/caffe/layers/power_layer.cu | 196 +-
 src/caffe/layers/prelu_layer.cu | 83 +-
 src/caffe/layers/recurrent_layer.cu | 17 +-
 src/caffe/layers/reduction_layer.cu | 183 +-
 src/caffe/layers/relu_layer.cu | 20 +-
 src/caffe/layers/scale_layer.cu | 296 +-
 .../sigmoid_cross_entropy_loss_layer.cu | 33 +-
 src/caffe/layers/sigmoid_layer.cu | 20 +-
 src/caffe/layers/silence_layer.cu | 24 +-
 src/caffe/layers/slice_layer.cu | 16 +-
 src/caffe/layers/softmax_layer.cu | 57 +-
 src/caffe/layers/softmax_loss_layer.cu | 122 +-
 src/caffe/layers/split_layer.cu | 44 +-
 src/caffe/layers/tanh_layer.cu | 21 +-
 src/caffe/layers/threshold_layer.cu | 10 +-
 src/caffe/layers/tile_layer.cu | 19 +-
 src/caffe/net.cpp | 10 -
 src/caffe/solvers/adadelta_solver.cu | 12 +-
 src/caffe/solvers/adagrad_solver.cu | 10 +-
 src/caffe/solvers/adam_solver.cu | 12 +-
 src/caffe/solvers/nesterov_solver.cu | 9 +-
 src/caffe/solvers/rmsprop_solver.cu | 12 +-
 src/caffe/solvers/sgd_solver.cpp | 81 +-
 src/caffe/solvers/sgd_solver.cu | 11 +-
 src/caffe/syncedmem.cpp | 76 +-
 src/caffe/test/test_math_functions.cpp | 87 +-
 src/caffe/test/test_ocl_kernel_compile.cpp | 2 +-
 .../test/test_random_number_generator.cpp | 38 +-
 src/caffe/test/test_syncedmem.cpp | 47 +-
 src/caffe/test/test_util_blas.cpp | 97 +-
 src/caffe/util/cl_fft.cpp | 182 +-
 src/caffe/util/cl_state.cpp
| 192 ++ src/caffe/util/math_functions.cpp | 3 + 76 files changed, 3798 insertions(+), 4150 deletions(-) delete mode 100644 include/caffe/greentea/greentea_im2col.hpp delete mode 100644 include/caffe/greentea/greentea_math_functions.hpp create mode 100644 include/caffe/util/cl_state.hpp create mode 100644 src/caffe/util/cl_state.cpp diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index b623ed4e338..a9edaeb4671 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -10,13 +10,7 @@ #include "caffe/proto/caffe.pb.h" #include "caffe/syncedmem.hpp" -#ifdef USE_CUDA #include "caffe/util/math_functions.hpp" -#endif - -#ifdef USE_GREENTEA -#include "caffe/greentea/greentea_math_functions.hpp" -#endif const int_tp kMaxBlobAxes = 32; diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 9d7070f5df0..a0169a2d576 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -152,6 +152,9 @@ class Caffe { return Get().curand_generator64_; } #endif // USE_CUDA +#ifdef USE_GREENTEA + inline static ClState& cl_state() { return Get().cl_state_; } +#endif #if defined(USE_GREENTEA) && defined(USE_FFT) inline static ClFFTState& cl_fft_state() { return Get().cl_fft_state_; } #endif // USE_GREENTEA @@ -210,6 +213,9 @@ class Caffe { curandGenerator_t curand_generator_; curandGenerator_t curand_generator64_; #endif // USE_CUDA +#ifdef USE_GREENTEA + static ClState cl_state_; +#endif #if defined(USE_GREENTEA) && defined(USE_FFT) ClFFTState cl_fft_state_; #endif diff --git a/include/caffe/greentea/greentea_im2col.hpp b/include/caffe/greentea/greentea_im2col.hpp deleted file mode 100644 index 694cdbc256e..00000000000 --- a/include/caffe/greentea/greentea_im2col.hpp +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef GREENTEA_IM2COL_HPP_ -#define GREENTEA_IM2COL_HPP_ -#ifdef USE_GREENTEA - -#include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" -#include "viennacl/ocl/backend.hpp" -#include "viennacl/ocl/context.hpp" -#include "viennacl/ocl/device.hpp" -#include "viennacl/ocl/platform.hpp" -#include "viennacl/vector.hpp" - -namespace caffe { - -template -void greentea_im2col_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, const cl_mem data_im, - const int_tp data_offset, const int_tp channels, - const int_tp height, const int_tp width, - const int_tp kernel_h, const int_tp kernel_w, - const int_tp pad_h, const int_tp pad_w, - const int_tp stride_h, const int_tp stride_w, - const int_tp dilation_h, const int_tp dilation_w, - cl_mem data_col, const int_tp data_col_off); - -template -void greentea_col2im_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, const cl_mem data_col, - const int_tp data_col_off, const int_tp channels, - const int_tp height, const int_tp width, - const int_tp patch_h, const int_tp patch_w, - const int_tp pad_h, const int_tp pad_w, - const int_tp stride_h, const int_tp stride_w, - const int_tp dilation_h, const int_tp dilation_w, - cl_mem data_im, const int_tp data_im_off); - -template -void greentea_im2col_nd_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, cl_mem data_im, - const int_tp data_off, - const int_tp num_spatial_axes, - const int_tp channel_axis, - const int_tp num_kernels, - cl_mem im_shape, cl_mem col_shape, - cl_mem kernel_shape, cl_mem pad, cl_mem stride, - cl_mem dilation, cl_mem data_col, - const int_tp data_col_off); - -template -void greentea_col2im_nd_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, cl_mem data_col, - const int_tp 
data_col_off, - const int_tp num_spatial_axes, - const int_tp channel_axis, - const int_tp im_size, - cl_mem im_shape, cl_mem col_shape, - cl_mem kernel_shape, cl_mem pad, cl_mem stride, - cl_mem dilation, cl_mem data_im, - int_tp data_im_off); - -} // namespace caffe - -#endif // USE_GREENTEA -#endif /* GREENTEA_IM2COL_HPP_ */ diff --git a/include/caffe/greentea/greentea_math_functions.hpp b/include/caffe/greentea/greentea_math_functions.hpp deleted file mode 100644 index 3cfcf90777d..00000000000 --- a/include/caffe/greentea/greentea_math_functions.hpp +++ /dev/null @@ -1,163 +0,0 @@ -/* - * greentea_math_functions.hpp - * - * Created on: Apr 6, 2015 - * Author: fabian - */ - -#ifndef GREENTEA_MATH_FUNCTIONS_HPP_ -#define GREENTEA_MATH_FUNCTIONS_HPP_ - -#include "caffe/common.hpp" -#include "caffe/definitions.hpp" - -#ifdef USE_GREENTEA -#include "caffe/greentea/greentea.hpp" -#include "caffe/util/math_functions.hpp" -#include "viennacl/ocl/backend.hpp" -#include "viennacl/ocl/context.hpp" -#include "viennacl/ocl/device.hpp" -#include "viennacl/ocl/platform.hpp" -#include "viennacl/vector.hpp" - -namespace caffe { - -void greentea_memset(const int_tp ctx_id, const uint_tp N, const int_tp alpha, - cl_mem X, const int_tp offX); - -void greentea_gpu_memcpy(const uint_tp N, const cl_mem X, const int_tp offX, - void *Y, viennacl::ocl::context *ctx); - -void greentea_gpu_memcpy(const uint_tp N, const void* X, cl_mem Y, - const int_tp offY, viennacl::ocl::context *ctx); - -void greentea_gpu_memcpy(const uint_tp N, const cl_mem X, const int_tp offX, - cl_mem Y, const int_tp offY, - viennacl::ocl::context *ctx); - -template -void greentea_copy(const int_tp N, const cl_mem X, const int_tp offX, cl_mem Y, - const int_tp offY, viennacl::ocl::context *ctx); - -template -void greentea_copy(const int_tp N, const cl_mem X, const int_tp offX, Dtype* Y, - viennacl::ocl::context *ctx); - -template -void greentea_copy(const int_tp N, const Dtype* X, cl_mem Y, const int_tp offY, - viennacl::ocl::context *ctx); - -template -void greentea_gpu_gemm(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int_tp M, - const int_tp N, const int_tp K, const Dtype alpha, - const cl_mem A, const int_tp offA, const cl_mem B, - const int_tp offB, const Dtype beta, cl_mem C, - const int_tp offC); - -template -void greentea_gpu_gemv(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA, - const int_tp M, const int_tp N, const Dtype alpha, - const cl_mem A, const int_tp offA, const cl_mem x, - const int_tp offx, const Dtype beta, cl_mem y, - const int_tp offy); - -template -void greentea_gpu_axpy(const int_tp ctx_id, const int_tp N, const Dtype alpha, - const cl_mem x, const int_tp offx, cl_mem y, - const int_tp offy); - -template -void greentea_gpu_mul(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); - -template -void greentea_gpu_scal(const int_tp ctx_id, const int_tp N, const Dtype alpha, - cl_mem x, int_tp offx); - -template -void greentea_gpu_axpby(const int_tp ctx_id, const int_tp N, const Dtype alpha, - const cl_mem X, const int_tp offX, const Dtype beta, - cl_mem Y, const int_tp offY); - -template -void greentea_gpu_dot(const int_tp ctx_id, const int_tp n, const cl_mem X, - const int_tp offX, const cl_mem Y, const int_tp offY, - Dtype* out); - -template -void greentea_gpu_asum(const int_tp ctx_id, const int_tp n, const cl_mem X, - const int_tp offX, Dtype* Y); - -template -void 
greentea_gpu_scale(const int_tp ctx_id, const int_tp n, const Dtype alpha, - const cl_mem X, const int_tp offX, cl_mem Y, - const int_tp offY); - -template -void greentea_gpu_set(const int_tp ctx_id, const int_tp N, const Dtype alpha, - cl_mem Y, const int_tp offY); - -template -void greentea_gpu_add_scalar(const int_tp ctx_id, const int_tp N, - const Dtype alpha, cl_mem Y, const int_tp offY); - -template -void greentea_gpu_add(const int_tp ctx_id, const int_tp n, const cl_mem a, - const int_tp offa, const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); - -template -void greentea_gpu_sub(const int_tp ctx_id, const int_tp n, const cl_mem a, - const int_tp offa, const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); - -template -void greentea_gpu_div(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); - -template -void greentea_gpu_abs(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, cl_mem y, const int_tp offy); - -template -void greentea_gpu_exp(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, cl_mem y, const int_tp offy); - -template -void greentea_gpu_powx(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, const Dtype alpha, cl_mem y, - const int_tp offy); - -template -void greentea_gpu_log(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, cl_mem y, const int_tp offy); - -template -void greentea_gpu_sign(const int_tp ctx_id, const int_tp n, const cl_mem x, - int_tp offx, cl_mem y, const int_tp offy); - -template -void greentea_gpu_sgnbit(const int_tp ctx_id, const int_tp n, const cl_mem x, -int_tp offx, - cl_mem y, const int_tp offy); - -template -void greentea_gpu_rng_uniform(const int_tp ctx_id, const int_tp n, - const Dtype a, const Dtype b, cl_mem r, - const int_tp offr); - -void greentea_gpu_rng_uniform(const int_tp ctx_id, const int_tp n, cl_mem r, -int_tp offr); - -template -void greentea_gpu_rng_gaussian(const int_tp ctx_id, const int_tp n, - const Dtype mu, const Dtype sigma, cl_mem r, - const int_tp offr); - -} // namespace caffe - -#endif // USE GREENTEA -#endif /* GREENTEA_MATH_FUNCTIONS_HPP_ */ diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index f1fb7cef177..92d447b15e2 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -521,35 +521,16 @@ inline Dtype Layer::Forward(const vector*>& bottom, case Caffe::GPU: Forward_gpu(bottom, top); #ifndef CPU_ONLY - if (device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - for (int_tp top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { - continue; - } - const int_tp count = top[top_id]->count(); - const Dtype* data = top[top_id]->gpu_data(); - const Dtype* loss_weights = top[top_id]->gpu_diff(); - Dtype blob_loss = 0; - caffe_gpu_dot(count, data, loss_weights, &blob_loss); - loss += blob_loss; - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - for (int_tp top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { - continue; - } - const int_tp count = top[top_id]->count(); - cl_mem data = (cl_mem) (top[top_id]->gpu_data()); - cl_mem loss_weights = (cl_mem) (top[top_id]->gpu_diff()); - Dtype blob_loss = 0; - greentea_gpu_dot(this->device_->id(), count, data, 0, - loss_weights, 0, &blob_loss); - loss += blob_loss; + for (int_tp top_id = 0; top_id < top.size(); ++top_id) { + if (!this->loss(top_id)) { + continue; } -#endif // USE_GREENTEA + const int_tp 
count = top[top_id]->count(); + const Dtype* data = top[top_id]->gpu_data(); + const Dtype* loss_weights = top[top_id]->gpu_diff(); + Dtype blob_loss = 0; + caffe_gpu_dot(count, data, loss_weights, &blob_loss); + loss += blob_loss; } #endif break; diff --git a/include/caffe/layers/base_conv_layer.hpp b/include/caffe/layers/base_conv_layer.hpp index aca544fbb7c..e0273246ef5 100644 --- a/include/caffe/layers/base_conv_layer.hpp +++ b/include/caffe/layers/base_conv_layer.hpp @@ -8,10 +8,6 @@ #include "caffe/proto/caffe.pb.h" #include "caffe/util/im2col.hpp" -#ifdef USE_GREENTEA -#include "caffe/greentea/greentea_im2col.hpp" -#endif - namespace caffe { /** @@ -143,7 +139,6 @@ class BaseConvolutionLayer : public Layer { } #ifndef CPU_ONLY -#ifdef USE_CUDA inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { if (!force_nd_im2col_ && num_spatial_axes_ == 2) { im2col_gpu(data, conv_in_channels_, @@ -175,84 +170,6 @@ class BaseConvolutionLayer : public Layer { dilation_.gpu_data(), data); } } -#endif // USE_CUDA -#ifdef USE_GREENTEA - inline void greentea_conv_im2col_gpu(const Dtype* data, const int_tp data_off, - Dtype* col_buff, - const int_tp col_buff_off) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); - - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - greentea_im2col_gpu(&program, &ctx, (cl_mem) data, data_off, - conv_in_channels_, - conv_input_shape_.cpu_data()[1], - conv_input_shape_.cpu_data()[2], - kernel_shape_.cpu_data()[0], - kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - dilation_.cpu_data()[0], - dilation_.cpu_data()[1], (cl_mem) col_buff, - col_buff_off); - } else { - greentea_im2col_nd_gpu(&program, &ctx, (cl_mem) data, data_off, - num_spatial_axes_, - (int_tp)0, - num_kernels_im2col_, - (cl_mem) (conv_input_shape_.gpu_data()), - (cl_mem) (col_buffer_.gpu_shape()), - (cl_mem) (kernel_shape_.gpu_data()), - (cl_mem) (pad_.gpu_data()), - (cl_mem) (stride_.gpu_data()), - (cl_mem) (dilation_.gpu_data()), - (cl_mem) col_buff, col_buff_off); - } - } - - inline void greentea_conv_col2im_gpu(const Dtype* col_buff, - const int_tp col_buff_off, Dtype* data, - const int_tp data_off) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); - - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - greentea_col2im_gpu(&program, &ctx, - (cl_mem) col_buff, - col_buff_off, - conv_in_channels_, - conv_input_shape_.cpu_data()[1], - conv_input_shape_.cpu_data()[2], - kernel_shape_.cpu_data()[0], - kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], - pad_.cpu_data()[1], - stride_.cpu_data()[0], - stride_.cpu_data()[1], - dilation_.cpu_data()[0], - dilation_.cpu_data()[1], - (cl_mem) data, - data_off); - } else { - greentea_col2im_nd_gpu(&program, &ctx, - (cl_mem) col_buff, - col_buff_off, - num_spatial_axes_, - (int_tp)0, - num_kernels_col2im_, - (cl_mem) (conv_input_shape_.gpu_data()), - (cl_mem) (col_buffer_.gpu_shape()), - (cl_mem) (kernel_shape_.gpu_data()), - (cl_mem) (pad_.gpu_data()), - (cl_mem) (stride_.gpu_data()), - (cl_mem) (dilation_.gpu_data()), - (cl_mem) data, - data_off); - } - } -#endif // USE_GREENTEA #endif // !CPU_ONLY int_tp num_kernels_im2col_; diff --git a/include/caffe/util/cl_state.hpp b/include/caffe/util/cl_state.hpp new file mode 100644 index 00000000000..3e53146cb52 --- /dev/null +++ 
b/include/caffe/util/cl_state.hpp
@@ -0,0 +1,61 @@
+#ifndef CAFFE_UTIL_CL_STATE_HPP
+#define CAFFE_UTIL_CL_STATE_HPP
+#include <CL/cl.h>
+
+#include
+#include
+#include
+
+namespace caffe {
+
+/**
+ * @brief Virtual addressing for OpenCL memory objects and their offsets. After
+ * allocating OpenCL memory, the memory object and the memory size are mapped to
+ * a virtual address, and that virtual address is returned. The memory object and
+ * its offset can easily be recovered from a single address, which enables Caffe's
+ * pointer offsetting to work just as it does in the CUDA path. This mechanism
+ * removes much of the redundant code caused by the differences between the
+ * OpenCL and CUDA paths.
+ */
+
+template<typename T>
+struct ClMemOff {
+  cl_mem memobj;
+  size_t offset;  // offset in elements
+};
+
+class ClState {
+ public:
+  ClState();
+  ~ClState();
+
+  void* create_buffer(int dev_id, cl_mem_flags flags, size_t size,
+                      void* host_ptr, cl_int *errcode);
+  void destroy_buffer(void* buffer);
+  size_t get_buffer_size(const void* buffer);
+  ClMemOff<uint8_t> get_buffer_mem(const void* ptr);
+  int get_mem_dev(cl_mem memobj);
+
+  template<typename T>
+  ClMemOff<T> get_buffer_mem(const T* ptr) {
+    ClMemOff<uint8_t> m = get_buffer_mem(static_cast<const void*>(ptr));
+    ClMemOff<T> mT = {m.memobj, m.offset / sizeof(T)};
+    return mT;
+  }
+
+  cl_mem create_subbuffer(const void* ptr, size_t offset, cl_mem_flags flags);
+  template<typename T> cl_mem create_subbuffer(T* ptr, int offset) {
+    return create_subbuffer(ptr, offset * sizeof(T), CL_MEM_READ_WRITE);
+  }
+  template<typename T> cl_mem create_subbuffer(const T* ptr, int offset) {
+    return create_subbuffer(ptr, offset * sizeof(T), CL_MEM_READ_ONLY);
+  }
+
+ private:
+  ClState(const ClState&);
+
+  struct Impl;
+  Impl* impl_;
+};
+
+}  // namespace caffe
+#endif
diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp
index 893be133942..61081b6ccb6 100644
--- a/include/caffe/util/device_alternate.hpp
+++ b/include/caffe/util/device_alternate.hpp
@@ -105,6 +105,8 @@ inline int_tp CAFFE_GET_BLOCKS(const int_tp N) {
 #endif  // USE_CUDA
 
 #ifdef USE_GREENTEA
+#include "caffe/util/cl_state.hpp"
+
 #define OCL_CHECK(condition) \
   do { \
     cl_int error = (condition); \
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 86c9fe633c1..ee603e52782 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -145,7 +145,6 @@ void caffe_cpu_scale(const int_tp n, const Dtype alpha, const Dtype *x,
                      Dtype* y);
 
 #ifndef CPU_ONLY  // GPU
-#ifdef USE_CUDA
 
 // Decaf gpu gemm provides an interface that is almost the same as the cpu
 // gemm function - following the c convention and calling the fortran-order
@@ -174,9 +173,13 @@ void caffe_gpu_memcpy(const uint_tp N, const void *X, void *Y);
 template <typename Dtype>
 void caffe_gpu_set(const int_tp N, const Dtype alpha, Dtype *X);
 
+#ifdef USE_CUDA
 inline void caffe_gpu_memset(const uint_tp N, const int_tp alpha, void* X) {
   CUDA_CHECK(cudaMemset(X, alpha, N));  // NOLINT(caffe/alt_fn)
 }
+#else
+void caffe_gpu_memset(const uint_tp N, const int_tp alpha, void* X);
+#endif  // USE_CUDA
 
 template <typename Dtype>
 void caffe_gpu_add_scalar(const int_tp N, const Dtype alpha, Dtype *X);
@@ -249,6 +252,7 @@ template <typename Dtype>
 void caffe_gpu_scale(const int_tp n, const Dtype alpha, const Dtype *x,
                      Dtype* y);
 
+#ifdef USE_CUDA
 #define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \
 template <typename Dtype> \
 __global__ void name##_kernel(const int_tp n, const Dtype* x, Dtype* y) { \
@@ -268,7 +272,6 @@ void caffe_gpu_##name<double>(const int_tp n, const double* x, double* y) { \
   name##_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
       n, x, y); \
 }
-
 #endif  // USE_CUDA
 
 #endif  // !CPU_ONLY
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 19dbd9c43bf..0f6031bc295 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -10,7 +10,6 @@
 
 #ifdef USE_GREENTEA
 #include "caffe/greentea/greentea.hpp"
-#include "caffe/greentea/greentea_math_functions.hpp"
 #endif
 
 namespace caffe {
@@ -186,19 +185,9 @@ void Blob<Dtype>::Update() {
     case SyncedMemory::SYNCED: {
 #ifndef CPU_ONLY
       // perform computation on GPU
-      if (device_->backend() == Backend::BACKEND_CUDA) {
-#ifdef USE_CUDA
-        caffe_gpu_axpy<Dtype>(count_, Dtype(-1),
-                              static_cast<const Dtype*>(diff_->gpu_data()),
-                              static_cast<Dtype*>(data_->mutable_gpu_data()));
-#endif
-      } else {
-#ifdef USE_GREENTEA
-        greentea_gpu_axpy<Dtype>(device_->id(), count_, Dtype(-1),
-                                 (cl_mem) (diff_->gpu_data()), 0,
-                                 (cl_mem) (data_->mutable_gpu_data()), 0);
-#endif
-      }
+      caffe_gpu_axpy<Dtype>(count_, Dtype(-1),
+                            static_cast<const Dtype*>(diff_->gpu_data()),
+                            static_cast<Dtype*>(data_->mutable_gpu_data()));
 #else
       NO_GPU;
 #endif
@@ -235,20 +224,9 @@ Dtype Blob<Dtype>::asum_data() const {
     case SyncedMemory::HEAD_AT_GPU:
     case SyncedMemory::SYNCED: {
 #ifndef CPU_ONLY
-      if (device_->backend() == Backend::BACKEND_CUDA) {
-#ifdef USE_CUDA
-        Dtype asum;
-        caffe_gpu_asum(count_, gpu_data(), &asum);
-        return asum;
-#endif
-      } else {
-#ifdef USE_GREENTEA
-        Dtype asum;
-        greentea_gpu_asum<Dtype>(device_->id(), count_, (cl_mem) gpu_data(), 0,
-                                 &asum);
-        return asum;
-#endif
-      }
+      Dtype asum;
+      caffe_gpu_asum(count_, gpu_data(), &asum);
+      return asum;
 #else
       NO_GPU;
 #endif
@@ -282,20 +260,9 @@ Dtype Blob<Dtype>::asum_diff() const {
     case SyncedMemory::HEAD_AT_GPU:
     case SyncedMemory::SYNCED: {
 #ifndef CPU_ONLY
-      if (device_->backend() == Backend::BACKEND_CUDA) {
-#ifdef USE_CUDA
-        Dtype asum;
-        caffe_gpu_asum(count_, gpu_diff(), &asum);
-        return asum;
-#endif
-      } else {
-#ifdef USE_GREENTEA
-        Dtype asum;
-        greentea_gpu_asum<Dtype>(device_->id(), count_, (cl_mem) gpu_diff(), 0,
-                                 &asum);
-        return asum;
-#endif
-      }
+      Dtype asum;
+      caffe_gpu_asum(count_, gpu_diff(), &asum);
+      return asum;
 #else
       NO_GPU;
 #endif
@@ -335,16 +302,7 @@ Dtype Blob<Dtype>::sumsq_data() const {
     case SyncedMemory::SYNCED: {
 #ifndef CPU_ONLY
       data = gpu_data();
-      if (device_->backend() == Backend::BACKEND_CUDA) {
-#ifdef USE_CUDA
-        caffe_gpu_dot(count_, data, data, &sumsq);
-#endif
-      } else {
-#ifdef USE_GREENTEA
-        greentea_gpu_dot<Dtype>(device_->id(), count_, (cl_mem) data, 0,
-                                (cl_mem) data, 0, &sumsq);
-#endif
-      }
+      caffe_gpu_dot(count_, data, data, &sumsq);
 #else
       NO_GPU;
 #endif
@@ -385,16 +343,7 @@ Dtype Blob<Dtype>::sumsq_diff() const {
     case SyncedMemory::SYNCED: {
 #ifndef CPU_ONLY
       diff = gpu_diff();
-      if (device_->backend() == Backend::BACKEND_CUDA) {
-#ifdef USE_CUDA
-        caffe_gpu_dot(count_, diff, diff, &sumsq);
-#endif
-      } else {
-#ifdef USE_GREENTEA
-        greentea_gpu_dot<Dtype>(device_->id(), count_, (cl_mem) diff, 0,
-                                (cl_mem) diff, 0, &sumsq);
-#endif
-      }
+      caffe_gpu_dot(count_, diff, diff, &sumsq);
 #else
       NO_GPU;
 #endif
@@ -432,16 +381,7 @@ void Blob<Dtype>::scale_data(Dtype scale_factor) {
     case SyncedMemory::SYNCED: {
 #ifndef CPU_ONLY
       data = mutable_gpu_data();
-      if (device_->backend() == Backend::BACKEND_CUDA) {
-#ifdef USE_CUDA
-        caffe_gpu_scal(count_, scale_factor, data);
-#endif
-      } else {
-#ifdef USE_GREENTEA
-        greentea_gpu_scal<Dtype>(device_->id(), count_, scale_factor,
-                                 (cl_mem) data, 0);
-#endif
-      }
+      caffe_gpu_scal(count_, scale_factor, data);
       return;
 #else
       NO_GPU;
 #endif
@@ -478,16 +418,7 @@ void Blob<Dtype>::scale_diff(Dtype scale_factor) {
     case SyncedMemory::SYNCED: {
 #ifndef CPU_ONLY
       diff = mutable_gpu_diff();
-      if
(device_->backend() == Backend::BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_scal(count_, scale_factor, diff); -#endif - } else { -#ifdef USE_GREENTEA - greentea_gpu_scal(device_->id(), count_, scale_factor, - (cl_mem) diff, 0); -#endif - } + caffe_gpu_scal(count_, scale_factor, diff); return; #else NO_GPU; @@ -533,28 +464,12 @@ void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { } switch (Caffe::mode()) { case Caffe::GPU: { - if (device_->backend() == BACKEND_CUDA) { - if (copy_diff) { - caffe_copy(count_, source.gpu_diff(), - static_cast(diff_->mutable_gpu_data())); - } else { - caffe_copy(count_, source.gpu_data(), - static_cast(data_->mutable_gpu_data())); - } + if (copy_diff) { + caffe_copy(count_, source.gpu_diff(), + static_cast(diff_->mutable_gpu_data())); } else { -#ifdef USE_GREENTEA - if (copy_diff) { - greentea_copy( - count_, (cl_mem) (source.gpu_diff()), 0, - (cl_mem) (diff_->mutable_gpu_data()), 0, - &viennacl::ocl::get_context(device_->id())); - } else { - greentea_copy( - count_, (cl_mem) (source.gpu_data()), 0, - (cl_mem) (data_->mutable_gpu_data()), 0, - &viennacl::ocl::get_context(device_->id())); - } -#endif + caffe_copy(count_, source.gpu_data(), + static_cast(data_->mutable_gpu_data())); } break; } diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index 0f1d12422d9..fc5111bd89c 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -27,6 +27,10 @@ namespace caffe { // Make sure each thread can have different values. static boost::thread_specific_ptr thread_instance_; +#ifdef USE_GREENTEA +ClState Caffe::cl_state_; +#endif + // Pointer to the global instance of Caffe static Caffe* global_instance_; static std::atomic first(true); diff --git a/src/caffe/greentea/greentea_im2col.cpp b/src/caffe/greentea/greentea_im2col.cpp index 11a0e59ee3f..ba0d8e0c8d3 100644 --- a/src/caffe/greentea/greentea_im2col.cpp +++ b/src/caffe/greentea/greentea_im2col.cpp @@ -5,42 +5,50 @@ * Author: Fabian Tschopp */ #include "caffe/common.hpp" +#include "caffe/device.hpp" #ifdef USE_GREENTEA -#include "caffe/greentea/greentea_im2col.hpp" +#include "caffe/util/im2col.hpp" namespace caffe { - template -void greentea_im2col_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, const cl_mem data_im, - const int_tp data_offset, const int_tp channels, - const int_tp height, const int_tp width, - const int_tp kernel_h, const int_tp kernel_w, - const int_tp pad_h, const int_tp pad_w, - const int_tp stride_h, const int_tp stride_w, - const int_tp dilation_h, const int_tp dilation_w, - cl_mem data_col, const int_tp data_col_off) { +void im2col_gpu(const Dtype *data_im, + const int_tp channels, const int_tp height, + const int_tp width, const int_tp kernel_h, + const int_tp kernel_w, const int_tp pad_h, + const int_tp pad_w, const int_tp stride_h, + const int_tp stride_w, const int_tp dilation_h, + const int_tp dilation_w, Dtype *data_col) { int_tp height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; int_tp width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; int_tp num_kernels = channels * height_col * width_col; - viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("im2col")); + ClState& clState = Caffe::cl_state(); + + ClMemOff buf_data_im = clState.get_buffer_mem(data_im); + ClMemOff buf_data_col = clState.get_buffer_mem(data_col); + + int dev_id = clState.get_mem_dev(buf_data_im.memobj); + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + 
viennacl::ocl::program &prog = (Caffe::Get().GetDevice(dev_id, false)) + ->program(); + + int offset_data_im = buf_data_im.offset; + int offset_data_col = buf_data_col.offset; + + viennacl::ocl::kernel &kernel = prog.get_kernel(CL_KERNEL_SELECT("im2col")); viennacl::ocl::enqueue( - kernel(num_kernels, WrapHandle(data_im, ctx), data_offset, height, width, - kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, - dilation_w, height_col, width_col, WrapHandle(data_col, ctx), - data_col_off), - ctx->get_queue()); + kernel(num_kernels, WrapHandle(buf_data_im.memobj, &ctx), offset_data_im, + height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, height_col, width_col, + WrapHandle(buf_data_col.memobj, &ctx), offset_data_col), + ctx.get_queue()); } // Explicit instantiation -template void greentea_im2col_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, - const cl_mem data_im, - const int_tp data_offset, +template void im2col_gpu(const float *data_im, const int_tp channels, const int_tp height, const int_tp width, @@ -51,56 +59,59 @@ template void greentea_im2col_gpu(viennacl::ocl::program *prog, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, - cl_mem data_col, - const int_tp data_col_off); - -template void greentea_im2col_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, - const cl_mem data_im, - const int_tp data_offset, - const int_tp channels, - const int_tp height, - const int_tp width, - const int_tp kernel_h, - const int_tp kernel_w, - const int_tp pad_h, - const int_tp pad_w, - const int_tp stride_h, - const int_tp stride_w, - const int_tp dilation_h, - const int_tp dilation_w, - cl_mem data_col, - const int_tp data_col_off); + float *data_col); + +template void im2col_gpu(const double *data_im, + const int_tp channels, + const int_tp height, + const int_tp width, + const int_tp kernel_h, + const int_tp kernel_w, + const int_tp pad_h, const int_tp pad_w, + const int_tp stride_h, + const int_tp stride_w, + const int_tp dilation_h, + const int_tp dilation_w, + double *data_col); template -void greentea_col2im_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, const cl_mem data_col, - const int_tp data_col_off, const int_tp channels, - const int_tp height, const int_tp width, - const int_tp kernel_h, const int_tp kernel_w, - const int_tp pad_h, const int_tp pad_w, - const int_tp stride_h, const int_tp stride_w, - const int_tp dilation_h, const int_tp dilation_w, - cl_mem data_im, const int_tp data_offset) { +void col2im_gpu(const Dtype *data_col, const int_tp channels, + const int_tp height, const int_tp width, + const int_tp kernel_h, const int_tp kernel_w, + const int_tp pad_h, const int_tp pad_w, + const int_tp stride_h, const int_tp stride_w, + const int_tp dilation_h, const int_tp dilation_w, + Dtype* data_im) { int_tp height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) - / stride_h + 1; + / stride_h + 1; int_tp width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) - / stride_w + 1; + / stride_w + 1; int_tp num_kernels = channels * height * width; - viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("col2im")); + + ClState& clState = Caffe::cl_state(); + + ClMemOff buf_data_im = clState.get_buffer_mem(data_im); + ClMemOff buf_data_col = clState.get_buffer_mem(data_col); + + int dev_id = clState.get_mem_dev(buf_data_im.memobj); + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program 
&prog = (Caffe::Get().GetDevice(dev_id, false)) + ->program(); + + int offset_data_im = buf_data_im.offset; + int offset_data_col = buf_data_col.offset; + + viennacl::ocl::kernel &kernel = prog.get_kernel(CL_KERNEL_SELECT("col2im")); viennacl::ocl::enqueue( - kernel(num_kernels, WrapHandle(data_col, ctx), data_col_off, height, - width, channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, - stride_w, dilation_h, dilation_w, height_col, width_col, - WrapHandle(data_im, ctx), data_offset), - ctx->get_queue()); + kernel(num_kernels, WrapHandle(buf_data_col.memobj, &ctx), offset_data_col, + height, width, channels, kernel_h, kernel_w, pad_h, pad_w, + stride_h, stride_w, dilation_h, dilation_w, height_col, width_col, + WrapHandle(buf_data_im.memobj, &ctx), offset_data_im), + ctx.get_queue()); } -template void greentea_col2im_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, - const cl_mem data_col, - const int_tp data_col_off, +template void col2im_gpu(const float *data_col, const int_tp channels, const int_tp height, const int_tp width, @@ -111,125 +122,140 @@ template void greentea_col2im_gpu(viennacl::ocl::program *prog, const int_tp stride_w, const int_tp dilation_h, const int_tp dilation_w, - cl_mem data_im, - const int_tp data_offset); - -template void greentea_col2im_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, - const cl_mem data_col, - const int_tp data_col_off, - const int_tp channels, - const int_tp height, - const int_tp width, - const int_tp patch_h, - const int_tp patch_w, - const int_tp pad_h, - const int_tp pad_w, - const int_tp stride_h, - const int_tp stride_w, - const int_tp dilation_h, - const int_tp dilation_w, - cl_mem data_im, - const int_tp data_offset); + float *data_im); + +template void col2im_gpu(const double *data_col, + const int_tp channels, + const int_tp height, + const int_tp width, + const int_tp patch_h, + const int_tp patch_w, + const int_tp pad_h, const int_tp pad_w, + const int_tp stride_h, + const int_tp stride_w, + const int_tp dilation_h, + const int_tp dilation_w, + double *data_im); template -void greentea_im2col_nd_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, cl_mem data_im, - const int_tp data_off, - const int_tp num_spatial_axes, - const int_tp channel_axis, const int_tp num_kernels, - cl_mem im_shape, cl_mem col_shape, - cl_mem kernel_shape, cl_mem pad, cl_mem stride, - cl_mem dilation, cl_mem data_col, - const int_tp data_col_off) { - viennacl::ocl::kernel &kernel = prog->get_kernel( +void im2col_nd_gpu(const Dtype* data_im, const int_tp num_spatial_axes, + const int_tp num_kernels, const int_tp* im_shape, + const int_tp* col_shape, const int_tp* kernel_shape, + const int_tp* pad, const int_tp* stride, + const int_tp* dilation, Dtype* data_col) { + ClState& clState = Caffe::cl_state(); + ClMemOff buf_data_im = clState.get_buffer_mem(data_im); + ClMemOff buf_im_shape = clState.get_buffer_mem(im_shape); + ClMemOff buf_col_shape = clState.get_buffer_mem(col_shape); + ClMemOff buf_kernel_shape = clState.get_buffer_mem(kernel_shape); + ClMemOff buf_pad = clState.get_buffer_mem(pad); + ClMemOff buf_stride = clState.get_buffer_mem(stride); + ClMemOff buf_dilation = clState.get_buffer_mem(dilation); + ClMemOff buf_data_col = clState.get_buffer_mem(data_col); + int dev_id = clState.get_mem_dev(buf_data_im.memobj); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &prog = (Caffe::Get().GetDevice(dev_id, false)) + ->program(); + viennacl::ocl::kernel &kernel 
= prog.get_kernel( CL_KERNEL_SELECT("im2col_nd")); viennacl::ocl::enqueue( - kernel(num_kernels, num_spatial_axes, channel_axis, - WrapHandle(data_im, ctx), data_off, WrapHandle(im_shape, ctx), - WrapHandle(col_shape, ctx), WrapHandle(kernel_shape, ctx), - WrapHandle(pad, ctx), WrapHandle(stride, ctx), - WrapHandle(dilation, ctx), WrapHandle(data_col, ctx), - data_col_off), - ctx->get_queue()); + kernel(num_kernels, num_spatial_axes, (int_tp)(buf_im_shape.offset), + WrapHandle(buf_data_im.memobj, &ctx), (int_tp)buf_data_im.offset, + WrapHandle(buf_im_shape.memobj, &ctx), + WrapHandle(buf_col_shape.memobj, &ctx), + WrapHandle(buf_kernel_shape.memobj, &ctx), + WrapHandle(buf_pad.memobj, &ctx), + WrapHandle(buf_stride.memobj, &ctx), + WrapHandle(buf_dilation.memobj, &ctx), + WrapHandle(buf_data_col.memobj, &ctx), + (int_tp)buf_data_col.offset), + ctx.get_queue()); } -// Explicit instantiation -template void greentea_im2col_nd_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, - cl_mem data_im, - const int_tp data_off, - const int_tp num_spatial_axes, - const int_tp channel_axis, - const int_tp num_kernels, - cl_mem im_shape, cl_mem col_shape, - cl_mem kernel_shape, cl_mem pad, - cl_mem stride, cl_mem dilation, - cl_mem data_col, - const int_tp data_col_off); - -template void greentea_im2col_nd_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, - cl_mem data_im, - const int_tp data_off, - const int_tp num_spatial_axes, - const int_tp channel_axis, - const int_tp num_kernels, - cl_mem im_shape, cl_mem col_shape, - cl_mem kernel_shape, cl_mem pad, - cl_mem stride, cl_mem dilation, - cl_mem data_col, - const int_tp data_col_off); +template void im2col_nd_gpu(const float* data_im, + const int_tp num_spatial_axes, + const int_tp num_kernels, + const int_tp* im_shape, + const int_tp* col_shape, + const int_tp* kernel_shape, + const int_tp* pad, + const int_tp* stride, + const int_tp* dilation, + float* data_col); + +template void im2col_nd_gpu(const double* data_im, + const int_tp num_spatial_axes, + const int_tp num_kernels, + const int_tp* im_shape, + const int_tp* col_shape, + const int_tp* kernel_shape, + const int_tp* pad, + const int_tp* stride, + const int_tp* dilation, + double* data_col); template -void greentea_col2im_nd_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, cl_mem data_col, - const int_tp data_col_off, - const int_tp num_spatial_axes, - const int_tp channel_axis, const int_tp im_size, - cl_mem im_shape, cl_mem col_shape, - cl_mem kernel_shape, cl_mem pad, cl_mem stride, - cl_mem dilation, cl_mem data_im, - const int_tp data_im_off) { - viennacl::ocl::kernel &kernel = prog->get_kernel( +void col2im_nd_gpu(const Dtype* data_col, const int_tp num_spatial_axes, + const int_tp im_size, const int_tp* im_shape, + const int_tp* col_shape, const int_tp* kernel_shape, + const int_tp* pad, const int_tp* stride, + const int_tp* dilation, Dtype* data_im) { + ClState& clState = Caffe::cl_state(); + ClMemOff buf_data_col = clState.get_buffer_mem(data_col); + ClMemOff buf_im_shape = clState.get_buffer_mem(im_shape); + ClMemOff buf_col_shape = clState.get_buffer_mem(col_shape); + ClMemOff buf_kernel_shape = clState.get_buffer_mem(kernel_shape); + ClMemOff buf_pad = clState.get_buffer_mem(pad); + ClMemOff buf_stride = clState.get_buffer_mem(stride); + ClMemOff buf_dilation = clState.get_buffer_mem(dilation); + ClMemOff buf_data_im = clState.get_buffer_mem(data_im); + + int dev_id = clState.get_mem_dev(buf_data_col.memobj); + + viennacl::ocl::context 
&ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &prog = (Caffe::Get().GetDevice(dev_id, false)) + ->program(); + + viennacl::ocl::kernel &kernel = prog.get_kernel( CL_KERNEL_SELECT("col2im_nd")); viennacl::ocl::enqueue( - kernel(im_size, num_spatial_axes, channel_axis, - WrapHandle(data_col, ctx), data_col_off, - WrapHandle(im_shape, ctx), - WrapHandle(col_shape, ctx), - WrapHandle(kernel_shape, ctx), WrapHandle(pad, ctx), - WrapHandle(stride, ctx), WrapHandle(dilation, ctx), - WrapHandle(data_im, ctx), data_im_off), - ctx->get_queue()); + kernel(im_size, num_spatial_axes, (int_tp)(buf_im_shape.offset), + WrapHandle(buf_data_col.memobj, &ctx), (int_tp)buf_data_col.offset, + WrapHandle(buf_im_shape.memobj, &ctx), + WrapHandle(buf_col_shape.memobj, &ctx), + WrapHandle(buf_kernel_shape.memobj, &ctx), + WrapHandle(buf_pad.memobj, &ctx), + WrapHandle(buf_stride.memobj, &ctx), + WrapHandle(buf_dilation.memobj, &ctx), + WrapHandle(buf_data_im.memobj, &ctx), + (int_tp)buf_data_im.offset), + ctx.get_queue()); } -// Explicit instantiation -template void greentea_col2im_nd_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, - cl_mem data_col, - const int_tp data_col_off, - const int_tp num_spatial_axes, - const int_tp channel_axis, - const int_tp im_size, - cl_mem im_shape, cl_mem col_shape, - cl_mem kernel_shape, cl_mem pad, - cl_mem stride, cl_mem dilation, - cl_mem data_im, int_tp data_off); - -template void greentea_col2im_nd_gpu(viennacl::ocl::program *prog, - viennacl::ocl::context *ctx, - cl_mem data_col, - const int_tp data_col_off, - const int_tp num_spatial_axes, - const int_tp channel_axis, - const int_tp im_size, - cl_mem im_shape, cl_mem col_shape, - cl_mem kernel_shape, cl_mem pad, - cl_mem stride, cl_mem dilation, - cl_mem data_im, int_tp data_off); +template void col2im_nd_gpu(const float* data_col, + const int_tp num_spatial_axes, + const int_tp im_size, + const int_tp* im_shape, + const int_tp* col_shape, + const int_tp* kernel_shape, + const int_tp* pad, + const int_tp* stride, + const int_tp* dilation, + float* data_im); + +template void col2im_nd_gpu(const double* data_col, + const int_tp num_spatial_axes, + const int_tp im_size, + const int_tp* im_shape, + const int_tp* col_shape, + const int_tp* kernel_shape, + const int_tp* pad, + const int_tp* stride, + const int_tp* dilation, + double* data_im); } // namespace caffe #endif diff --git a/src/caffe/greentea/greentea_math_functions.cpp b/src/caffe/greentea/greentea_math_functions.cpp index b38ee5e45f8..c0874463dcd 100644 --- a/src/caffe/greentea/greentea_math_functions.cpp +++ b/src/caffe/greentea/greentea_math_functions.cpp @@ -9,9 +9,6 @@ #include "caffe/device.hpp" #ifdef USE_GREENTEA -#include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" - #include #include @@ -24,14 +21,15 @@ #include #include +#include "caffe/greentea/greentea.hpp" +#include "caffe/util/math_functions.hpp" + #include "viennacl/backend/opencl.hpp" #include "viennacl/ocl/backend.hpp" #include "viennacl/ocl/context.hpp" #include "viennacl/ocl/device.hpp" #include "viennacl/ocl/platform.hpp" -#include "caffe/util/math_functions.hpp" - #if defined(USE_CLBLAS) #include // NOLINT #elif defined(USE_CLBLAST) @@ -62,10 +60,18 @@ namespace caffe { -void greentea_memset(const int_tp ctx_id, const uint_tp N, const int_tp alpha, - cl_mem X, const int_tp offX) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, 
false)) +void caffe_gpu_memset(const uint_tp N, const int_tp alpha, void* X) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufX = clState.get_buffer_mem(X); + + cl_mem Mem_X = bufX.memobj; + + int offX = bufX.offset; + + int dev_id = clState.get_mem_dev(Mem_X); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false)) ->program(); // OpenCL Version >= 1.2 approach @@ -78,132 +84,198 @@ void greentea_memset(const int_tp ctx_id, const uint_tp N, const int_tp alpha, CL_KERNEL_SELECT("fillbuffer")); viennacl::ocl::enqueue( oclk_fill(static_cast(N), static_cast(alpha), - WrapHandle(X, &ctx), offX), + WrapHandle(Mem_X, &ctx), offX), ctx.get_queue()); } -// Copy from OpenCL buffer to main memory -void greentea_gpu_memcpy(const uint_tp N, const cl_mem X, const int_tp offX, - void *Y, viennacl::ocl::context *ctx) { - if (Y != NULL) { - clEnqueueReadBuffer(ctx->get_queue().handle().get(), X, CL_TRUE, offX, N, Y, - 0, - NULL, - NULL); - } -} +template<> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int_tp M, + const int_tp N, const int_tp K, const float alpha, + const float* A, const float* B, const float beta, + float* C) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufA = clState.get_buffer_mem(A); + ClMemOff bufB = clState.get_buffer_mem(B); + ClMemOff bufC = clState.get_buffer_mem(C); -// Copy from main memory to OpenCL buffer -void greentea_gpu_memcpy(const uint_tp N, const void* X, cl_mem Y, - const int_tp offY, viennacl::ocl::context *ctx) { - if (X != NULL) { - clEnqueueWriteBuffer(ctx->get_queue().handle().get(), Y, - CL_TRUE, - offY, N, X, 0, NULL, NULL); - } -} + cl_mem Mem_A = bufA.memobj; + cl_mem Mem_B = bufB.memobj; + cl_mem Mem_C = bufC.memobj; -// Copy from OpenCL to OpenCL buffer -void greentea_gpu_memcpy(const uint_tp N, const cl_mem X, const int_tp offX, - cl_mem Y, const int_tp offY, - viennacl::ocl::context *ctx) { - clEnqueueCopyBuffer(ctx->get_queue().handle().get(), X, Y, offX, offY, N, 0, - NULL, - NULL); -} + int offA = bufA.offset; + int offB = bufB.offset; + int offC = bufC.offset; -template -void greentea_copy(const int_tp N, const cl_mem X, const int_tp offX, Dtype* Y, - viennacl::ocl::context *ctx) { - greentea_gpu_memcpy(sizeof(Dtype) * N, X, offX * sizeof(Dtype), Y, ctx); -} + int dev_id = clState.get_mem_dev(Mem_A); -template -void greentea_copy(const int_tp N, const Dtype* X, cl_mem Y, const int_tp offY, - viennacl::ocl::context *ctx) { - greentea_gpu_memcpy(sizeof(Dtype) * N, X, Y, offY * sizeof(Dtype), ctx); -} + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); -// Copy from OpenCL buffer to OpenCL buffer -template -void greentea_copy(const int_tp N, const cl_mem X, const int_tp offX, cl_mem Y, - const int_tp offY, viennacl::ocl::context *ctx) { - greentea_gpu_memcpy(sizeof(Dtype) * N, X, offX * sizeof(Dtype), Y, - offY * sizeof(Dtype), ctx); + if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { + float* Aptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_A, true, CL_MAP_READ, + sizeof(float) * offA, sizeof(float) * M * K, 0, NULL, NULL, NULL)); + float* Bptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_B, true, CL_MAP_READ, + sizeof(float) * offB, sizeof(float) * N * K, 0, NULL, NULL, NULL)); + float* Cptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_C, true, CL_MAP_READ | CL_MAP_WRITE, + sizeof(float) * 
offC, sizeof(float) * M * N, 0, NULL, NULL, NULL)); + + caffe_cpu_gemm(TransA, TransB, M, N, K, alpha, Aptr, Bptr, beta, + Cptr); + + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_A, Aptr, + 0, NULL, NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_B, Bptr, + 0, NULL, NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_C, Cptr, + 0, NULL, NULL); + } else { + int_tp lda = (TransA == CblasNoTrans) ? K : M; + int_tp ldb = (TransB == CblasNoTrans) ? N : K; + int_tp ldc = N; + +#if defined(USE_CLBLAS) + + clblasOrder clOrder = clblasRowMajor; + clblasTranspose clTransA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose clTransB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + + cl_command_queue queue = ctx.get_queue().handle().get(); + + GREENTEA_CL_BLAS_CHECK( + clblasSgemm(clOrder, clTransA, clTransB, + M, N, K, alpha, Mem_A, offA, lda, Mem_B, offB, ldb, beta, + Mem_C, offC, ldc, 1, &queue, 0, NULL, NULL)); + +#elif defined(USE_CLBLAST) + + cl_command_queue queue = ctx.get_queue().handle().get(); + + clblast::Layout layout = clblast::Layout::kRowMajor; + clblast::Transpose a_transpose = (TransA == CblasNoTrans) ? + clblast::Transpose::kNo : clblast::Transpose::kYes; + clblast::Transpose b_transpose = (TransB == CblasNoTrans) ? + clblast::Transpose::kNo : clblast::Transpose::kYes; + + GREENTEA_CLBLAST_CHECK( + clblast::Gemm( + layout, a_transpose, b_transpose, + M, N, K, + alpha, + Mem_A, offA, lda, + Mem_B, offB, ldb, + beta, + Mem_C, offC, ldc, + &queue)); + +#else // default (ViennaCL) + + typedef typename viennacl::matrix_base::size_type size_type; + typedef typename viennacl::matrix_base::size_type difference_type; + + size_type A_size1 = static_cast((TransA == CblasTrans) ? K : M); + size_type A_size2 = static_cast((TransA == CblasTrans) ? M : K); + + size_type B_size1 = static_cast((TransB == CblasTrans) ? N : K); + size_type B_size2 = static_cast((TransB == CblasTrans) ? 
K : N); + + viennacl::matrix_base matA(Mem_A, ctx, + A_size1, + size_type(0), + difference_type(1), + size_type(M), + A_size2, + size_type(offA), + difference_type(1), + size_type(lda) + VCL_ROW_MAJOR); + + viennacl::matrix_base matB(Mem_B, ctx, + B_size1, + size_type(0), + difference_type(1), + size_type(K), B_size2, + size_type(offB), + difference_type(1), + size_type(ldb) + VCL_ROW_MAJOR); + + viennacl::matrix_base matC(Mem_C, ctx, + size_type(M), + size_type(0), + difference_type(1), + size_type(M), + size_type(N), + size_type(offC), + difference_type(1), + size_type(ldc) + VCL_ROW_MAJOR); + + if (TransA == CblasTrans && TransB == CblasTrans) + viennacl::linalg::prod_impl(viennacl::trans(matA), viennacl::trans(matB), + matC, alpha, beta); + else if (TransA == CblasTrans && TransB == CblasNoTrans) + viennacl::linalg::prod_impl(viennacl::trans(matA), matB, matC, alpha, + beta); + else if (TransA == CblasNoTrans && TransB == CblasTrans) + viennacl::linalg::prod_impl(matA, viennacl::trans(matB), matC, alpha, + beta); + else if (TransA == CblasNoTrans && TransB == CblasNoTrans) + viennacl::linalg::prod_impl(matA, matB, matC, alpha, beta); + +#endif // clBLAS, CLBlast, or default (ViennaCL) + } } -// Explicit instantiations -template void greentea_copy(const int_tp N, const cl_mem X, - const int_tp offX, - int_tp* Y, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const cl_mem X, - const int_tp offX, uint_tp* Y, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const cl_mem X, - const int_tp offX, float* Y, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const cl_mem X, - const int_tp offX, double* Y, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const int_tp* X, cl_mem Y, - const int_tp offY, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const uint_tp* X, cl_mem Y, - const int_tp offY, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const float* X, cl_mem Y, - const int_tp offY, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const double* X, cl_mem Y, - const int_tp offY, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const cl_mem X, - const int_tp offX, cl_mem Y, - const int_tp offY, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const cl_mem X, - const int_tp offX, cl_mem Y, - const int_tp offY, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const cl_mem X, - const int_tp offX, cl_mem Y, - const int_tp offY, - viennacl::ocl::context *ctx); -template void greentea_copy(const int_tp N, const cl_mem X, - const int_tp offX, cl_mem Y, - const int_tp offY, - viennacl::ocl::context *ctx); +template<> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int_tp M, + const int_tp N, const int_tp K, const double alpha, + const double* A, const double* B, const double beta, + double* C) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufA = clState.get_buffer_mem(A); + ClMemOff bufB = clState.get_buffer_mem(B); + ClMemOff bufC = clState.get_buffer_mem(C); -template -void greentea_gpu_gemm(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int_tp M, - const int_tp N, const int_tp K, const Dtype alpha, - const cl_mem A, const int_tp offA, const cl_mem B, - const int_tp offB, const Dtype beta, cl_mem C, - const int_tp offC) { - 
viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); + cl_mem Mem_A = bufA.memobj; + cl_mem Mem_B = bufB.memobj; + cl_mem Mem_C = bufC.memobj; + + int offA = bufA.offset; + int offB = bufB.offset; + int offC = bufC.offset; + + int dev_id = clState.get_mem_dev(Mem_A); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { - Dtype* Aptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), A, true, CL_MAP_READ, - sizeof(Dtype) * offA, sizeof(Dtype) * M * K, 0, NULL, NULL, NULL)); - Dtype* Bptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), B, true, CL_MAP_READ, - sizeof(Dtype) * offB, sizeof(Dtype) * N * K, 0, NULL, NULL, NULL)); - Dtype* Cptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), C, true, CL_MAP_READ | CL_MAP_WRITE, - sizeof(Dtype) * offC, sizeof(Dtype) * M * N, 0, NULL, NULL, NULL)); - - caffe_cpu_gemm(TransA, TransB, M, N, K, alpha, Aptr, Bptr, beta, + double* Aptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_A, true, CL_MAP_READ, + sizeof(double) * offA, sizeof(double) * M * K, 0, NULL, NULL, NULL)); + double* Bptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_B, true, CL_MAP_READ, + sizeof(double) * offB, sizeof(double) * N * K, 0, NULL, NULL, NULL)); + double* Cptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_C, true, CL_MAP_READ | CL_MAP_WRITE, + sizeof(double) * offC, sizeof(double) * M * N, 0, NULL, NULL, NULL)); + + caffe_cpu_gemm(TransA, TransB, M, N, K, alpha, Aptr, Bptr, beta, Cptr); - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), A, Aptr, 0, NULL, - NULL); - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), B, Bptr, 0, NULL, - NULL); - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), C, Cptr, 0, NULL, - NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_A, Aptr, + 0, NULL, NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_B, Bptr, + 0, NULL, NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_C, Cptr, + 0, NULL, NULL); } else { int_tp lda = (TransA == CblasNoTrans) ? K : M; int_tp ldb = (TransB == CblasNoTrans) ? N : K; @@ -219,17 +291,10 @@ void greentea_gpu_gemm(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA, cl_command_queue queue = ctx.get_queue().handle().get(); - if (std::is_same::value) { - GREENTEA_CL_BLAS_CHECK( - clblasSgemm(clOrder, clTransA, clTransB, - M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, - C, offC, ldc, 1, &queue, 0, NULL, NULL)); - } else { - GREENTEA_CL_BLAS_CHECK( - clblasDgemm(clOrder, clTransA, clTransB, - M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, - C, offC, ldc, 1, &queue, 0, NULL, NULL)); - } + GREENTEA_CL_BLAS_CHECK( + clblasDgemm(clOrder, clTransA, clTransB, + M, N, K, alpha, Mem_A, offA, lda, Mem_B, offB, ldb, beta, + Mem_C, offC, ldc, 1, &queue, 0, NULL, NULL)); #elif defined(USE_CLBLAST) @@ -241,35 +306,22 @@ void greentea_gpu_gemm(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA, clblast::Transpose b_transpose = (TransB == CblasNoTrans) ? 
clblast::Transpose::kNo : clblast::Transpose::kYes; - if (std::is_same::value) { - GREENTEA_CLBLAST_CHECK( - clblast::Gemm( - layout, a_transpose, b_transpose, - M, N, K, - alpha, - A, offA, lda, - B, offB, ldb, - beta, - C, offC, ldc, - &queue)); - } else { - GREENTEA_CLBLAST_CHECK( - clblast::Gemm( - layout, a_transpose, b_transpose, - M, N, K, - alpha, - A, offA, lda, - B, offB, ldb, - beta, - C, offC, ldc, - &queue)); - } + GREENTEA_CLBLAST_CHECK( + clblast::Gemm( + layout, a_transpose, b_transpose, + M, N, K, + alpha, + Mem_A, offA, lda, + Mem_B, offB, ldb, + beta, + Mem_C, offC, ldc, + &queue)); #else // default (ViennaCL) - typedef typename viennacl::matrix_base::size_type size_type; - typedef typename viennacl::matrix_base::size_type difference_type; size_type A_size1 = static_cast((TransA == CblasTrans) ? K : M); @@ -278,33 +330,38 @@ void greentea_gpu_gemm(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA, size_type B_size1 = static_cast((TransB == CblasTrans) ? N : K); size_type B_size2 = static_cast((TransB == CblasTrans) ? K : N); - viennacl::matrix_base matA(A, ctx, A_size1, - size_type(0), - difference_type(1), - size_type(M), A_size2, - size_type(offA), - difference_type(1), - size_type(lda) - VCL_ROW_MAJOR); - - viennacl::matrix_base matB(B, ctx, B_size1, - size_type(0), - difference_type(1), - size_type(K), B_size2, - size_type(offB), - difference_type(1), - size_type(ldb) - VCL_ROW_MAJOR); - - viennacl::matrix_base matC(C, ctx, size_type(M), - size_type(0), - difference_type(1), - size_type(M), - size_type(N), - size_type(offC), - difference_type(1), - size_type(ldc) - VCL_ROW_MAJOR); + viennacl::matrix_base matA(Mem_A, ctx, + A_size1, + size_type(0), + difference_type(1), + size_type(M), + A_size2, + size_type(offA), + difference_type(1), + size_type(lda) + VCL_ROW_MAJOR); + + viennacl::matrix_base matB(Mem_B, ctx, + B_size1, + size_type(0), + difference_type(1), + size_type(K), + B_size2, + size_type(offB), + difference_type(1), + size_type(ldb) + VCL_ROW_MAJOR); + + viennacl::matrix_base matC(Mem_C, ctx, + size_type(M), + size_type(0), + difference_type(1), + size_type(M), + size_type(N), + size_type(offC), + difference_type(1), + size_type(ldc) + VCL_ROW_MAJOR); if (TransA == CblasTrans && TransB == CblasTrans) viennacl::linalg::prod_impl(viennacl::trans(matA), viennacl::trans(matB), @@ -322,56 +379,50 @@ void greentea_gpu_gemm(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA, } } -template void greentea_gpu_gemm(const int_tp ctx_id, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int_tp M, const int_tp N, - const int_tp K, const float alpha, - const cl_mem A, const int_tp offA, - const cl_mem B, const int_tp offB, - const float beta, cl_mem C, - const int_tp offC); -template void greentea_gpu_gemm(const int_tp ctx_id, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int_tp M, const int_tp N, - const int_tp K, const double alpha, - const cl_mem A, const int_tp offA, - const cl_mem B, const int_tp offB, - const double beta, cl_mem C, - const int_tp offC); +template<> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int_tp M, + const int_tp N, const float alpha, const float* A, + const float* x, const float beta, float* y) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufA = clState.get_buffer_mem(A); + ClMemOff bufx = clState.get_buffer_mem(x); + ClMemOff bufy = clState.get_buffer_mem(y); -template -void greentea_gpu_gemv(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA, - const int_tp M, const int_tp N, 
const Dtype alpha,
- const cl_mem A, const int_tp offA, const cl_mem x,
- const int_tp offx, const Dtype beta, cl_mem y,
- const int_tp offy) {
- viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id);
+ cl_mem Mem_A = bufA.memobj;
+ cl_mem Mem_x = bufx.memobj;
+ cl_mem Mem_y = bufy.memobj;
+
+ int offA = bufA.offset;
+ int offx = bufx.offset;
+ int offy = bufy.offset;
+
+ int dev_id = clState.get_mem_dev(Mem_A);
+
+ viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
 if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) {
- Dtype* Aptr = reinterpret_cast<Dtype*>(clEnqueueMapBuffer(
- ctx.get_queue().handle().get(), A, true, CL_MAP_READ,
- sizeof(Dtype) * offA, sizeof(Dtype) * M * N, 0, NULL, NULL, NULL));
- Dtype* xptr = reinterpret_cast<Dtype*>(clEnqueueMapBuffer(
- ctx.get_queue().handle().get(), x, true, CL_MAP_READ,
- sizeof(Dtype) * offx, sizeof(Dtype) * (TransA == CblasTrans) ? M : N, 0,
+ float* Aptr = reinterpret_cast<float*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_A, true, CL_MAP_READ,
+ sizeof(float) * offA, sizeof(float) * M * N, 0, NULL, NULL, NULL));
+ float* xptr = reinterpret_cast<float*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_x, true, CL_MAP_READ,
+ sizeof(float) * offx, sizeof(float) * ((TransA == CblasTrans) ? M : N), 0,
 NULL, NULL, NULL));
- Dtype* yptr = reinterpret_cast<Dtype*>(clEnqueueMapBuffer(
- ctx.get_queue().handle().get(), y, true, CL_MAP_READ | CL_MAP_WRITE,
- sizeof(Dtype) * offy, sizeof(Dtype) * (TransA == CblasTrans) ? N : M, 0,
+ float* yptr = reinterpret_cast<float*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_y, true, CL_MAP_READ | CL_MAP_WRITE,
+ sizeof(float) * offy, sizeof(float) * ((TransA == CblasTrans) ? N : M), 0,
 NULL, NULL, NULL));
- caffe_cpu_gemv<Dtype>(TransA, M, N, alpha, Aptr, xptr, beta, yptr);
+ caffe_cpu_gemv<float>(TransA, M, N, alpha, Aptr, xptr, beta, yptr);
- clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), A, Aptr, 0, NULL,
- NULL);
- clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), x, xptr, 0, NULL,
- NULL);
- clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), y, yptr, 0, NULL,
- NULL);
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_A, Aptr,
+ 0, NULL, NULL);
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_x, xptr,
+ 0, NULL, NULL);
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_y, yptr,
+ 0, NULL, NULL);
 } else {
#if defined(USE_CLBLAS)
@@ -380,17 +431,10 @@ void greentea_gpu_gemv(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA,
 cl_command_queue queue = ctx.get_queue().handle().get();
- if (std::is_same<Dtype, float>::value) {
- GREENTEA_CL_BLAS_CHECK(
- clblasSgemv(clblasRowMajor,
- clTransA, M, N, alpha, A, offA, N, x, offx, 1,
- beta, y, offy, 1, 1, &queue, 0, NULL, NULL));
- } else {
- GREENTEA_CL_BLAS_CHECK(
- clblasDgemv(clblasRowMajor,
- clTransA, M, N, alpha, A, offA, N, x, offx, 1,
- beta, y, offy, 1, 1, &queue, 0, NULL, NULL));
- }
+ GREENTEA_CL_BLAS_CHECK(
+ clblasSgemv(clblasRowMajor,
+ clTransA, M, N, alpha, Mem_A, offA, N, Mem_x, offx, 1,
+ beta, Mem_y, offy, 1, 1, &queue, 0, NULL, NULL));
#elif defined(USE_CLBLAST)
@@ -404,52 +448,40 @@ void greentea_gpu_gemv(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA,
 const size_t incx = 1;
 const size_t incy = 1;
- if (std::is_same<Dtype, float>::value) {
- GREENTEA_CLBLAST_CHECK(
- clblast::Gemv<float>(
- layout, a_transpose,
- M, N,
- alpha,
- A, offA, ldA,
- x, offx, incx,
- beta,
- y, offy, incy,
- &queue));
- } else {
- GREENTEA_CLBLAST_CHECK(
- clblast::Gemv<double>(
- layout, a_transpose,
- M, N,
- alpha,
- A, offA, ldA,
- x, offx, incx,
-
beta, - y, offy, incy, - &queue)); - } + GREENTEA_CLBLAST_CHECK( + clblast::Gemv( + layout, a_transpose, + M, N, + alpha, + Mem_A, offA, ldA, + Mem_x, offx, incx, + beta, + Mem_y, offy, incy, + &queue)); #else // default (ViennaCL) - typedef typename viennacl::vector_base::size_type size_type; - typedef typename viennacl::vector_base::size_type difference_type; - viennacl::vector_base v1( - x, size_type((TransA == CblasTrans) ? M : N), size_type(offx), + viennacl::vector_base v1( + Mem_x, size_type((TransA == CblasTrans) ? M : N), size_type(offx), difference_type(1), ctx); - viennacl::vector_base v2( - y, size_type((TransA == CblasTrans) ? N : M), size_type(offy), + viennacl::vector_base v2( + Mem_y, size_type((TransA == CblasTrans) ? N : M), size_type(offy), difference_type(1), ctx); - viennacl::matrix_base mat(A, ctx, size_type(M), - size_type(0), - difference_type(1), - size_type(M), - size_type(N), - size_type(offA), - difference_type(1), - size_type(N) - VCL_ROW_MAJOR); + viennacl::matrix_base mat(Mem_A, ctx, + size_type(M), + size_type(0), + difference_type(1), + size_type(M), + size_type(N), + size_type(offA), + difference_type(1), + size_type(N) + VCL_ROW_MAJOR); v2 *= beta; if (TransA == CblasTrans) { v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1); @@ -461,56 +493,163 @@ void greentea_gpu_gemv(const int_tp ctx_id, const CBLAS_TRANSPOSE TransA, } } -template void greentea_gpu_gemv(const int_tp ctx_id, - const CBLAS_TRANSPOSE TransA, - const int_tp M, const int_tp N, - const float alpha, const cl_mem A, - const int_tp offA, const cl_mem x, - const int_tp offx, const float beta, - cl_mem y, const int_tp offy); -template void greentea_gpu_gemv(const int_tp ctx_id, - const CBLAS_TRANSPOSE TransA, - const int_tp M, const int_tp N, - const double alpha, const cl_mem A, - const int_tp offA, const cl_mem x, - const int_tp offx, const double beta, - cl_mem y, const int_tp offy); +template<> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int_tp M, + const int_tp N, const double alpha, const double* A, + const double* x, const double beta, double* y) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufA = clState.get_buffer_mem(A); + ClMemOff bufx = clState.get_buffer_mem(x); + ClMemOff bufy = clState.get_buffer_mem(y); -template -void greentea_gpu_axpy(const int_tp ctx_id, const int_tp N, const Dtype alpha, - const cl_mem X, const int_tp offX, cl_mem Y, - const int_tp offY) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); + cl_mem Mem_A = bufA.memobj; + cl_mem Mem_x = bufx.memobj; + cl_mem Mem_y = bufy.memobj; + + int offA = bufA.offset; + int offx = bufx.offset; + int offy = bufy.offset; + + int dev_id = clState.get_mem_dev(Mem_A); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { - Dtype* Xptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), X, true, CL_MAP_READ, - sizeof(Dtype) * offX, sizeof(Dtype) * N, 0, NULL, NULL, NULL)); - Dtype* Yptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), Y, true, CL_MAP_WRITE, - sizeof(Dtype) * offY, sizeof(Dtype) * N, 0, NULL, NULL, NULL)); - - caffe_axpy(N, alpha, Xptr, Yptr); - - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), X, Xptr, 0, NULL, - NULL); - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Y, Yptr, 0, NULL, - NULL); + double* Aptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_A, true, CL_MAP_READ, + sizeof(double) * offA, 
sizeof(double) * M * N, 0, NULL, NULL, NULL));
+ double* xptr = reinterpret_cast<double*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_x, true, CL_MAP_READ,
+ sizeof(double) * offx,
+ sizeof(double) * ((TransA == CblasTrans) ? M : N), 0,
+ NULL,
+ NULL, NULL));
+ double* yptr = reinterpret_cast<double*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_y, true,
+ CL_MAP_READ | CL_MAP_WRITE,
+ sizeof(double) * offy,
+ sizeof(double) * ((TransA == CblasTrans) ? N : M), 0,
+ NULL,
+ NULL, NULL));
+
+ caffe_cpu_gemv<double>(TransA, M, N, alpha, Aptr, xptr, beta, yptr);
+
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_A, Aptr,
+ 0, NULL, NULL);
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_x, xptr,
+ 0, NULL, NULL);
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_y, yptr,
+ 0, NULL, NULL);
 } else {
#if defined(USE_CLBLAS)
+ clblasTranspose clTransA =
+ (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+
+ cl_command_queue queue = ctx.get_queue().handle().get();
+
+ GREENTEA_CL_BLAS_CHECK(
+ clblasDgemv(clblasRowMajor,
+ clTransA, M, N, alpha, Mem_A, offA, N, Mem_x, offx, 1,
+ beta, Mem_y, offy, 1, 1, &queue, 0, NULL, NULL));
+
+#elif defined(USE_CLBLAST)
+
 cl_command_queue queue = ctx.get_queue().handle().get();
- if (std::is_same<Dtype, float>::value) {
- GREENTEA_CL_BLAS_CHECK(
- clblasSaxpy(N, alpha, X, offX,
- 1, Y, offY, 1, 1, &queue, 0, NULL, NULL));
+ clblast::Layout layout = clblast::Layout::kRowMajor;
+ clblast::Transpose a_transpose = (TransA == CblasNoTrans) ?
+ clblast::Transpose::kNo : clblast::Transpose::kYes;
+
+ const size_t ldA = N;
+ const size_t incx = 1;
+ const size_t incy = 1;
+
+ GREENTEA_CLBLAST_CHECK(
+ clblast::Gemv<double>(
+ layout, a_transpose,
+ M, N,
+ alpha,
+ Mem_A, offA, ldA,
+ Mem_x, offx, incx,
+ beta,
+ Mem_y, offy, incy,
+ &queue));
+
+#else // default (ViennaCL)
+
+ typedef typename viennacl::vector_base<double>::size_type size_type;
+ typedef typename viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(
+ Mem_x, size_type((TransA == CblasTrans) ? M : N), size_type(offx),
+ difference_type(1), ctx);
+ viennacl::vector_base<double> v2(
+ Mem_y, size_type((TransA == CblasTrans) ?
N : M), size_type(offy), + difference_type(1), ctx); + viennacl::matrix_base mat(Mem_A, ctx, + size_type(M), + size_type(0), + difference_type(1), + size_type(M), + size_type(N), + size_type(offA), + difference_type(1), + size_type(N) + VCL_ROW_MAJOR); + v2 *= beta; + if (TransA == CblasTrans) { + v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1); } else { - GREENTEA_CL_BLAS_CHECK( - clblasDaxpy(N, alpha, X, offX, - 1, Y, offY, 1, 1, &queue, 0, NULL, NULL)); + v2 += alpha * viennacl::linalg::prod(mat, v1); } +#endif // clBLAS, CLBlast, or default (ViennaCL) + } +} + +template<> +void caffe_gpu_axpy(const int_tp N, const float alpha, const float* X, + float* Y) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufX = clState.get_buffer_mem(X); + ClMemOff bufY = clState.get_buffer_mem(Y); + + cl_mem Mem_X = bufX.memobj; + cl_mem Mem_Y = bufY.memobj; + + int offX = bufX.offset; + int offY = bufY.offset; + + int dev_id = clState.get_mem_dev(Mem_X); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + + if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { + float* Xptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ, + sizeof(float) * offX, sizeof(float) * N, 0, NULL, NULL, NULL)); + float* Yptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_Y, true, CL_MAP_WRITE, + sizeof(float) * offY, sizeof(float) * N, 0, NULL, NULL, NULL)); + + caffe_axpy(N, alpha, Xptr, Yptr); + + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, Xptr, + 0, NULL, NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_Y, Yptr, + 0, NULL, NULL); + } else { +#if defined(USE_CLBLAS) + + cl_command_queue queue = ctx.get_queue().handle().get(); + + GREENTEA_CL_BLAS_CHECK( + clblasSaxpy(N, alpha, Mem_X, offX, + 1, Mem_Y, offY, 1, 1, &queue, 0, NULL, NULL)); + #elif defined(USE_CLBLAST) cl_command_queue queue = ctx.get_queue().handle().get(); @@ -518,35 +657,25 @@ void greentea_gpu_axpy(const int_tp ctx_id, const int_tp N, const Dtype alpha, const size_t incX = 1; const size_t incY = 1; - if (std::is_same::value) { - GREENTEA_CLBLAST_CHECK( - clblast::Axpy( - N, - alpha, - X, offX, incX, - Y, offY, incY, - &queue)); - } else { - GREENTEA_CLBLAST_CHECK( - clblast::Axpy( - N, - alpha, - X, offX, incX, - Y, offY, incY, - &queue)); - } + GREENTEA_CLBLAST_CHECK( + clblast::Axpy( + N, + alpha, + Mem_X, offX, incX, + Mem_Y, offY, incY, + &queue)); #else // default (ViennaCL) - typedef typename viennacl::vector_base::size_type size_type; - typedef typename viennacl::vector_base::size_type difference_type; - viennacl::vector_base v1(X, size_type(N), + viennacl::vector_base v1(Mem_X, size_type(N), size_type(offX), difference_type(1), ctx); - viennacl::vector_base v2(Y, size_type(N), + viennacl::vector_base v2(Mem_Y, size_type(N), size_type(offY), difference_type(1), ctx); v2 += alpha * v1; @@ -555,470 +684,799 @@ void greentea_gpu_axpy(const int_tp ctx_id, const int_tp N, const Dtype alpha, } } -template void greentea_gpu_axpy(const int_tp ctx_id, const int_tp N, - const float alpha, const cl_mem X, - const int_tp offX, cl_mem Y, - const int_tp offY); -template void greentea_gpu_axpy(const int_tp ctx_id, const int_tp N, - const double alpha, const cl_mem X, - const int_tp offX, cl_mem Y, - const int_tp offY); - -template -void greentea_gpu_mul(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy) { - 
viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false)) - ->program(); - - viennacl::ocl::kernel &oclk_mul = program.get_kernel(CL_KERNEL_SELECT("mul")); - viennacl::ocl::enqueue( - oclk_mul(N, WrapHandle(a, &ctx), offa, WrapHandle(b, &ctx), offb, - WrapHandle(y, &ctx), offy), - ctx.get_queue()); -} - -template void greentea_gpu_mul(const int_tp ctx_id, const int_tp N, - const cl_mem a, const int_tp offa, - const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); -template void greentea_gpu_mul(const int_tp ctx_id, const int_tp N, - const cl_mem a, const int_tp offa, - const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); +template<> +void caffe_gpu_axpy(const int_tp N, const double alpha, const double* X, + double* Y) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufX = clState.get_buffer_mem(X); + ClMemOff bufY = clState.get_buffer_mem(Y); -template -void greentea_gpu_div(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false)) - ->program(); + cl_mem Mem_X = bufX.memobj; + cl_mem Mem_Y = bufY.memobj; - viennacl::ocl::kernel &oclk_div = program.get_kernel(CL_KERNEL_SELECT("div")); - viennacl::ocl::enqueue( - oclk_div(N, WrapHandle(a, &ctx), offa, WrapHandle(b, &ctx), offb, - WrapHandle(y, &ctx), offy), - ctx.get_queue()); -} + int offX = bufX.offset; + int offY = bufY.offset; -template void greentea_gpu_div(const int_tp ctx_id, const int_tp N, - const cl_mem a, const int_tp offa, - const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); -template void greentea_gpu_div(const int_tp ctx_id, const int_tp N, - const cl_mem a, const int_tp offa, - const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); + int dev_id = clState.get_mem_dev(Mem_X); -template -void greentea_gpu_scal(const int_tp ctx_id, const int_tp N, const Dtype alpha, - cl_mem x, int_tp offx) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { - Dtype* xptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), x, true, CL_MAP_READ | CL_MAP_WRITE, - sizeof(Dtype) * offx, sizeof(Dtype) * N, 0, NULL, NULL, NULL)); - - caffe_scal(N, alpha, xptr); - - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), x, xptr, 0, NULL, - NULL); + double* Xptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ, + sizeof(double) * offX, sizeof(double) * N, 0, NULL, NULL, NULL)); + double* Yptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_Y, true, CL_MAP_WRITE, + sizeof(double) * offY, sizeof(double) * N, 0, NULL, NULL, NULL)); + + caffe_axpy(N, alpha, Xptr, Yptr); + + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, Xptr, + 0, NULL, NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_Y, Yptr, + 0, NULL, NULL); } else { #if defined(USE_CLBLAS) cl_command_queue queue = ctx.get_queue().handle().get(); - if (std::is_same::value) { - GREENTEA_CL_BLAS_CHECK(clblasSscal(N, alpha, x, offx, - 1, 1, &queue, 0, NULL, NULL)); - } else { - GREENTEA_CL_BLAS_CHECK(clblasDscal(N, alpha, x, offx, - 1, 1, &queue, 0, NULL, NULL)); - } + 
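  // Note on the arguments below: clBLAS takes a raw cl_mem plus an element
  // offset, which is exactly the (memobj, offset) pair that get_buffer_mem()
  // recovered from the caller's virtual pointer above. The call therefore
  // applies Y[i] = alpha * X[i] + Y[i] to the sub-buffer the pointer referred
  // to, mirroring what cublasDaxpy does on the CUDA code path.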
GREENTEA_CL_BLAS_CHECK( + clblasDaxpy(N, alpha, Mem_X, offX, + 1, Mem_Y, offY, 1, 1, &queue, 0, NULL, NULL)); #elif defined(USE_CLBLAST) cl_command_queue queue = ctx.get_queue().handle().get(); - const size_t incx = 1; + const size_t incX = 1; + const size_t incY = 1; - if (std::is_same::value) { - GREENTEA_CLBLAST_CHECK( - clblast::Scal( - N, - alpha, - x, offx, incx, - &queue)); - } else { - GREENTEA_CLBLAST_CHECK( - clblast::Scal( - N, - alpha, - x, offx, incx, - &queue)); - } + GREENTEA_CLBLAST_CHECK( + clblast::Axpy( + N, + alpha, + Mem_X, offX, incX, + Mem_Y, offY, incY, + &queue)); #else // default (ViennaCL) - typedef typename viennacl::vector_base::size_type size_type; - typedef typename viennacl::vector_base::size_type difference_type; - viennacl::vector_base v1(x, size_type(N), - size_type(offx), + viennacl::vector_base v1(Mem_X, size_type(N), + size_type(offX), difference_type(1), ctx); - v1 *= alpha; + viennacl::vector_base v2(Mem_Y, size_type(N), + size_type(offY), + difference_type(1), ctx); + v2 += alpha * v1; #endif // clBLAS, CLBlast, or default (ViennaCL) } } -template void greentea_gpu_scal(const int_tp ctx_id, const int_tp N, - const float alpha, cl_mem x, - const int_tp offx); -template void greentea_gpu_scal(const int_tp ctx_id, const int_tp N, - const double alpha, cl_mem x, - const int_tp offx); - -template -void greentea_gpu_axpby(const int_tp ctx_id, const int_tp N, const Dtype alpha, - const cl_mem X, const int_tp offX, const Dtype beta, - cl_mem Y, const int_tp offY) { - greentea_gpu_scal(ctx_id, N, beta, Y, offY); - greentea_gpu_axpy(ctx_id, N, alpha, X, offX, Y, offY); +void caffe_gpu_memcpy(const uint_tp N, const void* X, void* Y) { + if (X == Y) return; + + ClState& clState = Caffe::cl_state(); + + ClMemOff bufX = clState.get_buffer_mem(X); + ClMemOff bufY = clState.get_buffer_mem(Y); + int dev_id; + if (bufX.memobj != NULL) dev_id = clState.get_mem_dev(bufX.memobj); + else + dev_id = clState.get_mem_dev(bufY.memobj); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + + if (bufX.memobj != NULL && bufY.memobj != NULL) { + clEnqueueCopyBuffer(ctx.get_queue().handle().get(), bufX.memobj, + bufY.memobj, bufX.offset, + bufY.offset, N, 0, NULL, NULL); + } else if (bufX.memobj != NULL) { + clEnqueueReadBuffer(ctx.get_queue().handle().get(), bufX.memobj, + CL_TRUE, bufX.offset, N, + Y, 0, NULL, NULL); + } else if (bufY.memobj != NULL) { + clEnqueueWriteBuffer(ctx.get_queue().handle().get(), bufY.memobj, + CL_TRUE, bufY.offset, N, + X, 0, NULL, NULL); + } else { + memcpy(Y, X, N); + } } -template void greentea_gpu_axpby(const int_tp ctx_id, const int_tp N, - const float alpha, const cl_mem X, - const int_tp offX, const float beta, - cl_mem Y, const int_tp offY); +template<> +void caffe_gpu_scal(const int_tp N, const float alpha, float *X) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufX = clState.get_buffer_mem(X); -template void greentea_gpu_axpby(const int_tp ctx_id, const int_tp N, - const double alpha, const cl_mem X, - const int_tp offX, const double beta, - cl_mem Y, const int_tp offY); + cl_mem Mem_X = bufX.memobj; -template -void greentea_gpu_dot(const int_tp ctx_id, const int_tp n, const cl_mem X, - const int_tp offX, const cl_mem Y, const int_tp offY, - Dtype* out) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); + int offX = bufX.offset; - if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { - Dtype* Xptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), X, true, CL_MAP_READ, - 
sizeof(Dtype) * offX, sizeof(Dtype) * n, 0, NULL, NULL, NULL)); - Dtype* Yptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), Y, true, CL_MAP_READ, - sizeof(Dtype) * offY, sizeof(Dtype) * n, 0, NULL, NULL, NULL)); + int dev_id = clState.get_mem_dev(Mem_X); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); - *out = caffe_cpu_dot(n, Xptr, Yptr); + if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { + float* xptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ | CL_MAP_WRITE, + sizeof(float) * offX, sizeof(float) * N, 0, NULL, NULL, NULL)); - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), X, Xptr, 0, NULL, - NULL); - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Y, Yptr, 0, NULL, - NULL); + caffe_scal(N, alpha, xptr); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, xptr, + 0, NULL, NULL); } else { #if defined(USE_CLBLAS) cl_command_queue queue = ctx.get_queue().handle().get(); - cl_int err; - cl_mem gpuout = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE, - sizeof(Dtype), NULL, &err); - cl_mem scratch = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE, - n * sizeof(Dtype), NULL, &err); - - if (std::is_same::value) { - GREENTEA_CL_BLAS_CHECK( - clblasSdot(n, gpuout, 0, X, offX, 1, Y, - offY, 1, scratch, 1, &queue, 0, NULL, NULL)); - } else { - GREENTEA_CL_BLAS_CHECK( - clblasDdot(n, gpuout, 0, X, offX, 1, Y, - offY, 1, scratch, 1, &queue, 0, NULL, NULL)); - } - - greentea_gpu_memcpy(sizeof(Dtype), gpuout, 0, out, &ctx); - - clReleaseMemObject(gpuout); - clReleaseMemObject(scratch); + GREENTEA_CL_BLAS_CHECK(clblasSscal(N, alpha, Mem_X, offX, + 1, 1, &queue, 0, NULL, NULL)); #elif defined(USE_CLBLAST) cl_command_queue queue = ctx.get_queue().handle().get(); - cl_int err = CL_SUCCESS; - cl_mem Z = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE, - sizeof(Dtype), NULL, &err); - // TODO: error handling. 
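  // The TODO above applies equally to the "+" replacements later in this
  // file: clCreateBuffer and ClState::create_buffer report failure only
  // through their cl_int out-parameter. A minimal sketch of acting on it
  // (CHECK_EQ is the glog macro Caffe already uses; the message text is
  // illustrative):
  //
  //   cl_int err = CL_SUCCESS;
  //   cl_mem Z = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE,
  //                             sizeof(Dtype), NULL, &err);
  //   CHECK_EQ(err, CL_SUCCESS) << "clCreateBuffer failed with " << err;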
- - const size_t offZ = 0; - const size_t incX = 1; - const size_t incY = 1; - - if (std::is_same::value) { - GREENTEA_CLBLAST_CHECK( - clblast::Dot( - n, - Z, offZ, - X, offX, incX, - Y, offY, incY, - &queue)); - } else { - GREENTEA_CLBLAST_CHECK( - clblast::Dot( - n, - Z, offZ, - X, offX, incX, - Y, offY, incY, - &queue)); - } + const size_t incx = 1; - greentea_gpu_memcpy(sizeof(Dtype), Z, offZ, out, &ctx); - clReleaseMemObject(Z); + GREENTEA_CLBLAST_CHECK( + clblast::Scal( + N, + alpha, + Mem_X, offX, incx, + &queue)); #else // default (ViennaCL) - typedef typename viennacl::vector_base::size_type size_type; - typedef typename viennacl::vector_base::size_type difference_type; - viennacl::vector_base v1(X, size_type(n), + viennacl::vector_base v1(Mem_X, size_type(N), size_type(offX), difference_type(1), ctx); - viennacl::vector_base v2(Y, size_type(n), - size_type(offY), - difference_type(1), ctx); - - *out = viennacl::linalg::inner_prod(v1, v2); + v1 *= alpha; #endif // clBLAS, CLBlast, or default (ViennaCL) } } -template void greentea_gpu_dot(const int_tp ctx_id, const int_tp n, - const cl_mem X, const int_tp offX, - const cl_mem Y, const int_tp offY, - float* out); -template void greentea_gpu_dot(const int_tp ctx_id, const int_tp n, - const cl_mem X, const int_tp offX, - const cl_mem Y, const int_tp offY, - double* out); +template<> +void caffe_gpu_scal(const int_tp N, const double alpha, double *X) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufX = clState.get_buffer_mem(X); -template -void greentea_gpu_asum(const int_tp ctx_id, const int_tp n, const cl_mem X, - const int_tp offX, Dtype* Y) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); + cl_mem Mem_X = bufX.memobj; + + int offX = bufX.offset; + + int dev_id = clState.get_mem_dev(Mem_X); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { - Dtype* Xptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), X, true, CL_MAP_READ, - sizeof(Dtype) * offX, sizeof(Dtype) * n, 0, NULL, NULL, NULL)); + double* xptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ | CL_MAP_WRITE, + sizeof(double) * offX, sizeof(double) * N, 0, NULL, NULL, NULL)); - *Y = caffe_cpu_asum(n, Xptr); + caffe_scal(N, alpha, xptr); - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), X, Xptr, 0, NULL, - NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, xptr, + 0, NULL, NULL); } else { #if defined(USE_CLBLAS) cl_command_queue queue = ctx.get_queue().handle().get(); - cl_int err; - cl_mem gpuout = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE, - sizeof(Dtype), NULL, &err); - cl_mem scratch = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE, - n * sizeof(Dtype), NULL, &err); - - if (std::is_same::value) { - GREENTEA_CL_BLAS_CHECK( - clblasSasum(n, gpuout, 0, X, offX, 1, - scratch, 1, &queue, 0, NULL, NULL)); - } else { - GREENTEA_CL_BLAS_CHECK( - clblasDasum(n, gpuout, 0, X, offX, 1, - scratch, 1, &queue, 0, NULL, NULL)); - } - - greentea_gpu_memcpy(sizeof(Dtype), gpuout, 0, Y, &ctx); - - clReleaseMemObject(gpuout); - clReleaseMemObject(scratch); + GREENTEA_CL_BLAS_CHECK(clblasDscal(N, alpha, Mem_X, offX, + 1, 1, &queue, 0, NULL, NULL)); #elif defined(USE_CLBLAST) cl_command_queue queue = ctx.get_queue().handle().get(); - cl_int err = CL_SUCCESS; - cl_mem Z = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE, - sizeof(Dtype), NULL, &err); - // TODO: error 
handling. - - const size_t offZ = 0; - const size_t incX = 1; - - if (std::is_same::value) { - GREENTEA_CLBLAST_CHECK( - clblast::Asum( - n, - Z, offZ, - X, offX, incX, - &queue)); - } else { - GREENTEA_CLBLAST_CHECK( - clblast::Asum( - n, - Z, offZ, - X, offX, incX, - &queue)); - } - - greentea_gpu_memcpy(sizeof(Dtype), Z, offZ, Y, &ctx); + const size_t incx = 1; - clReleaseMemObject(Z); + GREENTEA_CLBLAST_CHECK( + clblast::Scal( + N, + alpha, + Mem_X, offX, incx, + &queue)); #else // default (ViennaCL) - typedef typename viennacl::vector_base::size_type size_type; - typedef typename viennacl::vector_base::size_type difference_type; - viennacl::vector_base v1(X, size_type(n), + viennacl::vector_base v1(Mem_X, size_type(N), size_type(offX), difference_type(1), ctx); - - *Y = viennacl::linalg::norm_1(v1); + v1 *= alpha; #endif // clBLAS, CLBlast, or default (ViennaCL) } } -template void greentea_gpu_asum(const int_tp ctx_id, const int_tp n, - const cl_mem X, const int_tp offX, - float* Y); -template void greentea_gpu_asum(const int_tp ctx_id, const int_tp n, - const cl_mem X, const int_tp offX, - double* Y); +template<> +void caffe_gpu_axpby(const int_tp N, const float alpha, const float* X, + const float beta, float* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); +} -template -void greentea_gpu_scale(const int_tp ctx_id, const int_tp n, const Dtype alpha, - const cl_mem X, const int_tp offX, cl_mem Y, - const int_tp offY) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); +template<> +void caffe_gpu_axpby(const int_tp N, const double alpha, + const double* X, const double beta, double* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); +} + +template<> +void caffe_gpu_dot(const int_tp n, const float* x, const float* y, + float* out) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufX = clState.get_buffer_mem(x); + ClMemOff bufY = clState.get_buffer_mem(y); + + cl_mem Mem_X = bufX.memobj; + cl_mem Mem_Y = bufY.memobj; + + int offX = bufX.offset; + int offY = bufY.offset; + + int dev_id = clState.get_mem_dev(Mem_X); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { - Dtype* Xptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), X, true, CL_MAP_READ, - sizeof(Dtype) * offX, sizeof(Dtype) * n, 0, NULL, NULL, NULL)); - Dtype* Yptr = reinterpret_cast(clEnqueueMapBuffer( - ctx.get_queue().handle().get(), Y, true, CL_MAP_WRITE, - sizeof(Dtype) * offY, sizeof(Dtype) * n, 0, NULL, NULL, NULL)); - - caffe_cpu_scale(n, alpha, Xptr, Yptr); - - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), X, Xptr, 0, NULL, - NULL); - clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Y, Yptr, 0, NULL, - NULL); + float* Xptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ, + sizeof(float) * offX, sizeof(float) * n, 0, NULL, NULL, NULL)); + float* Yptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_Y, true, CL_MAP_READ, + sizeof(float) * offY, sizeof(float) * n, 0, NULL, NULL, NULL)); + + *out = caffe_cpu_dot(n, Xptr, Yptr); + + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, Xptr, + 0, NULL, NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_Y, Yptr, + 0, NULL, NULL); + } else { #if defined(USE_CLBLAS) - // FIXME: Remove, as can reuse ctx obtained above? 
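  // The FIXME above is apt: viennacl::ocl::get_context() was already bound to
  // a reference at the top of this function, so the local copy on the next
  // line is redundant; the queue can be taken from the existing ctx directly:
  //
  //   cl_command_queue queue = ctx.get_queue().handle().get();
  //
  // which is exactly what the "+" side of this hunk does.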
- viennacl::ocl::context ctx = viennacl::ocl::get_context(ctx_id); cl_command_queue queue = ctx.get_queue().handle().get(); - // FIXME: Use xAXPY with beta = 0? - if (std::is_same::value) { - GREENTEA_CL_BLAS_CHECK( - clblasScopy(n, X, offX, 1, Y, offY, 1, 1, &queue, 0, NULL, NULL)); - GREENTEA_CL_BLAS_CHECK( - clblasSscal(n, alpha, Y, offY, 1, 1, &queue, 0, NULL, NULL)); - } else { - GREENTEA_CL_BLAS_CHECK( - clblasDcopy(n, X, offX, 1, Y, offY, 1, 1, &queue, 0, NULL, NULL)); - GREENTEA_CL_BLAS_CHECK( - clblasDscal(n, alpha, Y, offY, 1, 1, &queue, 0, NULL, NULL)); - } + cl_int err; + float* memDotP = static_cast(clState.create_buffer(dev_id, + CL_MEM_WRITE_ONLY, sizeof(cl_float), NULL, &err)); + float* memScratch = static_cast(clState.create_buffer(dev_id, + CL_MEM_READ_WRITE, n * sizeof(cl_float), NULL, &err)); + ClMemOff bufDotP = clState.get_buffer_mem(memDotP); + ClMemOff bufScratch = clState.get_buffer_mem(memScratch); + + GREENTEA_CL_BLAS_CHECK( + clblasSdot(n, bufDotP.memobj, 0, Mem_X, offX, 1, Mem_Y, + offY, 1, bufScratch.memobj, 1, &queue, 0, NULL, NULL)); + + caffe_gpu_memcpy(sizeof(float), memDotP, out); + + clState.destroy_buffer(memScratch); + clState.destroy_buffer(memDotP); #elif defined(USE_CLBLAST) cl_command_queue queue = ctx.get_queue().handle().get(); + cl_int err = CL_SUCCESS; + + float* Z = static_cast(clState.create_buffer(dev_id, + CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err)); + ClMemOff bufZ = clState.get_buffer_mem(Z); + cl_mem Mem_Z = bufZ.memobj; + // TODO: error handling. + + const size_t offZ = 0; const size_t incX = 1; const size_t incY = 1; - if (std::is_same::value) { - GREENTEA_CLBLAST_CHECK( - clblast::Copy( - n, - X, offX, incX, - Y, offY, incY, - &queue)); - GREENTEA_CLBLAST_CHECK( - clblast::Scal( - n, - alpha, - Y, offY, incY, - &queue)); - } else { - GREENTEA_CLBLAST_CHECK( - clblast::Copy( - n, - X, offX, incX, - Y, offY, incY, - &queue)); - GREENTEA_CLBLAST_CHECK( - clblast::Scal( - n, - alpha, - Y, offY, incY, - &queue)); - } + GREENTEA_CLBLAST_CHECK( + clblast::Dot( + n, + Mem_Z, offZ, + Mem_X, offX, incX, + Mem_Y, offY, incY, + &queue)); + + caffe_gpu_memcpy(sizeof(float), Z, out); + clState.destroy_buffer(Z); #else // default (ViennaCL) - typedef typename viennacl::vector_base::size_type size_type; - typedef typename viennacl::vector_base::size_type difference_type; - viennacl::vector_base v1(X, size_type(n), + viennacl::vector_base v1(Mem_X, size_type(n), size_type(offX), difference_type(1), ctx); - viennacl::vector_base v2(Y, size_type(n), + viennacl::vector_base v2(Mem_Y, size_type(n), size_type(offY), difference_type(1), ctx); - v2 = v1 * alpha; + *out = viennacl::linalg::inner_prod(v1, v2); #endif // clBLAS, CLBlast, or default (ViennaCL) } } -template void greentea_gpu_scale(const int_tp ctx_id, const int_tp n, - const float alpha, const cl_mem X, - const int_tp offX, cl_mem Y, - const int_tp offY); +template<> +void caffe_gpu_dot(const int_tp n, const double* x, const double* y, + double * out) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufX = clState.get_buffer_mem(x); + ClMemOff bufY = clState.get_buffer_mem(y); -template void greentea_gpu_scale(const int_tp ctx_id, const int_tp n, - const double alpha, const cl_mem X, - const int_tp offX, cl_mem Y, - const int_tp offY); + cl_mem Mem_X = bufX.memobj; + cl_mem Mem_Y = bufY.memobj; -template -void greentea_gpu_set(const int_tp ctx_id, const int_tp N, const Dtype alpha, - cl_mem Y, const int_tp offY) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - 
viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false))
+ int offX = bufX.offset;
+ int offY = bufY.offset;
+
+ int dev_id = clState.get_mem_dev(Mem_X);
+
+ viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+
+ if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) {
+ double* Xptr = reinterpret_cast<double*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ,
+ sizeof(double) * offX, sizeof(double) * n, 0, NULL, NULL, NULL));
+ double* Yptr = reinterpret_cast<double*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_Y, true, CL_MAP_READ,
+ sizeof(double) * offY, sizeof(double) * n, 0, NULL, NULL, NULL));
+
+ *out = caffe_cpu_dot(n, Xptr, Yptr);
+
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, Xptr, 0, NULL,
+ NULL);
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_Y, Yptr, 0, NULL,
+ NULL);
+
+ } else {
+#if defined(USE_CLBLAS)
+
+ cl_command_queue queue = ctx.get_queue().handle().get();
+
+ cl_int err;
+ double* memDotP = static_cast<double*>(clState.create_buffer(dev_id,
+ CL_MEM_WRITE_ONLY, sizeof(cl_double), NULL, &err));
+ double* memScratch = static_cast<double*>(clState.create_buffer(dev_id,
+ CL_MEM_READ_WRITE, n * sizeof(cl_double), NULL, &err));
+ ClMemOff<double> bufDotP = clState.get_buffer_mem(memDotP);
+ ClMemOff<double> bufScratch = clState.get_buffer_mem(memScratch);
+
+ GREENTEA_CL_BLAS_CHECK(
+ clblasDdot(n, bufDotP.memobj, 0, Mem_X, offX, 1, Mem_Y,
+ offY, 1, bufScratch.memobj, 1, &queue, 0, NULL, NULL));
+
+ caffe_gpu_memcpy(sizeof(double), memDotP, out);
+
+ clState.destroy_buffer(memScratch);
+ clState.destroy_buffer(memDotP);
+
+#elif defined(USE_CLBLAST)
+
+ cl_command_queue queue = ctx.get_queue().handle().get();
+
+ cl_int err = CL_SUCCESS;
+
+ double* Z = static_cast<double*>(clState.create_buffer(dev_id,
+ CL_MEM_READ_WRITE, sizeof(cl_double), NULL, &err));
+ ClMemOff<double> bufZ = clState.get_buffer_mem(Z);
+ cl_mem Mem_Z = bufZ.memobj;
+ // TODO: error handling.
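  // Usage note (illustrative, not part of this patch): because the virtual
  // pointer carries its cl_mem and offset implicitly, layer code can call
  // this function the same way the CUDA path does, e.g. for a squared norm:
  //
  //   double dot;
  //   caffe_gpu_dot(count, diff->gpu_data(), diff->gpu_data(), &dot);
  //
  // where diff is a Blob<double>* and gpu_data() returns the mapped pointer.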
+ + const size_t offZ = 0; + const size_t incX = 1; + const size_t incY = 1; + + GREENTEA_CLBLAST_CHECK( + clblast::Dot( + n, + Mem_Z, offZ, + Mem_X, offX, incX, + Mem_Y, offY, incY, + &queue)); + + caffe_gpu_memcpy(sizeof(double), Z, out); + clState.destroy_buffer(Z); + +#else // default (ViennaCL) + + typedef typename viennacl::vector_base::size_type size_type; + typedef typename viennacl::vector_base::size_type difference_type; + + viennacl::vector_base v1(Mem_X, size_type(n), + size_type(offX), + difference_type(1), ctx); + viennacl::vector_base v2(Mem_Y, size_type(n), + size_type(offY), + difference_type(1), ctx); + + *out = viennacl::linalg::inner_prod(v1, v2); + +#endif // clBLAS, CLBlast, or default (ViennaCL) + } +} + +template<> +void caffe_gpu_asum(const int_tp n, const float* x, float* y) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufX = clState.get_buffer_mem(x); + + cl_mem Mem_X = bufX.memobj; + + int offX = bufX.offset; + + int dev_id = clState.get_mem_dev(Mem_X); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + + if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { + float* Xptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ, + sizeof(float) * offX, sizeof(float) * n, 0, NULL, NULL, NULL)); + + *y = caffe_cpu_asum(n, Xptr); + + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, Xptr, + 0, NULL, NULL); + } else { +#if defined(USE_CLBLAS) + + cl_command_queue queue = ctx.get_queue().handle().get(); + + cl_int err; + float* memAsum = static_cast(clState.create_buffer(dev_id, + CL_MEM_WRITE_ONLY, sizeof(cl_float), NULL, &err)); + float* memScratch = static_cast(clState.create_buffer(dev_id, + CL_MEM_READ_WRITE, n * sizeof(cl_float), NULL, &err)); + ClMemOff bufAsum = clState.get_buffer_mem(memAsum); + ClMemOff bufScratch = clState.get_buffer_mem(memScratch); + + GREENTEA_CL_BLAS_CHECK( + clblasSasum(n, bufAsum.memobj, 0, Mem_X, offX, 1, + bufScratch.memobj, 1, &queue, 0, NULL, NULL)); + + caffe_gpu_memcpy(sizeof(float), memAsum, y); + + clState.destroy_buffer(memScratch); + clState.destroy_buffer(memAsum); + +#elif defined(USE_CLBLAST) + + cl_command_queue queue = ctx.get_queue().handle().get(); + + cl_int err = CL_SUCCESS; + float* Z = static_cast(clState.create_buffer(dev_id, + CL_MEM_WRITE_ONLY, sizeof(cl_float), NULL, &err)); + ClMemOff bufZ = clState.get_buffer_mem(Z); + cl_mem Mem_Z = bufZ.memobj; + // TODO: error handling. 
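  // Hypothetical helper, not part of this patch: the temporaries obtained
  // from create_buffer() in these reduction paths must always be released
  // with destroy_buffer(), so a small scope guard would make that pairing
  // automatic:
  //
  //   struct ClScratch {
  //     ClState& state;
  //     void* ptr;
  //     ClScratch(ClState& s, int dev, cl_mem_flags flags, size_t bytes,
  //               cl_int* err)
  //         : state(s), ptr(s.create_buffer(dev, flags, bytes, NULL, err)) {}
  //     ~ClScratch() { state.destroy_buffer(ptr); }
  //   };
  //
  // The explicit create/destroy calls are kept below to match the rest of
  // this file.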
+
+ const size_t offZ = 0;
+ const size_t incX = 1;
+
+ GREENTEA_CLBLAST_CHECK(
+ clblast::Asum<float>(
+ n,
+ Mem_Z, offZ,
+ Mem_X, offX, incX,
+ &queue));
+
+ caffe_gpu_memcpy(sizeof(float), Z, y);
+
+ clState.destroy_buffer(Z);
+
+#else // default (ViennaCL)
+
+ typedef typename viennacl::vector_base<float>::size_type size_type;
+ typedef typename viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(Mem_X, size_type(n),
+ size_type(offX),
+ difference_type(1), ctx);
+
+ *y = viennacl::linalg::norm_1(v1);
+
+#endif // clBLAS, CLBlast, or default (ViennaCL)
+ }
+}
+
+template<>
+void caffe_gpu_asum<double>(const int_tp n, const double* x, double* y) {
+ ClState& clState = Caffe::cl_state();
+ ClMemOff<double> bufX = clState.get_buffer_mem(x);
+
+ cl_mem Mem_X = bufX.memobj;
+
+ int offX = bufX.offset;
+
+ int dev_id = clState.get_mem_dev(Mem_X);
+
+ viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+
+ if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) {
+ double* Xptr = reinterpret_cast<double*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ,
+ sizeof(double) * offX, sizeof(double) * n, 0, NULL, NULL, NULL));
+
+ *y = caffe_cpu_asum(n, Xptr);
+
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, Xptr,
+ 0, NULL, NULL);
+ } else {
+#if defined(USE_CLBLAS)
+
+ cl_command_queue queue = ctx.get_queue().handle().get();
+
+ cl_int err;
+ double* memAsum = static_cast<double*>(clState.create_buffer(dev_id,
+ CL_MEM_WRITE_ONLY, sizeof(cl_double), NULL, &err));
+ double* memScratch = static_cast<double*>(clState.create_buffer(dev_id,
+ CL_MEM_READ_WRITE, n * sizeof(cl_double), NULL, &err));
+ ClMemOff<double> bufAsum = clState.get_buffer_mem(memAsum);
+ ClMemOff<double> bufScratch = clState.get_buffer_mem(memScratch);
+
+ GREENTEA_CL_BLAS_CHECK(
+ clblasDasum(n, bufAsum.memobj, 0, Mem_X, offX, 1,
+ bufScratch.memobj, 1, &queue, 0, NULL, NULL));
+
+ caffe_gpu_memcpy(sizeof(double), memAsum, y);
+
+ clState.destroy_buffer(memScratch);
+ clState.destroy_buffer(memAsum);
+
+#elif defined(USE_CLBLAST)
+
+ cl_command_queue queue = ctx.get_queue().handle().get();
+
+ cl_int err = CL_SUCCESS;
+ double* Z = static_cast<double*>(clState.create_buffer(dev_id,
+ CL_MEM_WRITE_ONLY, sizeof(cl_double), NULL, &err));
+ ClMemOff<double> bufZ = clState.get_buffer_mem(Z);
+ cl_mem Mem_Z = bufZ.memobj;
+ // TODO: error handling.
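  // For reference, the caffe_gpu_memcpy(sizeof(double), Z, y) call below
  // resolves to the clEnqueueReadBuffer branch of the dispatch defined
  // earlier in this file:
  //
  //   X device, Y device  -> clEnqueueCopyBuffer
  //   X device, Y host    -> clEnqueueReadBuffer  (blocking)
  //   X host,   Y device  -> clEnqueueWriteBuffer (blocking)
  //   X host,   Y host    -> memcpy
  //
  // Z maps to a one-element device buffer and y is a plain host pointer, so
  // the result is a blocking device-to-host read of a single double.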
+
+ const size_t offZ = 0;
+ const size_t incX = 1;
+
+ GREENTEA_CLBLAST_CHECK(
+ clblast::Asum<double>(
+ n,
+ Mem_Z, offZ,
+ Mem_X, offX, incX,
+ &queue));
+
+ caffe_gpu_memcpy(sizeof(double), Z, y);
+
+ clState.destroy_buffer(Z);
+
+#else // default (ViennaCL)
+
+ typedef typename viennacl::vector_base<double>::size_type size_type;
+ typedef typename viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(Mem_X, size_type(n),
+ size_type(offX),
+ difference_type(1), ctx);
+
+ *y = viennacl::linalg::norm_1(v1);
+
+#endif // clBLAS, CLBlast, or default (ViennaCL)
+ }
+}
+
+template<>
+void caffe_gpu_scale<float>(const int_tp n, const float alpha, const float *x,
+ float* y) {
+ ClState& clState = Caffe::cl_state();
+ ClMemOff<float> bufX = clState.get_buffer_mem(x);
+ ClMemOff<float> bufY = clState.get_buffer_mem(y);
+
+ cl_mem Mem_X = bufX.memobj;
+ cl_mem Mem_Y = bufY.memobj;
+
+ int offX = bufX.offset;
+ int offY = bufY.offset;
+
+ int dev_id = clState.get_mem_dev(Mem_X);
+
+ viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+
+ if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) {
+ float* Xptr = reinterpret_cast<float*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ,
+ sizeof(float) * offX, sizeof(float) * n, 0, NULL, NULL, NULL));
+ float* Yptr = reinterpret_cast<float*>(clEnqueueMapBuffer(
+ ctx.get_queue().handle().get(), Mem_Y, true, CL_MAP_WRITE,
+ sizeof(float) * offY, sizeof(float) * n, 0, NULL, NULL, NULL));
+
+ caffe_cpu_scale(n, alpha, Xptr, Yptr);
+
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, Xptr,
+ 0, NULL, NULL);
+ clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_Y, Yptr,
+ 0, NULL, NULL);
+ } else {
+#if defined(USE_CLBLAS)
+
+ // FIXME: Remove, as can reuse ctx obtained above?
+ cl_command_queue queue = ctx.get_queue().handle().get();
+
+ // FIXME: Use xAXPY with beta = 0?
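  // One reading of the FIXME above (a sketch, not what this patch does):
  // scale computes y = alpha * x, so an axpby-style y = alpha * x + 0 * y
  // would fold the two launches below into one. clBLAS has no axpby entry
  // point, though, so the closest equivalent still needs a zero-fill first:
  //
  //   caffe_gpu_set<float>(n, 0.f, y);  // y = 0
  //   GREENTEA_CL_BLAS_CHECK(
  //       clblasSaxpy(n, alpha, Mem_X, offX, 1,
  //                   Mem_Y, offY, 1, 1, &queue, 0, NULL, NULL));
  //
  // which is still two enqueues, so the Copy + Scal pair is kept.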
+ GREENTEA_CL_BLAS_CHECK( + clblasScopy(n, Mem_X, offX, 1, Mem_Y, offY, 1, 1, &queue, 0, NULL, NULL)); + GREENTEA_CL_BLAS_CHECK( + clblasSscal(n, alpha, Mem_Y, offY, 1, 1, &queue, 0, NULL, NULL)); + +#elif defined(USE_CLBLAST) + + cl_command_queue queue = ctx.get_queue().handle().get(); + + const size_t incX = 1; + const size_t incY = 1; + + GREENTEA_CLBLAST_CHECK( + clblast::Copy( + n, + Mem_X, offX, incX, + Mem_Y, offY, incY, + &queue)); + GREENTEA_CLBLAST_CHECK( + clblast::Scal( + n, + alpha, + Mem_Y, offY, incY, + &queue)); + +#else // default (ViennaCL) + + typedef typename viennacl::vector_base::size_type size_type; + typedef typename viennacl::vector_base::size_type difference_type; + + viennacl::vector_base v1(Mem_X, size_type(n), + size_type(offX), + difference_type(1), ctx); + viennacl::vector_base v2(Mem_Y, size_type(n), + size_type(offY), + difference_type(1), ctx); + + v2 = v1 * alpha; + +#endif // clBLAS, CLBlast, or default (ViennaCL) + } +} + +template<> +void caffe_gpu_scale(const int_tp n, const double alpha, + const double *x, double* y) { + ClState& clState = Caffe::cl_state(); + ClMemOff bufX = clState.get_buffer_mem(x); + ClMemOff bufY = clState.get_buffer_mem(y); + + cl_mem Mem_X = bufX.memobj; + cl_mem Mem_Y = bufY.memobj; + + int offX = bufX.offset; + int offY = bufY.offset; + + int dev_id = clState.get_mem_dev(Mem_X); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + + if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { + double* Xptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_X, true, CL_MAP_READ, + sizeof(double) * offX, sizeof(double) * n, 0, NULL, NULL, NULL)); + double* Yptr = reinterpret_cast(clEnqueueMapBuffer( + ctx.get_queue().handle().get(), Mem_Y, true, CL_MAP_WRITE, + sizeof(double) * offY, sizeof(double) * n, 0, NULL, NULL, NULL)); + + caffe_cpu_scale(n, alpha, Xptr, Yptr); + + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_X, Xptr, + 0, NULL, NULL); + clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), Mem_Y, Yptr, + 0, NULL, NULL); + } else { +#if defined(USE_CLBLAS) + + // FIXME: Remove, as can reuse ctx obtained above? + cl_command_queue queue = ctx.get_queue().handle().get(); + + // FIXME: Use xAXPY with beta = 0? 
GREENTEA_CL_BLAS_CHECK(
+ clblasDcopy(n, Mem_X, offX, 1, Mem_Y, offY, 1, 1, &queue, 0, NULL, NULL));
+ GREENTEA_CL_BLAS_CHECK(
+ clblasDscal(n, alpha, Mem_Y, offY, 1, 1, &queue, 0, NULL, NULL));
+
+#elif defined(USE_CLBLAST)
+
+ cl_command_queue queue = ctx.get_queue().handle().get();
+
+ const size_t incX = 1;
+ const size_t incY = 1;
+
+ GREENTEA_CLBLAST_CHECK(
+ clblast::Copy<double>(
+ n,
+ Mem_X, offX, incX,
+ Mem_Y, offY, incY,
+ &queue));
+ GREENTEA_CLBLAST_CHECK(
+ clblast::Scal<double>(
+ n,
+ alpha,
+ Mem_Y, offY, incY,
+ &queue));
+
+#else // default (ViennaCL)
+
+ typedef typename viennacl::vector_base<double>::size_type size_type;
+ typedef typename viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(Mem_X, size_type(n),
+ size_type(offX),
+ difference_type(1), ctx);
+ viennacl::vector_base<double> v2(Mem_Y, size_type(n),
+ size_type(offY),
+ difference_type(1), ctx);
+
+ v2 = v1 * alpha;
+
+#endif // clBLAS, CLBlast, or default (ViennaCL)
+ }
+}
+
+template<typename Dtype>
+void caffe_gpu_set(const int_tp N, const Dtype alpha, Dtype* Y) {
+ ClState& clState = Caffe::cl_state();
+
+ ClMemOff<Dtype> bufY = clState.get_buffer_mem(Y);
+
+ cl_mem Mem_Y = bufY.memobj;
+
+ int offY = bufY.offset;
+
+ int dev_id = clState.get_mem_dev(Mem_Y);
+
+ viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+ viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
 ->program();
+
 // OpenCL Version >= 1.2 approach
 // clEnqueueFillBuffer(ctx.get_queue().handle().get(),
 // Y, &alpha, sizeof(Dtype),
@@ -1027,263 +1485,612 @@ void greentea_gpu_set(const int_tp ctx_id, const int_tp N, const Dtype alpha,
 // OpenCL Version < 1.2 fallback
 viennacl::ocl::kernel &oclk_fill = program.get_kernel(
 CL_KERNEL_SELECT("fill"));
- viennacl::ocl::enqueue(oclk_fill(N, alpha, WrapHandle(Y, &ctx), offY),
+ viennacl::ocl::enqueue(oclk_fill(N, alpha, WrapHandle(Mem_Y, &ctx), offY),
 ctx.get_queue());
}
-template void greentea_gpu_set<int_tp>(const int_tp ctx_id, const int_tp N,
- const int_tp alpha, cl_mem Y,
- const int_tp offY);
-template void greentea_gpu_set<float>(const int_tp ctx_id, const int_tp N,
- const float alpha, cl_mem Y,
- const int_tp offY);
-template void greentea_gpu_set<double>(const int_tp ctx_id, const int_tp N,
- const double alpha, cl_mem Y,
- const int_tp offY);
+template void caffe_gpu_set<int_tp>(const int_tp N, const int_tp alpha,
+ int_tp* Y);
+template void caffe_gpu_set<float>(const int_tp N, const float alpha, float* Y);
+template void caffe_gpu_set<double>(const int_tp N, const double alpha,
+ double* Y);
+
+template<typename Dtype>
+void caffe_gpu_sign(const int_tp n, const Dtype* x, Dtype* y) {
+ ClState& clState = Caffe::cl_state();
+
+ ClMemOff<Dtype> bufx = clState.get_buffer_mem(x);
+ ClMemOff<Dtype> bufy = clState.get_buffer_mem(y);
+
+ cl_mem Mem_x = bufx.memobj;
+ cl_mem Mem_y = bufy.memobj;
+
+ int offx = bufx.offset;
+ int offy = bufy.offset;
+
+ int dev_id = clState.get_mem_dev(Mem_x);
+
+ viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+ viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
+ ->program();
+
+ viennacl::ocl::kernel &oclk_sign = program.get_kernel(
+ CL_KERNEL_SELECT("sign"));
+ viennacl::ocl::enqueue(
+ oclk_sign(n, WrapHandle(Mem_x, &ctx), offx,
+ WrapHandle(Mem_y, &ctx), offy),
+ ctx.get_queue());
+}
+
+template void caffe_gpu_sign<float>(const int_tp n,
+ const float* x,
+ float* y);
+template void caffe_gpu_sign<double>(const int_tp n,
+ const double* x,
+ double* y);
-template<typename Dtype>
-void greentea_gpu_add_scalar(const int_tp ctx_id, const int_tp N,
- const Dtype alpha, cl_mem Y, const int_tp offY) {
-
viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false)) +void caffe_gpu_sgnbit(const int_tp n, const Dtype* x, Dtype* y) { + ClState& clState = Caffe::cl_state(); + + ClMemOff bufx= clState.get_buffer_mem(x); + ClMemOff bufy= clState.get_buffer_mem(y); + + cl_mem Mem_x = bufx.memobj; + cl_mem Mem_y = bufy.memobj; + + int offx = bufx.offset; + int offy = bufy.offset; + + int dev_id = clState.get_mem_dev(Mem_x); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false)) ->program(); + viennacl::ocl::kernel &oclk_sgnbit = program.get_kernel( + CL_KERNEL_SELECT("sgnbit")); + viennacl::ocl::enqueue( + oclk_sgnbit(n, WrapHandle(Mem_x, &ctx), offx, + WrapHandle(Mem_y, &ctx), offy), + ctx.get_queue()); +} + +template void caffe_gpu_sgnbit(const int_tp n, + const float* x, + float* y); +template void caffe_gpu_sgnbit(const int_tp n, + const double* x, + double* y); + +template<> +void caffe_gpu_add_scalar(const int_tp N, const float alpha, float* Y) { + ClState& clState = Caffe::cl_state(); + + ClMemOff bufY = clState.get_buffer_mem(Y); + + cl_mem Mem_Y = bufY.memobj; + + int offY = bufY.offset; + + int dev_id = clState.get_mem_dev(Mem_Y); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false)) + ->program(); viennacl::ocl::kernel &oclk_add_scalar = program.get_kernel( - CL_KERNEL_SELECT("add_scalar")); - viennacl::ocl::enqueue(oclk_add_scalar(N, alpha, WrapHandle(Y, &ctx), offY), + // CL_KERNEL_SELECT("add_scalar")); + "add_scalar" "_float"); + viennacl::ocl::enqueue(oclk_add_scalar(N, alpha, + WrapHandle(Mem_Y, &ctx), offY), ctx.get_queue()); } -template void greentea_gpu_add_scalar(const int_tp ctx_id, - const int_tp N, const float alpha, - cl_mem Y, const int_tp offY); -template void greentea_gpu_add_scalar(const int_tp ctx_id, - const int_tp N, - const double alpha, cl_mem Y, - const int_tp offY); +template<> +void caffe_gpu_add_scalar(const int_tp N, const double alpha, double* Y) { + ClState& clState = Caffe::cl_state(); -template -void greentea_gpu_add(const int_tp ctx_id, const int_tp n, const cl_mem a, - const int_tp offa, const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false)) + ClMemOff bufY = clState.get_buffer_mem(Y); + + cl_mem Mem_Y = bufY.memobj; + + int offY = bufY.offset; + + int dev_id = clState.get_mem_dev(Mem_Y); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false)) ->program(); + viennacl::ocl::kernel &oclk_add_scalar = program.get_kernel( + // CL_KERNEL_SELECT("add_scalar")); + "add_scalar" "_double"); + viennacl::ocl::enqueue(oclk_add_scalar(N, alpha, + WrapHandle(Mem_Y, &ctx), offY), + ctx.get_queue()); +} + +template<> +void caffe_gpu_add(const int_tp N, const float* a, const float* b, + float* y) { + ClState& clState = Caffe::cl_state(); + + ClMemOff bufa = clState.get_buffer_mem(a); + ClMemOff bufb = clState.get_buffer_mem(b); + ClMemOff bufy = clState.get_buffer_mem(y); + + cl_mem Mem_a = bufa.memobj; + cl_mem Mem_b = bufb.memobj; + cl_mem Mem_y = bufy.memobj; - viennacl::ocl::kernel &oclk_add = program.get_kernel(CL_KERNEL_SELECT("add")); + int offa = bufa.offset; + 
int offb = bufb.offset; + int offy = bufy.offset; + + int dev_id = clState.get_mem_dev(Mem_a); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false)) + ->program(); + + viennacl::ocl::kernel &oclk_add = program.get_kernel("add" "_float"); viennacl::ocl::enqueue( - oclk_add(n, WrapHandle(a, &ctx), offa, WrapHandle(b, &ctx), offb, - WrapHandle(y, &ctx), offy), + oclk_add(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_b, &ctx), offb, + WrapHandle(Mem_y, &ctx), offy), ctx.get_queue()); } -template void greentea_gpu_add(const int_tp ctx_id, const int_tp n, - const cl_mem a, const int_tp offa, - const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); -template void greentea_gpu_add(const int_tp ctx_id, const int_tp n, - const cl_mem a, const int_tp offa, - const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); +template<> +void caffe_gpu_add(const int_tp N, const double* a, const double* b, + double* y) { + ClState& clState = Caffe::cl_state(); -template -void greentea_gpu_sub(const int_tp ctx_id, const int_tp n, const cl_mem a, - const int_tp offa, const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false)) + ClMemOff bufa = clState.get_buffer_mem(a); + ClMemOff bufb = clState.get_buffer_mem(b); + ClMemOff bufy = clState.get_buffer_mem(y); + + cl_mem Mem_a = bufa.memobj; + cl_mem Mem_b = bufb.memobj; + cl_mem Mem_y = bufy.memobj; + + int offa = bufa.offset; + int offb = bufb.offset; + int offy = bufy.offset; + + int dev_id = clState.get_mem_dev(Mem_a); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false)) ->program(); - viennacl::ocl::kernel &oclk_sub = program.get_kernel(CL_KERNEL_SELECT("sub")); + viennacl::ocl::kernel &oclk_add = program.get_kernel("add" "_double"); viennacl::ocl::enqueue( - oclk_sub(n, WrapHandle(a, &ctx), offa, WrapHandle(b, &ctx), offb, - WrapHandle(y, &ctx), offy), + oclk_add(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_b, &ctx), offb, + WrapHandle(Mem_y, &ctx), offy), ctx.get_queue()); } -template void greentea_gpu_sub(const int_tp ctx_id, const int_tp n, - const cl_mem a, const int_tp offa, - const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); -template void greentea_gpu_sub(const int_tp ctx_id, const int_tp n, - const cl_mem a, const int_tp offa, - const cl_mem b, const int_tp offb, - cl_mem y, const int_tp offy); +template<> +void caffe_gpu_sub(const int_tp N, const float* a, const float* b, + float* y) { + ClState& clState = Caffe::cl_state(); -template -void greentea_gpu_abs(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, cl_mem y, const int_tp offy) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false)) + ClMemOff bufa = clState.get_buffer_mem(a); + ClMemOff bufb = clState.get_buffer_mem(b); + ClMemOff bufy = clState.get_buffer_mem(y); + + cl_mem Mem_a = bufa.memobj; + cl_mem Mem_b = bufb.memobj; + cl_mem Mem_y = bufy.memobj; + + int offa = bufa.offset; + int offb = bufb.offset; + int offy = bufy.offset; + + int dev_id = clState.get_mem_dev(Mem_a); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &program = 
(Caffe::Get().GetDevice(dev_id, false)) ->program(); - viennacl::ocl::kernel &oclk_abs = program.get_kernel(CL_KERNEL_SELECT("abs")); + viennacl::ocl::kernel &oclk_sub = program.get_kernel("sub" "_float"); viennacl::ocl::enqueue( - oclk_abs(N, WrapHandle(a, &ctx), offa, WrapHandle(y, &ctx), offy), + oclk_sub(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_b, &ctx), offb, + WrapHandle(Mem_y, &ctx), offy), ctx.get_queue()); } -template void greentea_gpu_abs(const int_tp ctx_id, const int_tp N, - const cl_mem a, const int_tp offa, - cl_mem y, const int_tp offy); -template void greentea_gpu_abs(const int_tp ctx_id, const int_tp N, - const cl_mem a, const int_tp offa, - cl_mem y, const int_tp offy); +template<> +void caffe_gpu_sub(const int_tp N, const double* a, const double* b, + double* y) { + ClState& clState = Caffe::cl_state(); -template -void greentea_gpu_exp(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, cl_mem y, const int_tp offy) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false)) + ClMemOff bufa = clState.get_buffer_mem(a); + ClMemOff bufb = clState.get_buffer_mem(b); + ClMemOff bufy = clState.get_buffer_mem(y); + + cl_mem Mem_a = bufa.memobj; + cl_mem Mem_b = bufb.memobj; + cl_mem Mem_y = bufy.memobj; + + int offa = bufa.offset; + int offb = bufb.offset; + int offy = bufy.offset; + + int dev_id = clState.get_mem_dev(Mem_a); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false)) ->program(); - viennacl::ocl::kernel &oclk_exp = program.get_kernel(CL_KERNEL_SELECT("exp")); + viennacl::ocl::kernel &oclk_sub = program.get_kernel("sub" "_double"); viennacl::ocl::enqueue( - oclk_exp(N, WrapHandle(a, &ctx), offa, WrapHandle(y, &ctx), offy), + oclk_sub(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_b, &ctx), offb, + WrapHandle(Mem_y, &ctx), offy), ctx.get_queue()); } -template void greentea_gpu_exp(const int_tp ctx_id, const int_tp N, - const cl_mem a, const int_tp offa, - cl_mem y, const int_tp offy); -template void greentea_gpu_exp(const int_tp ctx_id, const int_tp N, - const cl_mem a, const int_tp offa, - cl_mem y, const int_tp offy); +template<> +void caffe_gpu_mul(const int_tp N, const float* a, const float* b, + float* y) { + ClState& clState = Caffe::cl_state(); -template -void greentea_gpu_powx(const int_tp ctx_id, const int_tp N, const cl_mem a, - const int_tp offa, const Dtype alpha, cl_mem y, - const int_tp offy) { - viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id); - viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false)) + ClMemOff bufa = clState.get_buffer_mem(a); + ClMemOff bufb = clState.get_buffer_mem(b); + ClMemOff bufy = clState.get_buffer_mem(y); + + cl_mem Mem_a = bufa.memobj; + cl_mem Mem_b = bufb.memobj; + cl_mem Mem_y = bufy.memobj; + + int offa = bufa.offset; + int offb = bufb.offset; + int offy = bufy.offset; + + int dev_id = clState.get_mem_dev(Mem_a); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id); + viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false)) ->program(); - viennacl::ocl::kernel &oclk_powx = program.get_kernel( - CL_KERNEL_SELECT("powx")); + viennacl::ocl::kernel &oclk_mul = program.get_kernel("mul" "_float"); viennacl::ocl::enqueue( - oclk_powx(N, WrapHandle(a, &ctx), offa, alpha, WrapHandle(y, &ctx), offy), + oclk_mul(N, WrapHandle(Mem_a, &ctx), offa, 
+      oclk_mul(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_b, &ctx), offb,
+               WrapHandle(Mem_y, &ctx), offy),
       ctx.get_queue());
 }
 
-template void greentea_gpu_powx<float>(const int_tp ctx_id, const int_tp N,
-                                       const cl_mem a, const int_tp offa,
-                                       const float alpha, cl_mem y,
-                                       const int_tp offy);
-template void greentea_gpu_powx<double>(const int_tp ctx_id, const int_tp N,
-                                        const cl_mem a, const int_tp offa,
-                                        const double alpha, cl_mem y,
-                                        const int_tp offy);
+template<>
+void caffe_gpu_mul<double>(const int_tp N, const double* a, const double* b,
+                           double* y) {
+  ClState& clState = Caffe::cl_state();
 
-template<typename Dtype>
-void greentea_gpu_log(const int_tp ctx_id, const int_tp N, const cl_mem a,
-                      const int_tp offa, cl_mem y, const int_tp offy) {
-  viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id);
-  viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false))
+  ClMemOff<double> bufa = clState.get_buffer_mem(a);
+  ClMemOff<double> bufb = clState.get_buffer_mem(b);
+  ClMemOff<double> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_b = bufb.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offb = bufb.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
                                         ->program();
 
-  viennacl::ocl::kernel &oclk_log = program.get_kernel(CL_KERNEL_SELECT("log"));
+  viennacl::ocl::kernel &oclk_mul = program.get_kernel("mul" "_double");
   viennacl::ocl::enqueue(
-      oclk_log(N, WrapHandle(a, &ctx), offa, WrapHandle(y, &ctx), offy),
+      oclk_mul(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_b, &ctx), offb,
+               WrapHandle(Mem_y, &ctx), offy),
       ctx.get_queue());
 }
 
-template void greentea_gpu_log<float>(const int_tp ctx_id, const int_tp N,
-                                      const cl_mem a, const int_tp offa,
-                                      cl_mem y, const int_tp offy);
-template void greentea_gpu_log<double>(const int_tp ctx_id, const int_tp N,
-                                       const cl_mem a, const int_tp offa,
-                                       cl_mem y, const int_tp offy);
+template<>
+void caffe_gpu_div<float>(const int_tp N, const float* a, const float* b,
+                          float* y) {
+  ClState& clState = Caffe::cl_state();
 
-template<typename Dtype>
-void greentea_gpu_sign(const int_tp ctx_id, const int_tp n, const cl_mem x,
-int_tp offx,
-                       cl_mem y, const int_tp offy) {
-  viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id);
-  viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false))
+  ClMemOff<float> bufa = clState.get_buffer_mem(a);
+  ClMemOff<float> bufb = clState.get_buffer_mem(b);
+  ClMemOff<float> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_b = bufb.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offb = bufb.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
                                         ->program();
 
-  viennacl::ocl::kernel &oclk_sign = program.get_kernel(
-      CL_KERNEL_SELECT("sign"));
+  viennacl::ocl::kernel &oclk_div = program.get_kernel("div" "_float");
   viennacl::ocl::enqueue(
-      oclk_sign(n, WrapHandle(x, &ctx), offx, WrapHandle(y, &ctx), offy),
+      oclk_div(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_b, &ctx), offb,
+               WrapHandle(Mem_y, &ctx), offy),
       ctx.get_queue());
 }
 
-template void greentea_gpu_sign<float>(const int_tp ctx_id, const int_tp n,
-                                       const cl_mem x, int_tp offx, cl_mem y,
-                                       const int_tp offy);
-template void greentea_gpu_sign<double>(const int_tp ctx_id, const int_tp n,
-                                        const cl_mem x, int_tp offx, cl_mem y,
-                                        const int_tp offy);
+template<>
+void caffe_gpu_div<double>(const int_tp N, const double* a, const double* b,
+                           double* y) {
+  ClState& clState = Caffe::cl_state();
 
-template<typename Dtype>
-void greentea_gpu_sgnbit(const int_tp ctx_id, const int_tp n, const cl_mem x,
-int_tp offx,
-                         cl_mem y, const int_tp offy) {
-  viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id);
-  viennacl::ocl::program &program = (Caffe::Get().GetDevice(ctx_id, false))
+  ClMemOff<double> bufa = clState.get_buffer_mem(a);
+  ClMemOff<double> bufb = clState.get_buffer_mem(b);
+  ClMemOff<double> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_b = bufb.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offb = bufb.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
                                         ->program();
 
-  viennacl::ocl::kernel &oclk_sgnbit = program.get_kernel(
-      CL_KERNEL_SELECT("sgnbit"));
+  viennacl::ocl::kernel &oclk_div = program.get_kernel("div" "_double");
+  viennacl::ocl::enqueue(
+      oclk_div(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_b, &ctx), offb,
+               WrapHandle(Mem_y, &ctx), offy),
+      ctx.get_queue());
+}
+
+template<>
+void caffe_gpu_abs<float>(const int_tp N, const float* a, float* y) {
+  ClState& clState = Caffe::cl_state();
+
+  ClMemOff<float> bufa = clState.get_buffer_mem(a);
+  ClMemOff<float> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
+                                        ->program();
+
+  viennacl::ocl::kernel &oclk_abs = program.get_kernel("abs" "_float");
+  viennacl::ocl::enqueue(
+      oclk_abs(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_y, &ctx), offy),
+      ctx.get_queue());
+}
+
+template<>
+void caffe_gpu_abs<double>(const int_tp N, const double* a, double* y) {
+  ClState& clState = Caffe::cl_state();
+
+  ClMemOff<double> bufa = clState.get_buffer_mem(a);
+  ClMemOff<double> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
+                                        ->program();
+
+  viennacl::ocl::kernel &oclk_abs = program.get_kernel("abs" "_double");
+  viennacl::ocl::enqueue(
+      oclk_abs(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_y, &ctx), offy),
+      ctx.get_queue());
+}
+
+template<>
+void caffe_gpu_exp<float>(const int_tp N, const float* a, float* y) {
+  ClState& clState = Caffe::cl_state();
+
+  ClMemOff<float> bufa = clState.get_buffer_mem(a);
+  ClMemOff<float> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
+                                        ->program();
+
+  viennacl::ocl::kernel &oclk_exp = program.get_kernel("exp" "_float");
   viennacl::ocl::enqueue(
-      oclk_sgnbit(n, WrapHandle(x, &ctx), offx, WrapHandle(y, &ctx), offy),
+      oclk_exp(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_y, &ctx), offy),
       ctx.get_queue());
 }
 
-template void greentea_gpu_sgnbit<float>(const int_tp ctx_id, const int_tp n,
-                                         const cl_mem x, int_tp offx, cl_mem y,
-                                         const int_tp offy);
-template void greentea_gpu_sgnbit<double>(const int_tp ctx_id, const int_tp n,
-                                          const cl_mem x, int_tp offx, cl_mem y,
-                                          const int_tp offy);
+template<>
+void caffe_gpu_exp<double>(const int_tp N, const double* a, double* y) {
+  ClState& clState = Caffe::cl_state();
+
+  ClMemOff<double> bufa = clState.get_buffer_mem(a);
+  ClMemOff<double> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
 
-void greentea_gpu_rng_uniform(const int_tp ctx_id, const int_tp n, cl_mem r,
-int_tp offr) {
-  viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id);
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
+                                        ->program();
+
+  viennacl::ocl::kernel &oclk_exp = program.get_kernel("exp" "_double");
+  viennacl::ocl::enqueue(
+      oclk_exp(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_y, &ctx), offy),
+      ctx.get_queue());
+}
+
+template<>
+void caffe_gpu_log<float>(const int_tp N, const float* a, float* y) {
+  ClState& clState = Caffe::cl_state();
+
+  ClMemOff<float> bufa = clState.get_buffer_mem(a);
+  ClMemOff<float> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
+                                        ->program();
+
+  viennacl::ocl::kernel &oclk_log = program.get_kernel("log" "_float");
+  viennacl::ocl::enqueue(
+      oclk_log(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_y, &ctx), offy),
+      ctx.get_queue());
+}
+
+template<>
+void caffe_gpu_log<double>(const int_tp N, const double* a, double* y) {
+  ClState& clState = Caffe::cl_state();
+
+  ClMemOff<double> bufa = clState.get_buffer_mem(a);
+  ClMemOff<double> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
+                                        ->program();
+
+  viennacl::ocl::kernel &oclk_log = program.get_kernel("log" "_double");
+  viennacl::ocl::enqueue(
+      oclk_log(N, WrapHandle(Mem_a, &ctx), offa, WrapHandle(Mem_y, &ctx), offy),
+      ctx.get_queue());
+}
+
+template<>
+void caffe_gpu_powx<float>(const int_tp N, const float* a, const float alpha,
+                           float* y) {
+  ClState& clState = Caffe::cl_state();
+
+  ClMemOff<float> bufa = clState.get_buffer_mem(a);
+  ClMemOff<float> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
+                                        ->program();
+
+  viennacl::ocl::kernel &oclk_powx = program.get_kernel(
+      "powx" "_float");
+  viennacl::ocl::enqueue(
+      oclk_powx(N, WrapHandle(Mem_a, &ctx), offa, alpha,
+                WrapHandle(Mem_y, &ctx), offy),
+      ctx.get_queue());
+}
+
+template<>
+void caffe_gpu_powx<double>(const int_tp N, const double* a,
+                            const double alpha, double* y) {
+  ClState& clState = Caffe::cl_state();
+
+  ClMemOff<double> bufa = clState.get_buffer_mem(a);
+  ClMemOff<double> bufy = clState.get_buffer_mem(y);
+
+  cl_mem Mem_a = bufa.memobj;
+  cl_mem Mem_y = bufy.memobj;
+
+  int offa = bufa.offset;
+  int offy = bufy.offset;
+
+  int dev_id = clState.get_mem_dev(Mem_a);
+
+  viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+  viennacl::ocl::program &program = (Caffe::Get().GetDevice(dev_id, false))
+                                        ->program();
+
+  viennacl::ocl::kernel &oclk_powx = program.get_kernel(
+      "powx" "_double");
+  viennacl::ocl::enqueue(
+      oclk_powx(N, WrapHandle(Mem_a, &ctx), offa, alpha,
+                WrapHandle(Mem_y, &ctx), offy),
+      ctx.get_queue());
+}
+
+void caffe_gpu_rng_uniform(const int_tp n, unsigned int* r) {
   std::vector<uint_tp> random(n);  //NOLINT
   caffe_rng_uniform(n, &random[0]);
-  greentea_gpu_memcpy(sizeof(uint_tp) * n, &random[0], r, offr, &ctx);
+  caffe_gpu_memcpy(sizeof(uint_tp) * n, &random[0], r);
 }
 
-template<typename Dtype>
-void greentea_gpu_rng_uniform(const int_tp ctx_id, const int_tp n,
-                              const Dtype a, const Dtype b, cl_mem r,
-                              const int_tp offr) {
-  viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id);
-  std::vector<Dtype> random(n);  // NOLINT
+template<>
+void caffe_gpu_rng_uniform<float>(const int_tp n, const float a, const float b,
+                                  float* r) {
+  std::vector<float> random(n);  // NOLINT
   caffe_rng_uniform(n, a, b, &random[0]);
-  greentea_gpu_memcpy(sizeof(Dtype) * n, &random[0], r, offr, &ctx);
+  caffe_gpu_memcpy(sizeof(float) * n, &random[0], r);
 }
 
-template void greentea_gpu_rng_uniform<float>(const int_tp ctx_id,
-                                              const int_tp n, const float a,
-                                              const float b, cl_mem r,
-                                              const int_tp offr);
-template void greentea_gpu_rng_uniform<double>(const int_tp ctx_id,
-                                               const int_tp n, const double a,
-                                               const double b, cl_mem r,
-                                               const int_tp offr);
+template<>
+void caffe_gpu_rng_uniform<double>(const int_tp n, const double a,
+                                   const double b, double* r) {
+  std::vector<double> random(n);  // NOLINT
+  caffe_rng_uniform(n, a, b, &random[0]);
+  caffe_gpu_memcpy(sizeof(double) * n, &random[0], r);
+}
 
-template<typename Dtype>
-void greentea_gpu_rng_gaussian(const int_tp ctx_id, const int_tp n,
-                               const Dtype mu, const Dtype sigma, cl_mem r,
-                               const int_tp offr) {
-  viennacl::ocl::context &ctx = viennacl::ocl::get_context(ctx_id);
-  std::vector<Dtype> random(n);  // NOLINT
+template<>
+void caffe_gpu_rng_gaussian<float>(const int_tp n, const float mu,
+                                   const float sigma, float* r) {
+  std::vector<float> random(n);  // NOLINT
   caffe_rng_gaussian(n, mu, sigma, &random[0]);
-  greentea_gpu_memcpy(sizeof(Dtype) * n, &random[0], r, offr, &ctx);
+  caffe_gpu_memcpy(sizeof(float) * n, &random[0], r);
 }
 
-template void greentea_gpu_rng_gaussian<float>(const int_tp ctx_id,
-                                               const int_tp n, const float mu,
-                                               const float sigma, cl_mem r,
-                                               const int_tp offr);
-
-template void greentea_gpu_rng_gaussian<double>(const int_tp ctx_id,
-                                                const int_tp n, const double mu,
-                                                const double sigma, cl_mem r,
-                                                const int_tp offr);
-
+template<>
+void caffe_gpu_rng_gaussian<double>(const int_tp n, const double mu,
+                                    const double sigma, double* r) {
+  std::vector<double> random(n);  // NOLINT
+  caffe_rng_gaussian(n, mu, sigma, &random[0]);
+  caffe_gpu_memcpy(sizeof(double) * n, &random[0], r);
+}
 }  // namespace caffe
 
-#endif
+#endif  // USE_GREENTEA
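// Illustrative sketch (not part of the patch): with the virtual buffer
// pointer mapping above, OCL callers use the same typed entry points as
// CUDA callers. A Dtype* obtained from SyncedMemory is resolved back to
// its cl_mem handle and element offset inside each math function, e.g.:
//
//   const float* a = blob_a.gpu_data();   // virtual device pointers
//   const float* b = blob_b.gpu_data();
//   float* y = blob_y.mutable_gpu_data();
//   caffe_gpu_add<float>(n, a, b, y);     // one call path for CUDA and OCL
//
// and internally (using the names introduced in util/cl_state.hpp):
//
//   ClMemOff<float> buf = Caffe::cl_state().get_buffer_mem(a);
//   cl_mem mem = buf.memobj;   // underlying OpenCL buffer object
//   int off = buf.offset;      // element offset of 'a' within that buffer
//
// Note also that kernels are now selected per type by literal concatenation:
// "add" "_float" compiles to the single string "add_float", replacing the
// CL_KERNEL_SELECT macro dispatch used by the removed greentea_gpu_* path.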
diff --git a/src/caffe/greentea/libdnn.cpp b/src/caffe/greentea/libdnn.cpp
index 0b3bd09b34f..fa662cd41a8 100644
--- a/src/caffe/greentea/libdnn.cpp
+++ b/src/caffe/greentea/libdnn.cpp
@@ -1685,18 +1685,24 @@ void LibDNNConv<Dtype>::Forward(const Dtype* bottom_data,
   //   << kernel.global_work_size(i) << std::endl;
   // }
 
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_data);
+  ClMemOff<Dtype> buf_weight = clState.get_buffer_mem(weight);
+  ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_data);
+  ClMemOff<Dtype> buf_bias = clState.get_buffer_mem(bias);
+
   if (bias_term_) {
     viennacl::ocl::enqueue(
-        kernel(WrapHandle((cl_mem)bottom_data, &ctx),
-               WrapHandle((cl_mem)weight, &ctx),
-               WrapHandle((cl_mem)bias, &ctx),
-               WrapHandle((cl_mem)top_data, &ctx)),
+        kernel(WrapHandle(buf_bottom.memobj, &ctx),
+               WrapHandle(buf_weight.memobj, &ctx),
+               WrapHandle(buf_bias.memobj, &ctx),
+               WrapHandle(buf_top.memobj, &ctx)),
         ctx.get_queue());
   } else {
     viennacl::ocl::enqueue(
-        kernel(WrapHandle((cl_mem)bottom_data, &ctx),
-               WrapHandle((cl_mem)weight, &ctx),
-               WrapHandle((cl_mem)top_data, &ctx)),
+        kernel(WrapHandle(buf_bottom.memobj, &ctx),
+               WrapHandle(buf_weight.memobj, &ctx),
+               WrapHandle(buf_top.memobj, &ctx)),
         ctx.get_queue());
   }
 }
@@ -1781,18 +1787,24 @@ void LibDNNConv<Dtype>::Backward(bool prop_down_data, bool prop_down_weights,
   //   << kernel.global_work_size(i) << std::endl;
   // }
 
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_diff);
+  ClMemOff<Dtype> buf_weight = clState.get_buffer_mem(weight);
+  ClMemOff<Dtype> buf_bias = clState.get_buffer_mem(bias);
+  ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_diff);
+
   if (bias_term_) {
     viennacl::ocl::enqueue(
-        kernel(WrapHandle((cl_mem) top_diff, &ctx),
-               WrapHandle((cl_mem) weight, &ctx),
-               WrapHandle((cl_mem) bias, &ctx),
-               WrapHandle((cl_mem) bottom_diff, &ctx)),
+        kernel(WrapHandle(buf_top.memobj, &ctx),
+               WrapHandle(buf_weight.memobj, &ctx),
+               WrapHandle(buf_bias.memobj, &ctx),
+               WrapHandle(buf_bottom.memobj, &ctx)),
         ctx.get_queue());
   } else {
     viennacl::ocl::enqueue(
-        kernel(WrapHandle((cl_mem) top_diff, &ctx),
-               WrapHandle((cl_mem) weight, &ctx),
-               WrapHandle((cl_mem) bottom_diff, &ctx)),
+        kernel(WrapHandle(buf_top.memobj, &ctx),
+               WrapHandle(buf_weight.memobj, &ctx),
+               WrapHandle(buf_bottom.memobj, &ctx)),
         ctx.get_queue());
   }
 }
@@ -1824,18 +1836,24 @@ void LibDNNConv<Dtype>::Backward(bool prop_down_data, bool prop_down_weights,
   //   << kernel.global_work_size(i) << std::endl;
   // }
 
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_data);
+  ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_diff);
+  ClMemOff<Dtype> buf_bias = clState.get_buffer_mem(bias_diff);
+  ClMemOff<Dtype> buf_weight = clState.get_buffer_mem(weight_diff);
+
   if (bias_term_) {
     viennacl::ocl::enqueue(
-        kernel(WrapHandle((cl_mem) bottom_data, &ctx),
-               WrapHandle((cl_mem) top_diff, &ctx),
-               WrapHandle((cl_mem) bias_diff, &ctx),
-               WrapHandle((cl_mem) weight_diff, &ctx), batch_size),
+        kernel(WrapHandle(buf_bottom.memobj, &ctx),
+               WrapHandle(buf_top.memobj, &ctx),
+               WrapHandle(buf_bias.memobj, &ctx),
+               WrapHandle(buf_weight.memobj, &ctx), batch_size),
         ctx.get_queue());
   } else {
     viennacl::ocl::enqueue(
-        kernel(WrapHandle((cl_mem) bottom_data, &ctx),
-               WrapHandle((cl_mem) top_diff, &ctx),
-               WrapHandle((cl_mem) weight_diff, &ctx), batch_size),
+        kernel(WrapHandle(buf_bottom.memobj, &ctx),
+               WrapHandle(buf_top.memobj, &ctx),
+               WrapHandle(buf_weight.memobj, &ctx), batch_size),
         ctx.get_queue());
   }
 }
@@ -2014,8 +2032,11 @@ void LibDNNConv<Dtype>::SetMemory(Dtype* memory, int_tp count,
   kernel.global_work_size(1, 1);
   kernel.global_work_size(2, 1);
 
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<Dtype> buf_memory = clState.get_buffer_mem(memory);
+
   viennacl::ocl::enqueue(kernel(count, value,
-                         WrapHandle((cl_mem)memory, &ctx), offset),
+                         WrapHandle(buf_memory.memobj, &ctx), offset),
                          ctx.get_queue());
 #endif  // USE_GREENTEA
   } else {
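// Illustrative sketch (not part of the patch): the layer changes that follow
// all apply the same simplification. A forward pass that previously branched,
//
//   if (this->device_->backend() == BACKEND_CUDA) {
//     caffe_gpu_abs(count, bottom_data, top_data);             // CUDA path
//   } else {
//     greentea_gpu_abs<Dtype>(this->device_->id(), count,
//                             (cl_mem) bottom_data, 0, (cl_mem) top_data, 0);
//   }
//
// collapses to the single call
//
//   caffe_gpu_abs(count, bottom_data, top_data);
//
// because the OCL implementation now recovers the cl_mem handle and offset
// from the virtual pointer itself.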
diff --git a/src/caffe/layers/absval_layer.cu b/src/caffe/layers/absval_layer.cu
index f19933fea65..9fee9afd136 100644
--- a/src/caffe/layers/absval_layer.cu
+++ b/src/caffe/layers/absval_layer.cu
@@ -5,7 +5,6 @@
 
 #ifdef USE_GREENTEA
 #include "caffe/greentea/greentea.hpp"
-#include "caffe/greentea/greentea_math_functions.hpp"
 #endif
 
 namespace caffe {
@@ -15,17 +14,8 @@ void AbsValLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
                                      const vector<Blob<Dtype>*>& top) {
   const int_tp count = top[0]->count();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  if (this->device_->backend() == BACKEND_CUDA) {
-#ifdef USE_CUDA
-    caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
-#endif  // USE_CUDA
-  } else {
-#ifdef USE_GREENTEA
-    greentea_gpu_abs<Dtype>(this->device_->id(), count,
-                            (cl_mem) (bottom[0]->gpu_data()), 0,
-                            (cl_mem) (top_data), 0);
-#endif  // USE_GREENTEA
-  }
+
+  caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
 }
 
 template <typename Dtype>
@@ -38,21 +28,8 @@ void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  if (this->device_->backend() == BACKEND_CUDA) {
-#ifdef USE_CUDA
-    caffe_gpu_sign(count, bottom_data, bottom_diff);
-    caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
-#endif  // USE_CUDA
-  } else {
-#ifdef USE_GREENTEA
-    greentea_gpu_sign<Dtype>(this->device_->id(), count,
-                             (cl_mem) bottom_data, 0, (cl_mem) bottom_diff,
-                             0);
-    greentea_gpu_mul<Dtype>(this->device_->id(), count,
-                            (cl_mem) bottom_diff, 0, (cl_mem) top_diff, 0,
-                            (cl_mem) bottom_diff, 0);
-#endif  // USE_GREENTEA
-  }
+  caffe_gpu_sign(count, bottom_data, bottom_diff);
+  caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
 }
 
 }
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index eeba7772017..bc41432be70 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -8,8 +8,6 @@
 
 #ifdef USE_GREENTEA
 #include "caffe/greentea/greentea.hpp"
-#include "caffe/greentea/greentea_im2col.hpp"
-#include "caffe/greentea/greentea_math_functions.hpp"
 #endif
 
 namespace caffe {
@@ -367,43 +365,19 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
                                                    const int_tp output_off,
                                                    bool skip_im2col) {
   const Dtype* col_buff = input;
-  if (this->device_->backend() == BACKEND_CUDA) {
-#ifdef USE_CUDA
-    if (!is_1x1_) {
+  if (!is_1x1_) {
     if (!skip_im2col) {
-      conv_im2col_gpu(input + input_off, col_buffer()->mutable_gpu_data());
+      conv_im2col_gpu(input + input_off, col_buffer()->mutable_gpu_data());
     }
     col_buff = col_buffer()->gpu_data();
-    }
-    for (int_tp g = 0; g < group_; ++g) {
+  }
+  for (int_tp g = 0; g < group_; ++g) {
     caffe_gpu_gemm<Dtype>(
         CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_,
         conv_out_spatial_dim_, kernel_dim_, (Dtype) 1.,
        weights + weight_offset_ * g,
        col_buff + (is_1x1_ ? input_off : 0) + col_offset_ * g, (Dtype) 0.,
        output + output_off + output_offset_ * g);
-    }
-#endif  // USE_CUDA
-  } else {
-#ifdef USE_GREENTEA
-    if (!is_1x1_) {
-      if (!skip_im2col) {
-        greentea_conv_im2col_gpu(input, input_off,
-                                 col_buffer()->mutable_gpu_data(), 0);
-      }
-      col_buff = col_buffer()->gpu_data();
-    }
-    for (int_tp g = 0; g < group_; ++g) {
-      greentea_gpu_gemm<Dtype>(this->device_->id(), CblasNoTrans,
-                               CblasNoTrans, conv_out_channels_ / group_,
-                               conv_out_spatial_dim_, kernel_dim_,
-                               (Dtype) 1., (cl_mem) weights, weight_offset_ * g,
-                               (cl_mem) col_buff,
-                               (is_1x1_ ?
input_off : 0) + col_offset_ * g, - (Dtype) 0., (cl_mem) output, - output_off + output_offset_ * g); - } -#endif // USE_GREENTEA } } @@ -411,22 +385,10 @@ template void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, const int_tp output_off, const Dtype* bias) { - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - out_spatial_dim_, 1, (Dtype) 1., bias, - bias_multiplier_.gpu_data(), (Dtype) 1., - output + output_off); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - CblasNoTrans, num_output_, out_spatial_dim_, 1, - (Dtype) 1., (cl_mem) bias, 0, - (cl_mem) (bias_multiplier_.gpu_data()), 0, - (Dtype) 1., (cl_mem) output, output_off); -#endif // USE_GREENTEA - } + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, + out_spatial_dim_, 1, (Dtype) 1., bias, + bias_multiplier_.gpu_data(), (Dtype) 1., + output + output_off); } template @@ -439,34 +401,15 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, if (is_1x1_) { col_buff = input; } - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - for (int_tp g = 0; g < group_; ++g) { - caffe_gpu_gemm( - CblasTrans, CblasNoTrans, kernel_dim_, conv_out_spatial_dim_, - conv_out_channels_ / group_, (Dtype) 1., weights + weight_offset_ * g, - output + output_off + output_offset_ * g, (Dtype) 0., - col_buff + (is_1x1_ ? input_off : 0) + col_offset_ * g); - } - if (!is_1x1_) { - conv_col2im_gpu(col_buff, input + input_off); - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - for (int_tp g = 0; g < group_; ++g) { - greentea_gpu_gemm(this->device_->id(), CblasTrans, - CblasNoTrans, kernel_dim_, conv_out_spatial_dim_, - conv_out_channels_ / group_, (Dtype) 1., - (cl_mem) weights, weight_offset_ * g, - (cl_mem) output, output_off + output_offset_ * g, - (Dtype) 0., (cl_mem) col_buff, - (is_1x1_ ? input_off : 0) + col_offset_ * g); - } - if (!is_1x1_) { - greentea_conv_col2im_gpu(col_buff, 0, input, input_off); - } -#endif // USE_GREENTEA + for (int_tp g = 0; g < group_; ++g) { + caffe_gpu_gemm( + CblasTrans, CblasNoTrans, kernel_dim_, conv_out_spatial_dim_, + conv_out_channels_ / group_, (Dtype) 1., weights + weight_offset_ * g, + output + output_off + output_offset_ * g, (Dtype) 0., + col_buff + (is_1x1_ ? input_off : 0) + col_offset_ * g); + } + if (!is_1x1_) { + conv_col2im_gpu(col_buff, input + input_off); } } @@ -477,39 +420,17 @@ void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, const int_tp output_off, Dtype* weights) { const Dtype* col_buff = input; - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (!is_1x1_) { + if (!is_1x1_) { conv_im2col_gpu(input + input_off, col_buffer()->mutable_gpu_data()); col_buff = col_buffer()->gpu_data(); - } - for (int_tp g = 0; g < group_; ++g) { - caffe_gpu_gemm( - CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_, - conv_out_spatial_dim_, (Dtype) 1., - output + output_off + output_offset_ * g, - col_buff + (is_1x1_ ? 
input_off : 0) + col_offset_ * g, (Dtype) 1., - weights + weight_offset_ * g); - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - if (!is_1x1_) { - greentea_conv_im2col_gpu(input, input_off, - col_buffer()->mutable_gpu_data(), 0); - col_buff = col_buffer()->gpu_data(); - } - for (int_tp g = 0; g < group_; ++g) { - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - CblasTrans, conv_out_channels_ / group_, - kernel_dim_, conv_out_spatial_dim_, (Dtype) 1., - (cl_mem) output, output_off + output_offset_ * g, - (cl_mem) col_buff, - (is_1x1_ ? input_off : 0) + col_offset_ * g, - (Dtype) 1., (cl_mem) weights, - weight_offset_ * g); - } -#endif // USE_GREENTEA + } + for (int_tp g = 0; g < group_; ++g) { + caffe_gpu_gemm( + CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_, + conv_out_spatial_dim_, (Dtype) 1., + output + output_off + output_offset_ * g, + col_buff + (is_1x1_ ? input_off : 0) + col_offset_ * g, (Dtype) 1., + weights + weight_offset_ * g); } } @@ -517,20 +438,9 @@ template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, const Dtype* input, const int_tp input_off) { - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_gemv(CblasNoTrans, num_output_, out_spatial_dim_, 1., - input + input_off, bias_multiplier_.gpu_data(), 1., - bias); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, - num_output_, out_spatial_dim_, 1., (cl_mem) input, - input_off, (cl_mem) (bias_multiplier_.gpu_data()), - 0, 1., (cl_mem) bias, 0); -#endif // USE_GREENTEA - } + caffe_gpu_gemv(CblasNoTrans, num_output_, out_spatial_dim_, 1., + input + input_off, bias_multiplier_.gpu_data(), 1., + bias); } template diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu index 50659bdf9df..5e3b8e8781c 100644 --- a/src/caffe/layers/base_data_layer.cu +++ b/src/caffe/layers/base_data_layer.cu @@ -10,46 +10,18 @@ void BasePrefetchingDataLayer::Forward_gpu( Batch* batch = prefetch_full_.pop("Data layer prefetch queue empty"); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - // Reshape to loaded data. - top[0]->ReshapeLike(batch->data_); - // Copy the data - caffe_copy(batch->data_.count(), batch->data_.gpu_data(), - top[0]->mutable_gpu_data()); - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(batch->label_); - // Copy the labels. - caffe_copy(batch->label_.count(), batch->label_.gpu_data(), - top[1]->mutable_gpu_data()); - } - // Ensure the copy is synchronous wrt the host, so that the next batch isn't - // copied in meanwhile. - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - // Reshape to loaded data. - top[0]->ReshapeLike(batch->data_); - // Copy the data - greentea_copy(batch->data_.count(), - (cl_mem) (batch->data_.gpu_data()), 0, - (cl_mem) (top[0]->mutable_gpu_data()), 0, &ctx); - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(batch->label_); - // Copy the labels. - greentea_copy(batch->label_.count(), - (cl_mem) (batch->label_.gpu_data()), 0, - (cl_mem) (top[1]->mutable_gpu_data()), 0, &ctx); - } -#endif // USE_GREENTEA + // Reshape to loaded data. 
+ top[0]->ReshapeLike(batch->data_); + // Copy the data + caffe_copy(batch->data_.count(), batch->data_.gpu_data(), + top[0]->mutable_gpu_data()); + if (this->output_labels_) { + // Reshape to loaded labels. + top[1]->ReshapeLike(batch->label_); + // Copy the labels. + caffe_copy(batch->label_.count(), batch->label_.gpu_data(), + top[1]->mutable_gpu_data()); } - prefetch_free_.push(batch); } diff --git a/src/caffe/layers/batch_norm_layer.cu b/src/caffe/layers/batch_norm_layer.cu index b2142bbac72..4034cd66b44 100644 --- a/src/caffe/layers/batch_norm_layer.cu +++ b/src/caffe/layers/batch_norm_layer.cu @@ -14,386 +14,165 @@ void BatchNormLayer::Forward_gpu(const vector*>& bottom, int_tp num = bottom[0]->shape(0); int_tp spatial_dim = bottom[0]->count() / (channels_ * bottom[0]->shape(0)); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (bottom[0] != top[0]) { - caffe_copy(bottom[0]->count(), bottom_data, top_data); - } - - if (use_global_stats_) { - // use the stored mean/variance estimates. - const Dtype scale_factor = - this->blobs_[2]->cpu_data()[0] == 0 ? - 0 : 1 / this->blobs_[2]->cpu_data()[0]; - caffe_gpu_scale(variance_.count(), scale_factor, - this->blobs_[0]->gpu_data(), mean_.mutable_gpu_data()); - caffe_gpu_scale(variance_.count(), scale_factor, - this->blobs_[1]->gpu_data(), - variance_.mutable_gpu_data()); - } else { - // compute mean - caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, - 1. / (num * spatial_dim), bottom_data, - spatial_sum_multiplier_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemv(CblasTrans, num, channels_, 1., - num_by_chans_.gpu_data(), - batch_sum_multiplier_.gpu_data(), 0., - mean_.mutable_gpu_data()); - } - - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, - batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), - 0., num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, channels_ * num, - spatial_dim, 1, -1, num_by_chans_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 1., top_data); - - if (!use_global_stats_) { - // compute variance using var(X) = E((X-EX)^2) - caffe_gpu_powx(top[0]->count(), top_data, Dtype(2), - temp_.mutable_gpu_data()); // (X-EX)^2 - caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, - 1. / (num * spatial_dim), temp_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemv(CblasTrans, num, channels_, 1., - num_by_chans_.gpu_data(), - batch_sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E((X_EX)^2) - - // compute and save moving average - this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; - this->blobs_[2]->mutable_cpu_data()[0] += 1; - caffe_gpu_axpby(mean_.count(), Dtype(1), mean_.gpu_data(), - moving_average_fraction_, - this->blobs_[0]->mutable_gpu_data()); - int_tp m = bottom[0]->count() / channels_; - Dtype bias_correction_factor = m > 1 ? 
Dtype(m) / (m - 1) : 1; - caffe_gpu_axpby(variance_.count(), bias_correction_factor, - variance_.gpu_data(), moving_average_fraction_, - this->blobs_[1]->mutable_gpu_data()); - } - - // normalize variance - caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - // replicate variance to input size - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, - batch_sum_multiplier_.gpu_data(), - variance_.gpu_data(), 0., - num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, channels_ * num, - spatial_dim, 1, 1., num_by_chans_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - // TODO(cdoersch): The caching is only needed because later in-place layers - // might clobber the data. Can we skip this if they won't? - caffe_copy(x_norm_.count(), top_data, x_norm_.mutable_gpu_data()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - if (bottom[0] != top[0]) { - greentea_copy(bottom[0]->count(), (cl_mem) bottom_data, 0, - (cl_mem) top_data, 0, &ctx); - } - - if (use_global_stats_) { - // use the stored mean/variance estimates. - const Dtype scale_factor = - this->blobs_[2]->cpu_data()[0] == 0 ? - 0 : 1 / this->blobs_[2]->cpu_data()[0]; - greentea_gpu_scale(this->device_->id(), variance_.count(), - scale_factor, - (cl_mem) (this->blobs_[0]->gpu_data()), 0, - (cl_mem) (mean_.mutable_gpu_data()), 0); - greentea_gpu_scale(this->device_->id(), variance_.count(), - scale_factor, - (cl_mem) (this->blobs_[1]->gpu_data()), 0, - (cl_mem) (variance_.mutable_gpu_data()), 0); - } else { - // compute mean - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, - channels_ * num, spatial_dim, - 1. / (num * spatial_dim), (cl_mem) bottom_data, - 0, (cl_mem) (spatial_sum_multiplier_.gpu_data()), - 0, 0., - (cl_mem) (num_by_chans_.mutable_gpu_data()), 0); - greentea_gpu_gemv(this->device_->id(), CblasTrans, num, channels_, - 1., (cl_mem) (num_by_chans_.gpu_data()), 0, - (cl_mem) (batch_sum_multiplier_.gpu_data()), 0, - 0., (cl_mem) (mean_.mutable_gpu_data()), 0); - } - - // subtract mean - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - num, channels_, 1, 1, - (cl_mem) (batch_sum_multiplier_.gpu_data()), 0, - (cl_mem) (mean_.gpu_data()), 0, 0., - (cl_mem) (num_by_chans_.mutable_gpu_data()), 0); - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - channels_ * num, spatial_dim, 1, -1, - (cl_mem) (num_by_chans_.gpu_data()), 0, - (cl_mem) (spatial_sum_multiplier_.gpu_data()), 0, - 1., (cl_mem) top_data, 0); - - if (!use_global_stats_) { - // compute variance using var(X) = E((X-EX)^2) - greentea_gpu_powx(this->device_->id(), top[0]->count(), - (cl_mem) top_data, 0, Dtype(2), - (cl_mem) (temp_.mutable_gpu_data()), 0); - // (X-EX)^2 - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, - channels_ * num, spatial_dim, - 1. 
/ (num * spatial_dim), - (cl_mem) (temp_.gpu_data()), 0, - (cl_mem) (spatial_sum_multiplier_.gpu_data()), 0, - 0., (cl_mem) (num_by_chans_.mutable_gpu_data()), - 0); - greentea_gpu_gemv(this->device_->id(), CblasTrans, num, channels_, - 1., (cl_mem) (num_by_chans_.gpu_data()), 0, - (cl_mem) (batch_sum_multiplier_.gpu_data()), 0, - 0., (cl_mem) (variance_.mutable_gpu_data()), 0); - // E((X_EX)^2) - - // compute and save moving average - this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; - this->blobs_[2]->mutable_cpu_data()[0] += 1; - greentea_gpu_axpby(this->device_->id(), mean_.count(), Dtype(1), - (cl_mem) (mean_.gpu_data()), 0, - moving_average_fraction_, - (cl_mem) (this->blobs_[0]->mutable_gpu_data()), - 0); - int_tp m = bottom[0]->count() / channels_; - Dtype bias_correction_factor = m > 1 ? Dtype(m) / (m - 1) : 1; - greentea_gpu_axpby(this->device_->id(), variance_.count(), - bias_correction_factor, - (cl_mem) (variance_.gpu_data()), 0, - moving_average_fraction_, - (cl_mem) (this->blobs_[1]->mutable_gpu_data()), - 0); - } - - // normalize variance - greentea_gpu_add_scalar(this->device_->id(), variance_.count(), eps_, - (cl_mem) (variance_.mutable_gpu_data()), 0); - greentea_gpu_powx(this->device_->id(), variance_.count(), - (cl_mem) (variance_.gpu_data()), 0, Dtype(0.5), - (cl_mem) (variance_.mutable_gpu_data()), 0); - - // replicate variance to input size - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - num, channels_, 1, 1, - (cl_mem) (batch_sum_multiplier_.gpu_data()), 0, - (cl_mem) (variance_.gpu_data()), 0, 0., - (cl_mem) (num_by_chans_.mutable_gpu_data()), 0); - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - channels_ * num, spatial_dim, 1, 1., - (cl_mem) (num_by_chans_.gpu_data()), 0, - (cl_mem) (spatial_sum_multiplier_.gpu_data()), 0, - 0., (cl_mem) (temp_.mutable_gpu_data()), 0); - greentea_gpu_div(this->device_->id(), temp_.count(), - (cl_mem) top_data, 0, (cl_mem) (temp_.gpu_data()), - 0, (cl_mem) top_data, 0); - // TODO(cdoersch): The caching is only needed because later in-place layers - // might clobber the data. Can we skip this if they won't? - greentea_copy(x_norm_.count(), (cl_mem) top_data, 0, - (cl_mem) (x_norm_.mutable_gpu_data()), 0, &ctx); -#endif // USE_GREENTEA + if (bottom[0] != top[0]) { + caffe_copy(bottom[0]->count(), bottom_data, top_data); } -} - -template -void BatchNormLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff; - - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (bottom[0] != top[0]) { - top_diff = top[0]->gpu_diff(); - } else { - caffe_copy(x_norm_.count(), top[0]->gpu_diff(), - x_norm_.mutable_gpu_diff()); - top_diff = x_norm_.gpu_diff(); - } - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (use_global_stats_) { - caffe_gpu_div(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); - return; - } - const Dtype* top_data = x_norm_.gpu_data(); - int_tp num = bottom[0]->shape()[0]; - int_tp spatial_dim = bottom[0]->count() / (channels_ * bottom[0]->shape(0)); - // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then - // - // dE(Y)/dX = - // (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y) - // ./ sqrt(var(X) + eps) - // - // where \cdot and ./ are hadamard product and elementwise division, - // respectively, dE/dY is the top diff, and mean/var/sum are all computed - // along all dimensions except the channels dimension. 
In the above - // equation, the operations allow for expansion (i.e. broadcast) along all - // dimensions except the channels dimension where required. - // sum(dE/dY \cdot Y) - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., - bottom_diff, spatial_sum_multiplier_.gpu_data(), 0., + if (use_global_stats_) { + // use the stored mean/variance estimates. + const Dtype scale_factor = + this->blobs_[2]->cpu_data()[0] == 0 ? + 0 : 1 / this->blobs_[2]->cpu_data()[0]; + caffe_gpu_scale(variance_.count(), scale_factor, + this->blobs_[0]->gpu_data(), mean_.mutable_gpu_data()); + caffe_gpu_scale(variance_.count(), scale_factor, + this->blobs_[1]->gpu_data(), + variance_.mutable_gpu_data()); + } else { + // compute mean + caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, + 1. / (num * spatial_dim), bottom_data, + spatial_sum_multiplier_.gpu_data(), 0., num_by_chans_.mutable_gpu_data()); caffe_gpu_gemv(CblasTrans, num, channels_, 1., num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + } - // reshape (broadcast) the above - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, - batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), - 0., num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, channels_ * num, - spatial_dim, 1, 1., num_by_chans_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 0., bottom_diff); - - // sum(dE/dY \cdot Y) \cdot Y - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y - caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., - top_diff, spatial_sum_multiplier_.gpu_data(), 0., + // subtract mean + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, + batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), + 0., num_by_chans_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, channels_ * num, + spatial_dim, 1, -1, num_by_chans_.gpu_data(), + spatial_sum_multiplier_.gpu_data(), 1., top_data); + + if (!use_global_stats_) { + // compute variance using var(X) = E((X-EX)^2) + caffe_gpu_powx(top[0]->count(), top_data, Dtype(2), + temp_.mutable_gpu_data()); // (X-EX)^2 + caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, + 1. / (num * spatial_dim), temp_.gpu_data(), + spatial_sum_multiplier_.gpu_data(), 0., num_by_chans_.mutable_gpu_data()); caffe_gpu_gemv(CblasTrans, num, channels_, 1., num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0., - mean_.mutable_gpu_data()); - // reshape (broadcast) the above to make - // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, - batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), - 0., num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num * channels_, - spatial_dim, 1, 1., num_by_chans_.gpu_data(), - spatial_sum_multiplier_.gpu_data(), 1., bottom_diff); - - // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, - Dtype(-1. / (num * spatial_dim)), bottom_diff); - - // note: temp_ still contains sqrt(var(X)+eps), computed during the forward - // pass. 
- caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), - bottom_diff); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - if (bottom[0] != top[0]) { - top_diff = top[0]->gpu_diff(); - } else { - greentea_copy(x_norm_.count(), (cl_mem) (top[0]->gpu_diff()), 0, - (cl_mem) (x_norm_.mutable_gpu_diff()), 0, &ctx); - top_diff = x_norm_.gpu_diff(); - } - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (use_global_stats_) { - greentea_gpu_div(this->device_->id(), temp_.count(), - (cl_mem) top_diff, 0, (cl_mem) (temp_.gpu_data()), - 0, (cl_mem) bottom_diff, 0); - return; - } - const Dtype* top_data = x_norm_.gpu_data(); - int_tp num = bottom[0]->shape()[0]; - int_tp spatial_dim = bottom[0]->count() / (channels_ * bottom[0]->shape(0)); - // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then - // - // dE(Y)/dX = - // (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y) - // ./ sqrt(var(X) + eps) - // - // where \cdot and ./ are hadamard product and elementwise division, - // respectively, dE/dY is the top diff, and mean/var/sum are all computed - // along all dimensions except the channels dimension. In the above - // equation, the operations allow for expansion (i.e. broadcast) along all - // dimensions except the channels dimension where required. - - // sum(dE/dY \cdot Y) - greentea_gpu_mul(this->device_->id(), temp_.count(), - (cl_mem) top_data, 0, (cl_mem) top_diff, 0, - (cl_mem) bottom_diff, 0); - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, channels_ * num, - spatial_dim, 1., (cl_mem) bottom_diff, 0, - (cl_mem) (spatial_sum_multiplier_.gpu_data()), 0, - 0., (cl_mem) (num_by_chans_.mutable_gpu_data()), - 0); - greentea_gpu_gemv(this->device_->id(), CblasTrans, num, channels_, - 1., (cl_mem) (num_by_chans_.gpu_data()), 0, - (cl_mem) (batch_sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (mean_.mutable_gpu_data()), 0); - - // reshape (broadcast) the above - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - num, channels_, 1, 1, - (cl_mem) (batch_sum_multiplier_.gpu_data()), 0, - (cl_mem) (mean_.gpu_data()), 0, 0., - (cl_mem) (num_by_chans_.mutable_gpu_data()), 0); - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - channels_ * num, spatial_dim, 1, 1., - (cl_mem) (num_by_chans_.gpu_data()), 0, - (cl_mem) (spatial_sum_multiplier_.gpu_data()), 0, - 0., (cl_mem) bottom_diff, 0); - - // sum(dE/dY \cdot Y) \cdot Y - greentea_gpu_mul(this->device_->id(), temp_.count(), - (cl_mem) top_data, 0, (cl_mem) bottom_diff, 0, - (cl_mem) bottom_diff, 0); + variance_.mutable_gpu_data()); // E((X_EX)^2) + + // compute and save moving average + this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; + this->blobs_[2]->mutable_cpu_data()[0] += 1; + caffe_gpu_axpby(mean_.count(), Dtype(1), mean_.gpu_data(), + moving_average_fraction_, + this->blobs_[0]->mutable_gpu_data()); + int_tp m = bottom[0]->count() / channels_; + Dtype bias_correction_factor = m > 1 ? 
Dtype(m) / (m - 1) : 1; + caffe_gpu_axpby(variance_.count(), bias_correction_factor, + variance_.gpu_data(), moving_average_fraction_, + this->blobs_[1]->mutable_gpu_data()); + } - // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, channels_ * num, - spatial_dim, 1., (cl_mem) top_diff, 0, - (cl_mem) (spatial_sum_multiplier_.gpu_data()), 0, - 0., (cl_mem) (num_by_chans_.mutable_gpu_data()), - 0); - greentea_gpu_gemv(this->device_->id(), CblasTrans, num, channels_, - 1., (cl_mem) (num_by_chans_.gpu_data()), 0, - (cl_mem) (batch_sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (mean_.mutable_gpu_data()), 0); - // reshape (broadcast) the above to make - // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - num, channels_, 1, 1, - (cl_mem) (batch_sum_multiplier_.gpu_data()), 0, - (cl_mem) (mean_.gpu_data()), 0, 0., - (cl_mem) (num_by_chans_.mutable_gpu_data()), 0); - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - num * channels_, spatial_dim, 1, 1., - (cl_mem) (num_by_chans_.gpu_data()), 0, - (cl_mem) (spatial_sum_multiplier_.gpu_data()), 0, - 1., (cl_mem) bottom_diff, 0); + // normalize variance + caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); + caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + variance_.mutable_gpu_data()); + + // replicate variance to input size + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, + batch_sum_multiplier_.gpu_data(), + variance_.gpu_data(), 0., + num_by_chans_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, channels_ * num, + spatial_dim, 1, 1., num_by_chans_.gpu_data(), + spatial_sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); + // TODO(cdoersch): The caching is only needed because later in-place layers + // might clobber the data. Can we skip this if they won't? + caffe_copy(x_norm_.count(), top_data, x_norm_.mutable_gpu_data()); +} - // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y - greentea_gpu_axpby(this->device_->id(), temp_.count(), Dtype(1), - (cl_mem) top_diff, 0, - Dtype(-1. / (num * spatial_dim)), - (cl_mem) bottom_diff, 0); +template +void BatchNormLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + const Dtype* top_diff; - // note: temp_ still contains sqrt(var(X)+eps), computed during the forward - // pass. 
- greentea_gpu_div(this->device_->id(), temp_.count(), - (cl_mem) bottom_diff, 0, - (cl_mem) (temp_.gpu_data()), 0, - (cl_mem) bottom_diff, 0); -#endif // USE_GREENTEA + if (bottom[0] != top[0]) { + top_diff = top[0]->gpu_diff(); + } else { + caffe_copy(x_norm_.count(), top[0]->gpu_diff(), + x_norm_.mutable_gpu_diff()); + top_diff = x_norm_.gpu_diff(); + } + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + if (use_global_stats_) { + caffe_gpu_div(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); + return; } + const Dtype* top_data = x_norm_.gpu_data(); + int_tp num = bottom[0]->shape()[0]; + int_tp spatial_dim = bottom[0]->count() / (channels_ * bottom[0]->shape(0)); + // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then + // + // dE(Y)/dX = + // (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y) + // ./ sqrt(var(X) + eps) + // + // where \cdot and ./ are hadamard product and elementwise division, + // respectively, dE/dY is the top diff, and mean/var/sum are all computed + // along all dimensions except the channels dimension. In the above + // equation, the operations allow for expansion (i.e. broadcast) along all + // dimensions except the channels dimension where required. + + // sum(dE/dY \cdot Y) + caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., + bottom_diff, spatial_sum_multiplier_.gpu_data(), 0., + num_by_chans_.mutable_gpu_data()); + caffe_gpu_gemv(CblasTrans, num, channels_, 1., + num_by_chans_.gpu_data(), + batch_sum_multiplier_.gpu_data(), 0., + mean_.mutable_gpu_data()); + + // reshape (broadcast) the above + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, + batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), + 0., num_by_chans_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, channels_ * num, + spatial_dim, 1, 1., num_by_chans_.gpu_data(), + spatial_sum_multiplier_.gpu_data(), 0., bottom_diff); + + // sum(dE/dY \cdot Y) \cdot Y + caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + + // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y + caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., + top_diff, spatial_sum_multiplier_.gpu_data(), 0., + num_by_chans_.mutable_gpu_data()); + caffe_gpu_gemv(CblasTrans, num, channels_, 1., + num_by_chans_.gpu_data(), + batch_sum_multiplier_.gpu_data(), 0., + mean_.mutable_gpu_data()); + // reshape (broadcast) the above to make + // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1, + batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), + 0., num_by_chans_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num * channels_, + spatial_dim, 1, 1., num_by_chans_.gpu_data(), + spatial_sum_multiplier_.gpu_data(), 1., bottom_diff); + + // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y + caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, + Dtype(-1. / (num * spatial_dim)), bottom_diff); + + // note: temp_ still contains sqrt(var(X)+eps), computed during the forward + // pass. 
+ caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), + bottom_diff); } INSTANTIATE_LAYER_GPU_FUNCS(BatchNormLayer); diff --git a/src/caffe/layers/batch_reindex_layer.cu b/src/caffe/layers/batch_reindex_layer.cu index 137f27d089f..9a546e57d27 100644 --- a/src/caffe/layers/batch_reindex_layer.cu +++ b/src/caffe/layers/batch_reindex_layer.cu @@ -28,8 +28,8 @@ void BatchReindexLayer::Forward_gpu(const vector*>& bottom, return; } if (this->device_->backend() == BACKEND_CUDA) { - int_tp threads = top[0]->count(); #ifdef USE_CUDA + int_tp threads = top[0]->count(); // NOLINT_NEXT_LINE(whitespace/operators) BRForward CUDA_KERNEL(CAFFE_GET_BLOCKS(threads), CAFFE_CUDA_NUM_THREADS) ( @@ -46,11 +46,18 @@ void BatchReindexLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_br = program.get_kernel( CL_KERNEL_SELECT("br_forward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom0 = clState.get_buffer_mem(bottom[0]->gpu_data()); + ClMemOff buf_bottom1 = clState.get_buffer_mem(bottom[1]->gpu_data()); + ClMemOff buf_top0 = + clState.get_buffer_mem(top[0]->mutable_gpu_data()); + viennacl::ocl::enqueue( oclk_br(top[0]->count(), bottom[0]->count() / bottom[0]->shape(0), - WrapHandle((cl_mem) (bottom[0]->gpu_data()), &ctx), - WrapHandle((cl_mem) (bottom[1]->gpu_data()), &ctx), - WrapHandle((cl_mem) (top[0]->mutable_gpu_data()), &ctx)), + WrapHandle(buf_bottom0.memobj, &ctx), + WrapHandle(buf_bottom1.memobj, &ctx), + WrapHandle(buf_top0.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -136,13 +143,23 @@ void BatchReindexLayer::Backward_gpu( viennacl::ocl::kernel &oclk_br = program.get_kernel( CL_KERNEL_SELECT("br_backward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_top0 = clState.get_buffer_mem(top[0]->gpu_diff()); + ClMemOff buf_top_indexes = + clState.get_buffer_mem(top_indexes.gpu_data()); + ClMemOff buf_begins = clState.get_buffer_mem(begins.gpu_data()); + ClMemOff buf_counts = clState.get_buffer_mem(counts.gpu_data()); + ClMemOff buf_bottom0 = + clState.get_buffer_mem(bottom[0]->mutable_gpu_diff()); + viennacl::ocl::enqueue( oclk_br(bottom[0]->count(), bottom[0]->count() / bottom[0]->shape(0), - WrapHandle((cl_mem)(top[0]->gpu_diff()), &ctx), - WrapHandle((cl_mem)(top_indexes.gpu_data()), &ctx), - WrapHandle((cl_mem)(begins.gpu_data()), &ctx), - WrapHandle((cl_mem)(counts.gpu_data()), &ctx), - WrapHandle((cl_mem)(bottom[0]->mutable_gpu_diff()), &ctx)), + WrapHandle(buf_top0.memobj, &ctx), + WrapHandle(buf_top_indexes.memobj, &ctx), + WrapHandle(buf_begins.memobj, &ctx), + WrapHandle(buf_counts.memobj, &ctx), + WrapHandle(buf_bottom0.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/bias_layer.cu b/src/caffe/layers/bias_layer.cu index 7ce6fd5db85..1c86ccd8e4b 100644 --- a/src/caffe/layers/bias_layer.cu +++ b/src/caffe/layers/bias_layer.cu @@ -7,7 +7,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -46,10 +45,15 @@ void BiasLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::program &program = this->device_->program(); viennacl::ocl::kernel &oclk_bias_forward = program.get_kernel( CL_KERNEL_SELECT("bias_forward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + ClMemOff buf_bias = clState.get_buffer_mem(bias_data); + viennacl::ocl::enqueue( - oclk_bias_forward(count, WrapHandle((cl_mem) 
bottom_data, &ctx), - WrapHandle((cl_mem) bias_data, &ctx), bias_dim_, - inner_dim_, WrapHandle((cl_mem) top_data, &ctx)), + oclk_bias_forward(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_bias.memobj, &ctx), bias_dim_, + inner_dim_, WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -59,9 +63,7 @@ template void BiasLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (propagate_down[0] && bottom[0] != top[0]) { + if (propagate_down[0] && bottom[0] != top[0]) { const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); caffe_copy(bottom[0]->count(), top_diff, bottom_diff); @@ -82,39 +84,6 @@ void BiasLayer::Backward_gpu(const vector*>& top, accum = true; } } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - if (propagate_down[0] && bottom[0] != top[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - greentea_copy(bottom[0]->count(), (cl_mem) top_diff, 0, - (cl_mem) bottom_diff, 0, &ctx); - } - // in-place, we don't need to do anything with the data diff - const bool bias_param = (bottom.size() == 1); - if ((!bias_param && propagate_down[1]) - || (bias_param && this->param_propagate_down_[0])) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bias_diff = (bias_param ? this->blobs_[0].get() : bottom[1]) - ->mutable_gpu_diff(); - bool accum = bias_param; - - int_tp top_diff_off = 0; - for (int_tp n = 0; n < outer_dim_; ++n) { - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, bias_dim_, - inner_dim_, Dtype(1), (cl_mem) top_diff, top_diff_off, - (cl_mem) (bias_multiplier_.gpu_data()), 0, - Dtype(accum), (cl_mem) bias_diff, 0); - top_diff_off += dim_; - accum = true; - } - } -#endif // USE_GREENTEA - } } INSTANTIATE_LAYER_GPU_FUNCS(BiasLayer); diff --git a/src/caffe/layers/bnll_layer.cu b/src/caffe/layers/bnll_layer.cu index c121497f7b0..090a3e716ce 100644 --- a/src/caffe/layers/bnll_layer.cu +++ b/src/caffe/layers/bnll_layer.cu @@ -5,7 +5,7 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/math_functions.hpp" #endif namespace caffe { @@ -44,9 +44,13 @@ void BNLLLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_bnll = program.get_kernel( CL_KERNEL_SELECT("bnll_forward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_bnll(count, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx)), + oclk_bnll(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -89,10 +93,16 @@ void BNLLLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_bnll = program.get_kernel( CL_KERNEL_SELECT("bnll_backward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + ClMemOff buf_bottomdata = clState.get_buffer_mem(bottom_data); + ClMemOff buf_bottomdiff = clState.get_buffer_mem(bottom_diff); + viennacl::ocl::enqueue( - oclk_bnll(count, WrapHandle((cl_mem) top_diff, &ctx), - WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) bottom_diff, &ctx)), + 
oclk_bnll(count, WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_bottomdata.memobj, &ctx), + WrapHandle(buf_bottomdiff.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu index a258c795ba4..fcaebb8ef98 100644 --- a/src/caffe/layers/concat_layer.cu +++ b/src/caffe/layers/concat_layer.cu @@ -5,7 +5,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -63,11 +62,15 @@ void ConcatLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_concat = program.get_kernel( CL_KERNEL_SELECT("concat")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_concat(nthreads, WrapHandle((cl_mem) bottom_data, &ctx), + oclk_concat(nthreads, WrapHandle(buf_bottom.memobj, &ctx), kForward ? 1 : 0, num_concats_, concat_input_size_, top_concat_axis, bottom_concat_axis, offset_concat_axis, - WrapHandle((cl_mem) top_data, &ctx)), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -109,11 +112,15 @@ void ConcatLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_concat = program.get_kernel( CL_KERNEL_SELECT("concat")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + viennacl::ocl::enqueue( - oclk_concat(nthreads, WrapHandle((cl_mem) top_diff, &ctx), + oclk_concat(nthreads, WrapHandle(buf_top.memobj, &ctx), kForward ? 1 : 0, num_concats_, concat_input_size_, top_concat_axis, bottom_concat_axis, offset_concat_axis, - WrapHandle((cl_mem) bottom_diff, &ctx)), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/contrastive_loss_layer.cu b/src/caffe/layers/contrastive_loss_layer.cu index e6df1b6c194..8593ffb47f1 100644 --- a/src/caffe/layers/contrastive_loss_layer.cu +++ b/src/caffe/layers/contrastive_loss_layer.cu @@ -6,7 +6,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -19,38 +18,16 @@ void ContrastiveLossLayer::Forward_gpu( const int_tp count = bottom[0]->count(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_sub(count, bottom[0]->gpu_data(), // a - bottom[1]->gpu_data(), // b - diff_.mutable_gpu_data()); // a_i-b_i - caffe_gpu_powx(count, diff_.mutable_gpu_data(), // a_i-b_i - Dtype(2), diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 - caffe_gpu_gemv(CblasNoTrans, bottom[0]->num(), bottom[0]->channels(), - Dtype(1.0), - diff_sq_.gpu_data(), // (a_i-b_i)^2 - summer_vec_.gpu_data(), Dtype(0.0), - dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_sub(this->device_->id(), count, - (cl_mem) (bottom[0]->gpu_data()), 0, - (cl_mem) (bottom[1]->gpu_data()), 0, - (cl_mem) (diff_.mutable_gpu_data()), 0); - greentea_gpu_powx(this->device_->id(), count, - (cl_mem) (diff_.mutable_gpu_data()), - 0, // a_i-b_i - Dtype(2), (cl_mem) (diff_sq_.mutable_gpu_data()), - 0); // (a_i-b_i)^2 - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, - bottom[0]->num(), bottom[0]->channels(), - Dtype(1.0), (cl_mem) (diff_sq_.gpu_data()), - 0, // (a_i-b_i)^2 - (cl_mem) (summer_vec_.gpu_data()), 0, Dtype(0.0), - 
(cl_mem) (dist_sq_.mutable_gpu_data()), 0); -#endif // USE_GREENTEA - } + caffe_gpu_sub(count, bottom[0]->gpu_data(), // a + bottom[1]->gpu_data(), // b + diff_.mutable_gpu_data()); // a_i-b_i + caffe_gpu_powx(count, diff_.mutable_gpu_data(), // a_i-b_i + Dtype(2), diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 + caffe_gpu_gemv(CblasNoTrans, bottom[0]->num(), bottom[0]->channels(), + Dtype(1.0), + diff_sq_.gpu_data(), // (a_i-b_i)^2 + summer_vec_.gpu_data(), Dtype(0.0), + dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 Dtype margin = this->layer_param_.contrastive_loss_param().margin(); Dtype loss(0.0); @@ -139,15 +116,24 @@ void ContrastiveLossLayer::Backward_gpu( viennacl::ocl::kernel &oclk_cll = program.get_kernel( legacy_version ? CL_KERNEL_SELECT("cll_backward_legacy") : CL_KERNEL_SELECT("cll_backward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom2 = + clState.get_buffer_mem(bottom[2]->gpu_data()); + ClMemOff buf_diff = + clState.get_buffer_mem(diff_.gpu_data()); + ClMemOff buf_dist_sq = + clState.get_buffer_mem(dist_sq_.gpu_data()); + ClMemOff buf_bottomi = + clState.get_buffer_mem(bottom[i]->mutable_gpu_diff()); + viennacl::ocl::enqueue( oclk_cll( count, channels, margin, alpha, - WrapHandle((cl_mem) (bottom[2]->gpu_data()), &ctx), - WrapHandle((cl_mem) (diff_.gpu_data()), &ctx), - WrapHandle((cl_mem) (dist_sq_.gpu_data()), &ctx), - WrapHandle((cl_mem) (bottom[i]->mutable_gpu_diff()), &ctx)), + WrapHandle(buf_bottom2.memobj, &ctx), + WrapHandle(buf_diff.memobj, &ctx), + WrapHandle(buf_dist_sq.memobj, &ctx), + WrapHandle(buf_bottomi.memobj, &ctx)), ctx.get_queue()); - #endif // USE_GREENTEA } } diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu index c0a50f18fd5..0d62919367e 100644 --- a/src/caffe/layers/conv_layer.cu +++ b/src/caffe/layers/conv_layer.cu @@ -4,8 +4,8 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_im2col.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/im2col.hpp" +#include "caffe/util/math_functions.hpp" #endif namespace caffe { diff --git a/src/caffe/layers/conv_layer_fft.cu b/src/caffe/layers/conv_layer_fft.cu index 60e1233502b..c121762ebf5 100644 --- a/src/caffe/layers/conv_layer_fft.cu +++ b/src/caffe/layers/conv_layer_fft.cu @@ -10,8 +10,8 @@ #include "caffe/layers/conv_fft_layer.hpp" #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_im2col.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/im2col.hpp" +#include "caffe/util/math_functions.hpp" // #define COMPLEX_MULT_CONJ_1D @@ -114,16 +114,17 @@ void ConvolutionLayerFFT::fft_gpu_setup() { template void ConvolutionLayerFFT::fft_gpu_clean() { if (fft_gpu_initialized_) { - clReleaseMemObject((cl_mem)fft_gpu_weights_complex_); + ClState& clState = Caffe::cl_state(); + clState.destroy_buffer(fft_gpu_weights_complex_); + clState.destroy_buffer(fft_gpu_map_in_real_all_channels_); + clState.destroy_buffer(fft_gpu_map_in_complex_all_channels_); + clState.destroy_buffer(fft_gpu_map_in_real_all_num_output_); + clState.destroy_buffer(fft_gpu_map_in_complex_all_num_output_); + clState.destroy_buffer(fft_gpu_map_out_complex_); + clState.destroy_buffer(fft_gpu_map_out_real_); #ifdef COMPLEX_NULT_CONJ_RESHAPE clReleaseMemObject(fft_gpu_weights_complex_reshape_); #endif - clReleaseMemObject((cl_mem)fft_gpu_map_in_real_all_channels_); - clReleaseMemObject((cl_mem)fft_gpu_map_in_complex_all_channels_); - 
clReleaseMemObject((cl_mem)fft_gpu_map_in_real_all_num_output_); - clReleaseMemObject((cl_mem)fft_gpu_map_in_complex_all_num_output_); - clReleaseMemObject((cl_mem)fft_gpu_map_out_complex_); - clReleaseMemObject((cl_mem)fft_gpu_map_out_real_); } fft_gpu_initialized_ = false; } @@ -411,12 +412,11 @@ void ConvolutionLayerFFT::Backward_gpu( Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); if (this->param_propagate_down_[0]) { - greentea_gpu_set(this->device_->id(), this->blobs_[0]->count(), Dtype(0), - (cl_mem)weight_diff, Dtype(0)); + caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff); } if (this->bias_term_ && this->param_propagate_down_[1]) { - greentea_gpu_set(this->device_->id(), this->blobs_[1]->count(), Dtype(0), - (cl_mem)this->blobs_[1]->mutable_gpu_diff(), Dtype(0)); + caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), + this->blobs_[1]->mutable_gpu_diff()); } diff --git a/src/caffe/layers/conv_layer_spatial.cu b/src/caffe/layers/conv_layer_spatial.cu index 1f17496b365..e67648529aa 100644 --- a/src/caffe/layers/conv_layer_spatial.cu +++ b/src/caffe/layers/conv_layer_spatial.cu @@ -12,8 +12,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/cl_kernels.hpp" #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_im2col.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif #include @@ -317,8 +315,13 @@ void ConvolutionLayerSpatial::swizzleWeights( cl_uint argIdx = 0; int_tp channels = this->channels_ / this->group_; - oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx)); - oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) swizzled_weights, &ctx)); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_weight = clState.get_buffer_mem(weight); + ClMemOff buf_swizzled = clState.get_buffer_mem(swizzled_weights); + + oclk_copy_weight.arg(argIdx++, WrapHandle(buf_weight.memobj, &ctx)); + oclk_copy_weight.arg(argIdx++, WrapHandle(buf_swizzled.memobj, &ctx)); oclk_copy_weight.arg(argIdx++, kernel_w_); oclk_copy_weight.arg(argIdx++, kernel_h_); oclk_copy_weight.arg(argIdx++, channels); @@ -367,7 +370,11 @@ void ConvolutionLayerSpatial::pad_image( int_tp col_data_offset = 0; int_tp channels = this->channels_; - oclk_copy.arg(argIdx++, WrapHandle((cl_mem) bottom_data, &ctx)); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_col = clState.get_buffer_mem(col_data); + + oclk_copy.arg(argIdx++, WrapHandle(buf_bottom.memobj, &ctx)); oclk_copy.arg(argIdx++, image_offset); oclk_copy.arg(argIdx++, channels); oclk_copy.arg(argIdx++, height_); @@ -376,7 +383,7 @@ void ConvolutionLayerSpatial::pad_image( oclk_copy.arg(argIdx++, padded_width_); oclk_copy.arg(argIdx++, pad_h_); oclk_copy.arg(argIdx++, pad_w_); - oclk_copy.arg(argIdx++, WrapHandle((cl_mem) col_data, &ctx)); + oclk_copy.arg(argIdx++, WrapHandle(buf_col.memobj, &ctx)); oclk_copy.arg(argIdx++, col_data_offset); oclk_copy.arg(argIdx++, imgNum); const size_t global_work_size_Copy[3] = { (size_t) padded_width_, @@ -474,23 +481,32 @@ cl_int ConvolutionLayerSpatial::convolve( * g; // Copy image + ClState& clState = Caffe::cl_state(); + ClMemOff buf_col = clState.get_buffer_mem(col_data); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_swizzled = clState.get_buffer_mem(swizzled_weights); + ClMemOff buf_weight = clState.get_buffer_mem(weight); + ClMemOff buf_bias = clState.get_buffer_mem(bias_); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + if (pad_w_ > 0 || pad_h_ > 0) { 
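// When padding is required, the convolution kernel reads the padded scratch
// buffer col_data (filled by pad_image below), so the first kernel argument
// binds buf_col.memobj; in the unpadded else-branch the input blob's
// buf_bottom.memobj is bound directly. Both handles come from the
// ClState::get_buffer_mem lookups above, which replace the raw (cl_mem)
// casts on virtual device pointers.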
pad_image(bottom, top, image_offset, config, numImages); image_offset = 0; - kernel.arg(argIdx++, WrapHandle((cl_mem) col_data, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_col.memobj, &ctx)); } else { - kernel.arg(argIdx++, WrapHandle((cl_mem) bottom_data, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_bottom.memobj, &ctx)); } kernel.arg(argIdx++, image_offset); if (config->swizzle_weights) - kernel.arg(argIdx++, WrapHandle((cl_mem) swizzled_weights, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_swizzled.memobj, &ctx)); else - kernel.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_weight.memobj, &ctx)); kernel.arg(argIdx++, kernel_offset); - kernel.arg(argIdx++, WrapHandle((cl_mem) bias_, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_bias.memobj, &ctx)); kernel.arg(argIdx++, bias_offset_); - kernel.arg(argIdx++, WrapHandle((cl_mem) top_data, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_top.memobj, &ctx)); kernel.arg(argIdx++, output_image_offset); + if (config->use_null_local) { err = clEnqueueNDRangeKernel(ctx.get_queue().handle().get(), kernel.handle().get(), 3, @@ -541,18 +557,27 @@ cl_int ConvolutionLayerSpatial::batched_convolve( * g; pad_image(bottom, top, image_offset, config, numImages); - kernel.arg(argIdx++, WrapHandle((cl_mem) col_data, &ctx)); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_col = clState.get_buffer_mem(col_data); + ClMemOff buf_swizzled = clState.get_buffer_mem(swizzled_weights); + ClMemOff buf_weight = clState.get_buffer_mem(weight); + ClMemOff buf_bias = clState.get_buffer_mem(bias_); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + + kernel.arg(argIdx++, WrapHandle(buf_col.memobj, &ctx)); kernel.arg(argIdx++, image_offset); if (config->swizzle_weights) - kernel.arg(argIdx++, WrapHandle((cl_mem) swizzled_weights, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_swizzled.memobj, &ctx)); else - kernel.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_weight.memobj, &ctx)); kernel.arg(argIdx++, kernel_offset); - kernel.arg(argIdx++, WrapHandle((cl_mem) bias_, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_bias.memobj, &ctx)); kernel.arg(argIdx++, bias_offset_); - kernel.arg(argIdx++, WrapHandle((cl_mem) top_data, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_top.memobj, &ctx)); kernel.arg(argIdx++, output_image_offset); kernel.arg(argIdx++, numImages); + if (config->use_null_local) { err = clEnqueueNDRangeKernel(ctx.get_queue().handle().get(), kernel.handle().get(), 3, diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu index 79006c1fa85..5c07f928f37 100644 --- a/src/caffe/layers/crop_layer.cu +++ b/src/caffe/layers/crop_layer.cu @@ -129,24 +129,34 @@ void CropLayer::crop_copy_gpu(const vector*>& bottom, const int_tp bottom_off = bottom[0]->offset(ind_off); Dtype* top_data = top[0]->mutable_gpu_data(); const int_tp top_off = top[0]->offset(indices); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( oclk_copy_crop(lines, height, width, src_outer_stride, src_inner_stride, dest_outer_stride, dest_inner_stride, - WrapHandle((cl_mem) bottom_data, &ctx), bottom_off, - WrapHandle((cl_mem) top_data, &ctx), top_off), + WrapHandle(buf_bottom.memobj, &ctx), bottom_off, + WrapHandle(buf_top.memobj, &ctx), top_off), ctx.get_queue()); } else { const Dtype* top_diff = top[0]->gpu_diff(); const int_tp top_off = 
top[0]->offset(indices); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); const int_tp bottom_off = bottom[0]->offset(ind_off); + + ClState& clState = Caffe::cl_state(); + ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_diff); + ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_diff); + viennacl::ocl::enqueue( oclk_copy_crop(lines, height, width, dest_outer_stride, dest_inner_stride, src_outer_stride, src_inner_stride, - WrapHandle((cl_mem) top_diff, &ctx), top_off, - WrapHandle((cl_mem) bottom_diff, &ctx), bottom_off, + WrapHandle(buf_top.memobj, &ctx), top_off, + WrapHandle(buf_bottom.memobj, &ctx), bottom_off), ctx.get_queue()); } } @@ -171,16 +181,8 @@ void CropLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); if (propagate_down[0]) { - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff); -#endif - } else { -#ifdef USE_GREENTEA - greentea_gpu_set(this->device_->id(), bottom[0]->count(), - static_cast<Dtype>(0), (cl_mem) bottom_diff, 0); -#endif - } + caffe_gpu_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff); + std::vector<int_tp> indices(top[0]->num_axes(), 0); crop_copy_gpu(bottom, top, offsets, indices, 0, top_diff, bottom_diff, false); diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu index 725d2f5b107..b54f0ab3446 100644 --- a/src/caffe/layers/deconv_layer.cu +++ b/src/caffe/layers/deconv_layer.cu @@ -4,8 +4,8 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_im2col.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/im2col.hpp" +#include "caffe/util/math_functions.hpp" #endif namespace caffe { diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu index aba3c790826..0d88670f84d 100644 --- a/src/caffe/layers/dropout_layer.cu +++ b/src/caffe/layers/dropout_layer.cu @@ -46,20 +46,27 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, viennacl::ocl::context &ctx = viennacl::ocl::get_context( this->device_->id()); viennacl::ocl::program &program = this->device_->program(); + if (this->phase_ == TRAIN) { - cl_mem mask = (cl_mem) (rand_vec_.mutable_gpu_data()); - greentea_gpu_rng_uniform(this->device_->id(), count, mask, 0); + uint_tp* mask = + static_cast<uint_tp*>(rand_vec_.mutable_gpu_data()); + caffe_gpu_rng_uniform(count, reinterpret_cast<unsigned int*>(mask)); // set thresholds viennacl::ocl::kernel &oclk_dropout = program.get_kernel( CL_KERNEL_SELECT("dropout_forward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff<uint_tp> buf_mask = clState.get_buffer_mem(mask); + ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_dropout(count, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle(mask, &ctx), uint_thres_, scale_, - WrapHandle((cl_mem) top_data, &ctx)), + oclk_dropout(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_mask.memobj, &ctx), uint_thres_, scale_, + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); } else { - greentea_copy<Dtype>(count, (cl_mem) bottom_data, 0, (cl_mem) top_data, 0, - &ctx); + caffe_copy(count, bottom_data, top_data); } #endif // USE_GREENTEA } @@ -107,18 +114,24 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, viennacl::ocl::program &program = this->device_->program(); if (this->phase_ == TRAIN) { - cl_mem mask = (cl_mem) (rand_vec_.gpu_data()); + const uint_tp* mask = static_cast<const uint_tp*>(rand_vec_.gpu_data());
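// rand_vec_ now hands out an ordinary virtual device pointer (uint_tp*)
// rather than a raw cl_mem, so the backward pass resolves the mask through
// the same mapping as any Dtype blob. A minimal sketch of the lookup the
// enqueue below relies on (names match the call sites, not a full API):
//   ClMemOff<uint_tp> buf = Caffe::cl_state().get_buffer_mem(mask);
//   cl_mem handle = buf.memobj;  // what WrapHandle() wraps for the kernel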
const int_tp count = bottom[0]->count(); viennacl::ocl::kernel &oclk_dropout = program.get_kernel( CL_KERNEL_SELECT("dropout_backward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_diff); + ClMemOff<uint_tp> buf_mask = clState.get_buffer_mem(mask); + ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_diff); + viennacl::ocl::enqueue( - oclk_dropout(count, WrapHandle((cl_mem) top_diff, &ctx), - WrapHandle(mask, &ctx), uint_thres_, scale_, - WrapHandle((cl_mem) bottom_diff, &ctx)), + oclk_dropout(count, WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_mask.memobj, &ctx), uint_thres_, scale_, + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); } else { - greentea_copy<Dtype>(top[0]->count(), (cl_mem) top_diff, 0, - (cl_mem) bottom_diff, 0, &ctx); + caffe_copy(top[0]->count(), top_diff, bottom_diff); } #endif // USE_GREENTEA } diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu index a2688bbbbd0..8fefb1194a7 100644 --- a/src/caffe/layers/eltwise_layer.cu +++ b/src/caffe/layers/eltwise_layer.cu @@ -6,7 +6,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -87,50 +86,52 @@ void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, viennacl::ocl::program &program = this->device_->program(); switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: { - greentea_gpu_mul<Dtype>(this->device_->id(), - count, (cl_mem)(bottom[0]->gpu_data()), 0, - (cl_mem)(bottom[1]->gpu_data()), 0, - (cl_mem)top_data, 0); + case EltwiseParameter_EltwiseOp_PROD: + caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + top_data); for (int_tp i = 2; i < bottom.size(); ++i) { - greentea_gpu_mul<Dtype>(this->device_->id(), - count, (cl_mem)top_data, 0, - (cl_mem)(bottom[i]->gpu_data()), 0, - (cl_mem)top_data, 0); + caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); } - } - break; - case EltwiseParameter_EltwiseOp_SUM: { - greentea_gpu_set<Dtype>(this->device_->id(), count, 0, - (cl_mem)top_data, 0); + break; + case EltwiseParameter_EltwiseOp_SUM: + caffe_gpu_set(count, Dtype(0.), top_data); + // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1?
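// The loop below accumulates top = sum_i coeffs_[i] * bottom[i] using one
// BLAS-style axpy (y <- alpha * x + y) per input blob; top_data was zeroed
// just above, so the first axpy simply writes coeffs_[0] * bottom[0].
// Example: two inputs with coeffs {1, -1} yield an elementwise difference.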
for (int_tp i = 0; i < bottom.size(); ++i) { - greentea_gpu_axpy(this->device_->id(), - count, coeffs_[i], - (cl_mem)(bottom[i]->gpu_data()), - 0, (cl_mem)top_data, 0); + caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); } - } - break; + break; case EltwiseParameter_EltwiseOp_MAX: { mask = max_idx_.mutable_gpu_data(); viennacl::ocl::kernel &oclk_max_forward = program.get_kernel( CL_KERNEL_SELECT("eltwise_max_forward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom0 = + clState.get_buffer_mem(bottom[0]->gpu_data()); + ClMemOff buf_bottom1 = + clState.get_buffer_mem(bottom[1]->gpu_data()); + ClMemOff buf_top = + clState.get_buffer_mem(top_data); + ClMemOff buf_mask = + clState.get_buffer_mem(mask); + viennacl::ocl::enqueue( oclk_max_forward(count, - WrapHandle((cl_mem)(bottom[0]->gpu_data()), &ctx), - WrapHandle((cl_mem)(bottom[1]->gpu_data()), &ctx), (int_tp)0, - WrapHandle((cl_mem)top_data, &ctx), - WrapHandle((cl_mem)mask, &ctx)), + WrapHandle(buf_bottom0.memobj, &ctx), + WrapHandle(buf_bottom1.memobj, &ctx), (int_tp)0, + WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_mask.memobj, &ctx)), ctx.get_queue()); for (int_tp i = 2; i < bottom.size(); ++i) { + ClMemOff buf_bottomi = + clState.get_buffer_mem(bottom[i]->gpu_data()); viennacl::ocl::enqueue( - oclk_max_forward(count, WrapHandle((cl_mem)(top_data), &ctx), - WrapHandle((cl_mem)(bottom[i]->gpu_data()), &ctx), i-1, - WrapHandle((cl_mem)top_data, &ctx), - WrapHandle((cl_mem)mask, &ctx)), + oclk_max_forward(count, WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_bottomi.memobj, &ctx), i-1, + WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_mask.memobj, &ctx)), ctx.get_queue()); } } @@ -225,7 +226,7 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, const Dtype* bottom_data = bottom[i]->gpu_data(); Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: { + case EltwiseParameter_EltwiseOp_PROD: if (stable_prod_grad_) { bool initialized = false; for (int_tp j = 0; j < bottom.size(); ++j) { @@ -233,51 +234,43 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, continue; } if (!initialized) { - greentea_copy(count, - (cl_mem)(bottom[j]->gpu_data()), 0, - (cl_mem)(bottom_diff), 0, &ctx); + caffe_copy(count, bottom[j]->gpu_data(), bottom_diff); initialized = true; } else { - greentea_gpu_mul(this->device_->id(), count, - (cl_mem)bottom[j]->gpu_data(), 0, - (cl_mem)bottom_diff, 0, - (cl_mem)bottom_diff, 0); + caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, + bottom_diff); } } } else { - greentea_gpu_div(this->device_->id(), - count, (cl_mem)top_data, 0, - (cl_mem)bottom_data, 0, (cl_mem)bottom_diff, 0); + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); } - greentea_gpu_mul(this->device_->id(), - count, (cl_mem)bottom_diff, 0, - (cl_mem)top_diff, 0, (cl_mem)bottom_diff, 0); - } - break; - case EltwiseParameter_EltwiseOp_SUM: { + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + break; + case EltwiseParameter_EltwiseOp_SUM: if (coeffs_[i] == Dtype(1.)) { - greentea_copy(count, (cl_mem)top_diff, - 0, (cl_mem)bottom_diff, 0, &ctx); + caffe_copy(count, top_diff, bottom_diff); } else { - greentea_gpu_scale(this->device_->id(), - count, coeffs_[i], (cl_mem)top_diff, - 0, (cl_mem)bottom_diff, 0); + caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); } - } - break; + break; case EltwiseParameter_EltwiseOp_MAX: { mask = max_idx_.gpu_data(); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = 
clState.get_buffer_mem(bottom_diff); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + ClMemOff buf_mask = clState.get_buffer_mem(mask); + viennacl::ocl::kernel &oclk_max_backward = program.get_kernel( CL_KERNEL_SELECT("eltwise_max_backward")); viennacl::ocl::enqueue( - oclk_max_backward(count, WrapHandle((cl_mem)top_diff, &ctx), i, - WrapHandle((cl_mem)mask, &ctx), - WrapHandle((cl_mem)bottom_diff, &ctx)), + oclk_max_backward(count, WrapHandle(buf_top.memobj, &ctx), i, + WrapHandle(buf_mask.memobj, &ctx), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); - } - break; + } + break; default: { LOG(FATAL)<< "Unknown elementwise operation."; } diff --git a/src/caffe/layers/elu_layer.cu b/src/caffe/layers/elu_layer.cu index 0b57cf83379..7c77379f58d 100644 --- a/src/caffe/layers/elu_layer.cu +++ b/src/caffe/layers/elu_layer.cu @@ -40,9 +40,14 @@ void ELULayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_elu = program.get_kernel( CL_KERNEL_SELECT("elu_forward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_elu(count, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx), alpha), + oclk_elu(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx), alpha), ctx.get_queue()); #endif // USE_GREENTEA } @@ -88,11 +93,17 @@ void ELULayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_elu = program.get_kernel( CL_KERNEL_SELECT("elu_backward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom_diff = clState.get_buffer_mem(bottom_diff); + ClMemOff buf_top_diff = clState.get_buffer_mem(top_diff); + ClMemOff buf_bottom_data = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top_data = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_elu(count, WrapHandle((cl_mem) top_diff, &ctx), - WrapHandle((cl_mem) top_data, &ctx), - WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) bottom_diff, &ctx), alpha), + oclk_elu(count, WrapHandle(buf_top_diff.memobj, &ctx), + WrapHandle(buf_top_data.memobj, &ctx), + WrapHandle(buf_bottom_data.memobj, &ctx), + WrapHandle(buf_bottom_diff.memobj, &ctx), alpha), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu index 7d479a0ec67..8baafc7d30e 100644 --- a/src/caffe/layers/embed_layer.cu +++ b/src/caffe/layers/embed_layer.cu @@ -9,7 +9,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif @@ -70,18 +69,22 @@ void EmbedLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_embed = program.get_kernel( CL_KERNEL_SELECT("embed_forward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_weight = clState.get_buffer_mem(weight); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_embed(count, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) weight, &ctx), M_, N_, K_, - WrapHandle((cl_mem) top_data, &ctx)), + oclk_embed(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_weight.memobj, &ctx), M_, N_, K_, + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); if (bias_term_) { - greentea_gpu_gemm(this->get_device()->id(), CblasNoTrans, - CblasNoTrans, M_, N_, 1, Dtype(1), - (cl_mem) (bias_multiplier_.gpu_data()), 0, - (cl_mem) 
(this->blobs_[1]->gpu_data()), 0, - Dtype(1), (cl_mem) top_data, 0); + caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1), + bias_multiplier_.gpu_data(), + this->blobs_[1]->gpu_data(), Dtype(1), top_data); } #endif // USE_GREENTEA @@ -111,10 +114,16 @@ void EmbedLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, viennacl::ocl::kernel &oclk_embed = program.get_kernel( CL_KERNEL_SELECT("embed_backward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_diff); + ClMemOff<Dtype> buf_weight = clState.get_buffer_mem(weight_diff); + viennacl::ocl::enqueue( - oclk_embed(top_count, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) top_diff, &ctx), M_, N_, K_, - WrapHandle((cl_mem) weight_diff, &ctx)), + oclk_embed(top_count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx), M_, N_, K_, + WrapHandle(buf_weight.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -122,19 +131,8 @@ void EmbedLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - if (this->get_device()->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, Dtype(1), top_diff, bias_multiplier_.gpu_data(), Dtype(1), bias_diff); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_gemv<Dtype>(this->get_device()->id(), CblasTrans, M_, N_, - Dtype(1), (cl_mem) top_diff, 0, - (cl_mem) (bias_multiplier_.gpu_data()), 0, - Dtype(1), (cl_mem) bias_diff, 0); -#endif // USE_GREENTEA - } } } diff --git a/src/caffe/layers/euclidean_loss_layer.cu b/src/caffe/layers/euclidean_loss_layer.cu index 07b1e7fda8e..54034f3435f 100644 --- a/src/caffe/layers/euclidean_loss_layer.cu +++ b/src/caffe/layers/euclidean_loss_layer.cu @@ -5,7 +5,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -16,35 +15,14 @@ void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, int_tp count = bottom[0]->count(); Dtype dot; - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_sub(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - // Scale the error element-wise - if (bottom.size() == 3) { - caffe_gpu_mul(count, diff_.mutable_gpu_data(), - bottom[2]->gpu_data(), diff_.mutable_gpu_data()); - } - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_sub<Dtype>(this->device_->id(), count, - (cl_mem) (bottom[0]->gpu_data()), 0, - (cl_mem) (bottom[1]->gpu_data()), 0, - (cl_mem) (diff_.mutable_gpu_data()), 0); - // Scale the error element-wise - if (bottom.size() == 3) { - greentea_gpu_mul<Dtype>(this->device_->id(), count, - (cl_mem) (diff_.mutable_gpu_data()), 0, - (cl_mem) (bottom[2]->gpu_data()), 0, - (cl_mem) (diff_.mutable_gpu_data()), 0); - } - greentea_gpu_dot<Dtype>(this->device_->id(), count, - (cl_mem) (diff_.gpu_data()), 0, - (cl_mem) (diff_.gpu_data()), 0, &dot); -#endif // USE_GREENTEA + caffe_gpu_sub(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + diff_.mutable_gpu_data()); + // Scale the error element-wise + if (bottom.size() == 3) { + caffe_gpu_mul(count, diff_.mutable_gpu_data(), + bottom[2]->gpu_data(), diff_.mutable_gpu_data()); } + caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot);
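// At this point dot holds sum_i (a_i - b_i)^2, so the next statement
// computes E = dot / (2 * N) with N = bottom[0]->count(0). Worked example:
// a = {1, 3}, b = {0, 1} gives dot = 1^2 + 2^2 = 5 and, with N = 2,
// E = 5 / 4 = 1.25.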
Dtype loss = dot / static_cast<Dtype>(bottom[0]->count(0)) / Dtype(2); top[0]->mutable_cpu_data()[0] = loss; } @@ -58,21 +36,12 @@ void EuclideanLossLayer<Dtype>::Backward_gpu( const Dtype sign = (i == 0) ? 1 : -1; const Dtype alpha = sign * top[0]->cpu_diff()[0] / static_cast<Dtype>(bottom[0]->count(0)); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_axpby(bottom[i]->count(), // count - alpha, // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_gpu_diff()); // b -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_axpby<Dtype>(this->device_->id(), bottom[i]->count(), alpha, - (cl_mem) (diff_.gpu_data()), 0, Dtype(0), - (cl_mem) (bottom[i]->mutable_gpu_diff()), 0); -#endif // USE_GREENTEA - } + + caffe_gpu_axpby(bottom[i]->count(), // count + alpha, // alpha + diff_.gpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_gpu_diff()); // b } } } diff --git a/src/caffe/layers/exp_layer.cu b/src/caffe/layers/exp_layer.cu index baf1e3f70d5..1144e643c92 100644 --- a/src/caffe/layers/exp_layer.cu +++ b/src/caffe/layers/exp_layer.cu @@ -12,36 +12,14 @@ void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (inner_scale_ == Dtype(1)) { - caffe_gpu_exp(count, bottom_data, top_data); - } else { - caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); - caffe_gpu_exp(count, top_data, top_data); - } - if (outer_scale_ != Dtype(1)) { - caffe_gpu_scal(count, outer_scale_, top_data); - } -#endif // USE_CUDA + if (inner_scale_ == Dtype(1)) { + caffe_gpu_exp(count, bottom_data, top_data); } else { -#ifdef USE_GREENTEA - if (inner_scale_ == Dtype(1)) { - greentea_gpu_exp<Dtype>(this->device_->id(), count, - (cl_mem) bottom_data, 0, (cl_mem) top_data, 0); - } else { - greentea_gpu_scale<Dtype>(this->device_->id(), - count, inner_scale_, - (cl_mem) bottom_data, 0, (cl_mem) top_data, 0); - greentea_gpu_exp<Dtype>(this->device_->id(), count, - (cl_mem) top_data, 0, (cl_mem) top_data, 0); - } - if (outer_scale_ != Dtype(1)) { - greentea_gpu_scal<Dtype>(this->device_->id(), - count, outer_scale_, - (cl_mem) top_data, 0); - } -#endif // USE_GREENTEA + caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); + caffe_gpu_exp(count, top_data, top_data); + } + if (outer_scale_ != Dtype(1)) { + caffe_gpu_scal(count, outer_scale_, top_data); } } @@ -57,23 +35,9 @@ void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_mul(count, top_data, top_diff, bottom_diff); - if (inner_scale_ != Dtype(1)) { - caffe_gpu_scal(count, inner_scale_, bottom_diff); - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_mul<Dtype>(this->device_->id(), count, - (cl_mem) top_data, 0, (cl_mem) top_diff, 0, - (cl_mem) bottom_diff, 0); - if (inner_scale_ != Dtype(1)) { - greentea_gpu_scal<Dtype>(this->device_->id(), count, inner_scale_, - (cl_mem) bottom_diff, 0); - } -#endif // USE_GREENTEA + caffe_gpu_mul(count, top_data, top_diff, bottom_diff); + if (inner_scale_ != Dtype(1)) { + caffe_gpu_scal(count, inner_scale_, bottom_diff); } } diff --git a/src/caffe/layers/filter_layer.cu b/src/caffe/layers/filter_layer.cu index b2bb13aa6cf..ac77fa6cc4e 100644 --- a/src/caffe/layers/filter_layer.cu +++ b/src/caffe/layers/filter_layer.cu @@ -18,20 +18,8 @@ void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, int_tp
data_offset_top = n * dim; int_tp data_offset_bottom = indices_to_forward_[n] * dim; - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - greentea_copy<Dtype>(dim, (cl_mem) bottom_data, data_offset_bottom, - (cl_mem) top_data, data_offset_top, &ctx); -#endif // USE_GREENTEA - } + caffe_copy(dim, bottom_data + data_offset_bottom, + top_data + data_offset_top); } } } @@ -54,59 +42,26 @@ void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, int_tp data_offset_bottom = 0; int_tp data_offset_top = 0; - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - for (int_tp n = 0; n < bottom[i]->shape(0); ++n) { - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - data_offset_bottom = n * dim; + for (int_tp n = 0; n < bottom[i]->shape(0); ++n) { + if (next_to_backward_offset >= indices_to_forward_.size()) { + // we already visited all items that were forwarded, so + // just set the remaining ones to zero + data_offset_bottom = n * dim; + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { + batch_offset = indices_to_forward_[next_to_backward_offset]; + data_offset_bottom = n * dim; + if (n != batch_offset) { // this data was not forwarded caffe_gpu_set(dim, Dtype(0), bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - data_offset_bottom = n * dim; - if (n != batch_offset) { // this data was not been forwarded - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - ++next_to_backward_offset; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } - } - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - for (int_tp n = 0; n < bottom[i]->shape(0); ++n) { - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - data_offset_bottom = n * dim; - greentea_gpu_set(this->device_->id(), dim, Dtype(0), - (cl_mem)(bottom[i]->mutable_gpu_diff()), data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - data_offset_bottom = n * dim; - if (n != batch_offset) { // this data was not been forwarded - greentea_gpu_set(this->device_->id(), dim, Dtype(0), - (cl_mem)(bottom[i]->mutable_gpu_diff()), data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - ++next_to_backward_offset; // point to next forwarded item index - greentea_copy<Dtype>(dim, (cl_mem)(top[i]->mutable_gpu_diff()), - data_offset_top, - (cl_mem)(bottom[i]->mutable_gpu_diff()), - data_offset_bottom, &ctx); - } + } else { // this data was forwarded + data_offset_top = next_to_backward_offset * dim; + ++next_to_backward_offset; // point to next forwarded item index + caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, + bottom[i]->mutable_gpu_diff() +
data_offset_bottom); } } -#endif // USE_GREENTEA } } } diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu index a07ebb4d33b..099ccb08132 100644 --- a/src/caffe/layers/im2col_layer.cu +++ b/src/caffe/layers/im2col_layer.cu @@ -5,8 +5,7 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_im2col.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/math_functions.hpp" #endif namespace caffe { @@ -18,60 +17,23 @@ void Im2colLayer::Forward_gpu(const vector*>& bottom, Dtype* top_data = top[0]->mutable_gpu_data(); const int_tp num_kernels = channels_ * top[0]->count(channel_axis_ + 1); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - for (int_tp n = 0; n < num_; ++n) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - im2col_gpu(bottom_data + n * bottom_dim_, channels_, - bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - dilation_.cpu_data()[0], dilation_.cpu_data()[1], - top_data + n * top_dim_); - } else { - im2col_nd_gpu(bottom_data + n * bottom_dim_, num_spatial_axes_, - num_kernels, bottom[0]->gpu_shape() + channel_axis_, - top[0]->gpu_shape() + channel_axis_, - kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(), - dilation_.gpu_data(), top_data + n * top_dim_); - } + for (int_tp n = 0; n < num_; ++n) { + if (!force_nd_im2col_ && num_spatial_axes_ == 2) { + im2col_gpu(bottom_data + n * bottom_dim_, channels_, + bottom[0]->shape(channel_axis_ + 1), + bottom[0]->shape(channel_axis_ + 2), + kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], + pad_.cpu_data()[0], pad_.cpu_data()[1], + stride_.cpu_data()[0], stride_.cpu_data()[1], + dilation_.cpu_data()[0], dilation_.cpu_data()[1], + top_data + n * top_dim_); + } else { + im2col_nd_gpu(bottom_data + n * bottom_dim_, num_spatial_axes_, + num_kernels, bottom[0]->gpu_shape() + channel_axis_, + top[0]->gpu_shape() + channel_axis_, + kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(), + dilation_.gpu_data(), top_data + n * top_dim_); } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); - - for (int_tp n = 0; n < num_; ++n) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - greentea_im2col_gpu(&program, &ctx, (cl_mem) bottom_data, - n * bottom_dim_, channels_, - bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], - kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - dilation_.cpu_data()[0], - dilation_.cpu_data()[1], (cl_mem) top_data, - n * top_dim_); - } else { - greentea_im2col_nd_gpu(&program, &ctx, (cl_mem) bottom_data, - n * bottom_dim_, num_spatial_axes_, - channel_axis_, num_kernels, - (cl_mem) (bottom[0]->gpu_shape()), - (cl_mem) (top[0]->gpu_shape()), - (cl_mem) (kernel_shape_.gpu_data()), - (cl_mem) (pad_.gpu_data()), - (cl_mem) (stride_.gpu_data()), - (cl_mem) (dilation_.gpu_data()), - (cl_mem) top_data, n * top_dim_); - } - } -#endif // USE_GREENTEA } } @@ -82,61 +44,24 @@ void Im2colLayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if 
(this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - for (int n = 0; n < num_; ++n) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - col2im_gpu(top_diff + n * top_dim_, channels_, - bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - dilation_.cpu_data()[0], dilation_.cpu_data()[1], - bottom_diff + n * bottom_dim_); - } else { - col2im_nd_gpu(top_diff + n * top_dim_, num_spatial_axes_, bottom_dim_, - bottom[0]->gpu_shape() + channel_axis_, - top[0]->gpu_shape() + channel_axis_, - kernel_shape_.gpu_data(), pad_.gpu_data(), - stride_.gpu_data(), dilation_.gpu_data(), - bottom_diff + n * bottom_dim_); - } - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); - - for (int_tp n = 0; n < top[0]->num(); ++n) { - if (!force_nd_im2col_ && num_spatial_axes_ == 2) { - greentea_col2im_gpu(&program, &ctx, (cl_mem) top_diff, - n * top_dim_, channels_, - bottom[0]->shape(channel_axis_ + 1), - bottom[0]->shape(channel_axis_ + 2), - kernel_shape_.cpu_data()[0], - kernel_shape_.cpu_data()[1], - pad_.cpu_data()[0], pad_.cpu_data()[1], - stride_.cpu_data()[0], stride_.cpu_data()[1], - dilation_.cpu_data()[0], - dilation_.cpu_data()[1], - (cl_mem) bottom_diff, n * bottom_dim_); - } else { - greentea_col2im_nd_gpu(&program, &ctx, (cl_mem) top_diff, - n * top_dim_, num_spatial_axes_, - channel_axis_, bottom_dim_, - (cl_mem) (bottom[0]->gpu_shape()), - (cl_mem) (top[0]->gpu_shape()), - (cl_mem) (kernel_shape_.gpu_data()), - (cl_mem) (pad_.gpu_data()), - (cl_mem) (stride_.gpu_data()), - (cl_mem) (dilation_.gpu_data()), - (cl_mem) bottom_diff, n * bottom_dim_); - } + for (int n = 0; n < num_; ++n) { + if (!force_nd_im2col_ && num_spatial_axes_ == 2) { + col2im_gpu(top_diff + n * top_dim_, channels_, + bottom[0]->shape(channel_axis_ + 1), + bottom[0]->shape(channel_axis_ + 2), + kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], + pad_.cpu_data()[0], pad_.cpu_data()[1], + stride_.cpu_data()[0], stride_.cpu_data()[1], + dilation_.cpu_data()[0], dilation_.cpu_data()[1], + bottom_diff + n * bottom_dim_); + } else { + col2im_nd_gpu(top_diff + n * top_dim_, num_spatial_axes_, bottom_dim_, + bottom[0]->gpu_shape() + channel_axis_, + top[0]->gpu_shape() + channel_axis_, + kernel_shape_.gpu_data(), pad_.gpu_data(), + stride_.gpu_data(), dilation_.gpu_data(), + bottom_diff + n * bottom_dim_); } -#endif // USE_GREENTEA } } diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu index ca999436775..8e1a7d316fa 100644 --- a/src/caffe/layers/inner_product_layer.cu +++ b/src/caffe/layers/inner_product_layer.cu @@ -13,9 +13,7 @@ void InnerProductLayer::Forward_gpu(const vector*>& bottom, Dtype* top_data = top[0]->mutable_gpu_data(); const Dtype* weight = this->blobs_[0]->gpu_data(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (M_ == 1) { + if (M_ == 1) { caffe_gpu_gemv(CblasNoTrans, N_, K_, (Dtype) 1., weight, bottom_data, (Dtype) 0., top_data); if (bias_term_) @@ -32,42 +30,13 @@ void InnerProductLayer::Forward_gpu(const vector*>& bottom, this->blobs_[1]->gpu_data(), (Dtype) 1., top_data); } -#endif // USE CUDA - } else { -#ifdef USE_GREENTEA - if (M_ == 1) { - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, N_, - 
K_, (Dtype) 1., (cl_mem) weight, 0, - (cl_mem) bottom_data, 0, (Dtype) 0., - (cl_mem) top_data, 0); - if (bias_term_) - greentea_gpu_axpy(this->device_->id(), N_, - bias_multiplier_.cpu_data()[0], - (cl_mem) (this->blobs_[1]->gpu_data()), 0, - (cl_mem) top_data, 0); - } else { - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - transpose_ ? CblasNoTrans : CblasTrans, - M_, N_, K_, (Dtype) 1., - (cl_mem) bottom_data, 0, (cl_mem) weight, 0, - (Dtype) 0., (cl_mem) top_data, 0); - if (bias_term_) - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - CblasNoTrans, M_, N_, 1, (Dtype) 1., - (cl_mem) (bias_multiplier_.gpu_data()), 0, - (cl_mem) (this->blobs_[1]->gpu_data()), 0, - (Dtype) 1., (cl_mem) top_data, 0); - } -#endif // USE_GREENTEA - } } template void InnerProductLayer::Backward_gpu( const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA + if (this->param_propagate_down_[0]) { const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); @@ -104,58 +73,6 @@ void InnerProductLayer::Backward_gpu( (Dtype) 0., bottom[0]->mutable_gpu_diff()); } } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - if (this->param_propagate_down_[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Gradient with respect to weight - if (transpose_) { - greentea_gpu_gemm(this->device_->id(), CblasTrans, CblasNoTrans, - K_, N_, M_, (Dtype) 1., (cl_mem) bottom_data, - 0, (cl_mem) top_diff, 0, (Dtype) 1., - (cl_mem) (this->blobs_[0]->mutable_gpu_diff()), - 0); - } else { - greentea_gpu_gemm(this->device_->id(), CblasTrans, CblasNoTrans, - N_, K_, M_, (Dtype) 1., (cl_mem) top_diff, 0, - (cl_mem) bottom_data, 0, (Dtype) 1., - (cl_mem) (this->blobs_[0]->mutable_gpu_diff()), - 0); - } - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bias - greentea_gpu_gemv(this->device_->id(), CblasTrans, M_, N_, - (Dtype) 1., (cl_mem) top_diff, 0, - (cl_mem) (bias_multiplier_.gpu_data()), 0, - (Dtype) 1., - (cl_mem) (this->blobs_[1]->mutable_gpu_diff()), - 0); - } - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bottom data - if (transpose_) { - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - CblasTrans, M_, K_, N_, (Dtype) 1., - (cl_mem) top_diff, 0, - (cl_mem) (this->blobs_[0]->gpu_data()), 0, - (Dtype) 0., - (cl_mem) (bottom[0]->mutable_gpu_diff()), 0); - } else { - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - CblasNoTrans, M_, K_, N_, (Dtype) 1., - (cl_mem) top_diff, 0, - (cl_mem) (this->blobs_[0]->gpu_data()), 0, - (Dtype) 0., - (cl_mem) (bottom[0]->mutable_gpu_diff()), 0); - } - } -#endif // USE_GREENTEA - } } INSTANTIATE_LAYER_GPU_FUNCS(InnerProductLayer); diff --git a/src/caffe/layers/log_layer.cu b/src/caffe/layers/log_layer.cu index d22a1fbf503..48c91754eb8 100644 --- a/src/caffe/layers/log_layer.cu +++ b/src/caffe/layers/log_layer.cu @@ -12,51 +12,20 @@ void LogLayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { + if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { caffe_gpu_log(count, bottom_data, top_data); - } else { - caffe_copy(count, bottom_data, top_data); - 
if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, top_data); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, top_data); - } - caffe_gpu_log(count, top_data, top_data); - } - if (base_scale_ != Dtype(1)) { - caffe_gpu_scal(count, base_scale_, top_data); - } -#endif // USE_CUDA } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { - greentea_gpu_log(this->device_->id(), count, - (cl_mem) bottom_data, 0, (cl_mem) top_data, 0); - } else { - greentea_copy(count, (cl_mem) bottom_data, 0, (cl_mem) top_data, 0, - &ctx); - if (input_scale_ != Dtype(1)) { - greentea_gpu_scal(this->device_->id(), count, - input_scale_, (cl_mem) top_data, 0); - } - if (input_shift_ != Dtype(0)) { - greentea_gpu_add_scalar(this->device_->id(), count, - input_shift_, (cl_mem) top_data, 0); - } - greentea_gpu_log(this->device_->id(), count, - (cl_mem) top_data, 0, (cl_mem) top_data, 0); + caffe_copy(count, bottom_data, top_data); + if (input_scale_ != Dtype(1)) { + caffe_gpu_scal(count, input_scale_, top_data); } - if (base_scale_ != Dtype(1)) { - greentea_gpu_scal(this->device_->id(), count, base_scale_, - (cl_mem) top_data, 0); + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, top_data); } -#endif // USE_GREENTEA + caffe_gpu_log(count, top_data, top_data); + } + if (base_scale_ != Dtype(1)) { + caffe_gpu_scal(count, base_scale_, top_data); } } @@ -72,48 +41,18 @@ void LogLayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_copy(count, bottom_data, bottom_diff); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, bottom_diff); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, bottom_diff); - } - caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); - if (backward_num_scale_ != Dtype(1)) { - caffe_gpu_scal(count, backward_num_scale_, bottom_diff); - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - greentea_copy(count, (cl_mem) bottom_data, 0, (cl_mem) bottom_diff, - 0, &ctx); - if (input_scale_ != Dtype(1)) { - greentea_gpu_scal(this->device_->id(), count, input_scale_, - (cl_mem) bottom_diff, 0); - } - if (input_shift_ != Dtype(0)) { - greentea_gpu_add_scalar(this->device_->id(), count, - input_shift_, (cl_mem) bottom_diff, 0); - } - greentea_gpu_powx(this->device_->id(), count, - (cl_mem) bottom_diff, 0, Dtype(-1), - (cl_mem) bottom_diff, 0); - if (backward_num_scale_ != Dtype(1)) { - greentea_gpu_scal(this->device_->id(), count, - backward_num_scale_, (cl_mem) bottom_diff, 0); - } - greentea_gpu_mul(this->device_->id(), count, - (cl_mem) top_diff, 0, (cl_mem) bottom_diff, 0, - (cl_mem) bottom_diff, 0); -#endif // USE_GREENTEA + caffe_copy(count, bottom_data, bottom_diff); + if (input_scale_ != Dtype(1)) { + caffe_gpu_scal(count, input_scale_, bottom_diff); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, bottom_diff); + } + caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); + if (backward_num_scale_ != Dtype(1)) { + caffe_gpu_scal(count, backward_num_scale_, bottom_diff); } + caffe_gpu_mul(count, top_diff, 
bottom_diff, bottom_diff); } INSTANTIATE_LAYER_GPU_FUNCS(LogLayer); diff --git a/src/caffe/layers/lrn_layer.cu b/src/caffe/layers/lrn_layer.cu index 9d861434629..462d40e40f7 100644 --- a/src/caffe/layers/lrn_layer.cu +++ b/src/caffe/layers/lrn_layer.cu @@ -118,19 +118,25 @@ void LRNLayer::CrossChannelForward_gpu( int_tp n_threads = num_ * height_ * width_; viennacl::ocl::kernel &oclk_lrn_fill = program.get_kernel( CL_KERNEL_SELECT("lrn_fill_scale")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_scale = clState.get_buffer_mem(scale_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_lrn_fill(n_threads, WrapHandle((cl_mem) bottom_data, &ctx), num_, + oclk_lrn_fill(n_threads, WrapHandle(buf_bottom.memobj, &ctx), num_, channels_, height_, width_, size_, alpha_ / size_, k_, - WrapHandle((cl_mem) scale_data, &ctx)), + WrapHandle(buf_scale.memobj, &ctx)), ctx.get_queue()); - n_threads = bottom[0]->count(); viennacl::ocl::kernel &oclk_lrn_compute = program.get_kernel( CL_KERNEL_SELECT("lrn_compute_output")); + viennacl::ocl::enqueue( - oclk_lrn_compute(n_threads, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) scale_data, &ctx), -beta_, - WrapHandle((cl_mem) top_data, &ctx)), + oclk_lrn_compute(n_threads, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_scale.memobj, &ctx), -beta_, + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -243,14 +249,24 @@ void LRNLayer::CrossChannelBackward_gpu( viennacl::ocl::kernel &oclk_lrn = program.get_kernel( CL_KERNEL_SELECT("lrn_compute_diff")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom_data = + clState.get_buffer_mem(bottom[0]->gpu_data()); + ClMemOff buf_top_data = clState.get_buffer_mem(top[0]->gpu_data()); + ClMemOff buf_scale = clState.get_buffer_mem(scale_.gpu_data()); + ClMemOff buf_top_diff = clState.get_buffer_mem(top[0]->gpu_diff()); + ClMemOff buf_bottom_diff = + clState.get_buffer_mem(bottom[0]->mutable_gpu_diff()); + viennacl::ocl::enqueue( - oclk_lrn(n_threads, WrapHandle((cl_mem) (bottom[0]->gpu_data()), &ctx), - WrapHandle((cl_mem) (top[0]->gpu_data()), &ctx), - WrapHandle((cl_mem) (scale_.gpu_data()), &ctx), - WrapHandle((cl_mem) (top[0]->gpu_diff()), &ctx), num_, + oclk_lrn(n_threads, WrapHandle(buf_bottom_data.memobj, &ctx), + WrapHandle(buf_top_data.memobj, &ctx), + WrapHandle(buf_scale.memobj, &ctx), + WrapHandle(buf_top_diff.memobj, &ctx), num_, channels_, height_, width_, size_, -beta_, Dtype(2. 
* alpha_ * beta_ / size_), - WrapHandle((cl_mem) (bottom[0]->mutable_gpu_diff()), &ctx)), + WrapHandle(buf_bottom_diff.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/lstm_unit_layer.cu b/src/caffe/layers/lstm_unit_layer.cu index 1fc38b99c91..fa12fe583b5 100644 --- a/src/caffe/layers/lstm_unit_layer.cu +++ b/src/caffe/layers/lstm_unit_layer.cu @@ -87,18 +87,26 @@ void LSTMUnitLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_lstm_unit_forward = program.get_kernel( CL_KERNEL_SELECT("lstm_unit_forward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_X = clState.get_buffer_mem(X); + ClMemOff buf_X_acts = clState.get_buffer_mem(X_acts); + ClMemOff buf_C_prev = clState.get_buffer_mem(C_prev); + ClMemOff buf_cont = clState.get_buffer_mem(cont); + ClMemOff buf_C = clState.get_buffer_mem(C); + ClMemOff buf_H = clState.get_buffer_mem(H); + viennacl::ocl::enqueue( oclk_lstm_acts_forward(X_count, hidden_dim_, - WrapHandle((cl_mem)X, &ctx), - WrapHandle((cl_mem)X_acts, &ctx)), + WrapHandle(buf_X.memobj, &ctx), + WrapHandle(buf_X_acts.memobj, &ctx)), ctx.get_queue()); viennacl::ocl::enqueue( oclk_lstm_unit_forward(count, hidden_dim_, - WrapHandle((cl_mem)C_prev, &ctx), - WrapHandle((cl_mem)X_acts, &ctx), - WrapHandle((cl_mem)cont, &ctx), - WrapHandle((cl_mem)C, &ctx), - WrapHandle((cl_mem)H, &ctx)), + WrapHandle(buf_C_prev.memobj, &ctx), + WrapHandle(buf_X_acts.memobj, &ctx), + WrapHandle(buf_cont.memobj, &ctx), + WrapHandle(buf_C.memobj, &ctx), + WrapHandle(buf_H.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -198,20 +206,36 @@ void LSTMUnitLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_lstm_acts_backward = program.get_kernel( CL_KERNEL_SELECT("lstm_acts_backward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_X_acts = clState.get_buffer_mem(X_acts); + ClMemOff buf_C_prev = clState.get_buffer_mem(C_prev); + ClMemOff buf_cont = clState.get_buffer_mem(cont); + ClMemOff buf_C = clState.get_buffer_mem(C); + ClMemOff buf_H = clState.get_buffer_mem(H); + ClMemOff buf_C_diff = clState.get_buffer_mem(C_diff); + ClMemOff buf_H_diff = clState.get_buffer_mem(H_diff); + ClMemOff buf_C_prev_diff = clState.get_buffer_mem(C_prev_diff); + ClMemOff buf_X_acts_diff = clState.get_buffer_mem(X_acts_diff); + ClMemOff buf_X_diff = clState.get_buffer_mem(X_diff); + + viennacl::ocl::enqueue( oclk_lstm_unit_backward(count, hidden_dim_, - WrapHandle((cl_mem)C_prev, &ctx), WrapHandle((cl_mem)X_acts, &ctx), - WrapHandle((cl_mem)C, &ctx), WrapHandle((cl_mem)H, &ctx), - WrapHandle((cl_mem)cont, &ctx), WrapHandle((cl_mem)C_diff, &ctx), - WrapHandle((cl_mem)H_diff, &ctx), - WrapHandle((cl_mem)C_prev_diff, &ctx), - WrapHandle((cl_mem)X_acts_diff, &ctx)), + WrapHandle(buf_C_prev.memobj, &ctx), + WrapHandle(buf_X_acts.memobj, &ctx), + WrapHandle(buf_C.memobj, &ctx), + WrapHandle(buf_H.memobj, &ctx), + WrapHandle(buf_cont.memobj, &ctx), + WrapHandle(buf_C_diff.memobj, &ctx), + WrapHandle(buf_H_diff.memobj, &ctx), + WrapHandle(buf_C_prev_diff.memobj, &ctx), + WrapHandle(buf_X_acts_diff.memobj, &ctx)), ctx.get_queue()); viennacl::ocl::enqueue( oclk_lstm_acts_backward(X_count, hidden_dim_, - WrapHandle((cl_mem)X_acts, &ctx), - WrapHandle((cl_mem)X_acts_diff, &ctx), - WrapHandle((cl_mem)X_diff, &ctx)), + WrapHandle(buf_X_acts.memobj, &ctx), + WrapHandle(buf_X_acts_diff.memobj, &ctx), + WrapHandle(buf_X_diff.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/mergecrop_layer.cu 
b/src/caffe/layers/mergecrop_layer.cu index 0de956d2b84..9004343f1b6 100644 --- a/src/caffe/layers/mergecrop_layer.cu +++ b/src/caffe/layers/mergecrop_layer.cu @@ -6,7 +6,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -248,19 +247,26 @@ void MergeCropLayer::Forward_gpu(const vector*>& bottom, this->device_->id()); viennacl::ocl::program &program = this->device_->program(); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom_a = clState.get_buffer_mem(bottom_data_a); + ClMemOff buf_bottom_b = clState.get_buffer_mem(bottom_data_b); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + ClMemOff buf_shape_a = clState.get_buffer_mem(shape_a_.gpu_data()); + ClMemOff buf_shape_b = clState.get_buffer_mem(shape_b_.gpu_data()); + switch (op_) { case MergeCropParameter_MergeOp_STACK: { viennacl::ocl::kernel &oclk_copy_forward = program.get_kernel( CL_KERNEL_SELECT("merge_copy_forward_stack")); viennacl::ocl::enqueue( oclk_copy_forward(count, spatial_dims, - WrapHandle((cl_mem) bottom_data_a, &ctx), + WrapHandle(buf_bottom_a.memobj, &ctx), forward_[0], - WrapHandle((cl_mem) bottom_data_b, &ctx), - forward_[1], WrapHandle((cl_mem) top_data, &ctx), + WrapHandle(buf_bottom_b.memobj, &ctx), + forward_[1], WrapHandle(buf_top.memobj, &ctx), num, channels_a, channels_b, - WrapHandle((cl_mem) (shape_a_.gpu_data()), &ctx), - WrapHandle((cl_mem) (shape_b_.gpu_data()), &ctx)), + WrapHandle(buf_shape_a.memobj, &ctx), + WrapHandle(buf_shape_b.memobj, &ctx)), ctx.get_queue()); } break; @@ -269,13 +275,13 @@ void MergeCropLayer::Forward_gpu(const vector*>& bottom, CL_KERNEL_SELECT("merge_copy_forward_add")); viennacl::ocl::enqueue( oclk_copy_forward(count, spatial_dims, - WrapHandle((cl_mem) bottom_data_a, &ctx), + WrapHandle(buf_bottom_a.memobj, &ctx), forward_[0], - WrapHandle((cl_mem) bottom_data_b, &ctx), - forward_[1], WrapHandle((cl_mem) top_data, &ctx), + WrapHandle(buf_bottom_b.memobj, &ctx), + forward_[1], WrapHandle(buf_top.memobj, &ctx), num, channels_a, - WrapHandle((cl_mem) (shape_a_.gpu_data()), &ctx), - WrapHandle((cl_mem) (shape_b_.gpu_data()), &ctx)), + WrapHandle(buf_shape_a.memobj, &ctx), + WrapHandle(buf_shape_b.memobj, &ctx)), ctx.get_queue()); } break; @@ -335,18 +341,25 @@ void MergeCropLayer::Backward_gpu(const vector*>& top, this->device_->id()); viennacl::ocl::program &program = this->device_->program(); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom_a = clState.get_buffer_mem(bottom_diff_a); + ClMemOff buf_bottom_b = clState.get_buffer_mem(bottom_diff_b); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + ClMemOff buf_shape_a = clState.get_buffer_mem(shape_a_.gpu_data()); + ClMemOff buf_shape_b = clState.get_buffer_mem(shape_b_.gpu_data()); + switch (op_) { case MergeCropParameter_MergeOp_STACK: { viennacl::ocl::kernel &oclk_copy_backward = program.get_kernel( CL_KERNEL_SELECT("merge_copy_backward_stack")); viennacl::ocl::enqueue( oclk_copy_backward( - count, spatial_dims, WrapHandle((cl_mem) bottom_diff_a, &ctx), - backward_[0], WrapHandle((cl_mem) bottom_diff_b, &ctx), - backward_[1], WrapHandle((cl_mem) top_diff, &ctx), num, + count, spatial_dims, WrapHandle(buf_bottom_a.memobj, &ctx), + backward_[0], WrapHandle(buf_bottom_b.memobj, &ctx), + backward_[1], WrapHandle(buf_top.memobj, &ctx), num, channels_a, channels_b, - WrapHandle((cl_mem) (shape_a_.gpu_data()), &ctx), - WrapHandle((cl_mem) (shape_b_.gpu_data()), &ctx)), + WrapHandle(buf_shape_a.memobj, 
&ctx), + WrapHandle(buf_shape_b.memobj, &ctx)), ctx.get_queue()); } break; @@ -355,16 +368,15 @@ void MergeCropLayer::Backward_gpu(const vector*>& top, CL_KERNEL_SELECT("merge_copy_backward_add")); viennacl::ocl::enqueue( oclk_copy_backward( - count, spatial_dims, WrapHandle((cl_mem) bottom_diff_a, &ctx), - backward_[0], WrapHandle((cl_mem) bottom_diff_b, &ctx), - backward_[1], WrapHandle((cl_mem) top_diff, &ctx), num, - channels_a, WrapHandle((cl_mem) (shape_a_.gpu_data()), &ctx), - WrapHandle((cl_mem) (shape_b_.gpu_data()), &ctx)), + count, spatial_dims, WrapHandle(buf_bottom_a.memobj, &ctx), + backward_[0], WrapHandle(buf_bottom_b.memobj, &ctx), + backward_[1], WrapHandle(buf_top.memobj, &ctx), num, + channels_a, WrapHandle(buf_shape_a.memobj, &ctx), + WrapHandle(buf_shape_b.memobj, &ctx)), ctx.get_queue()); } break; } - ctx.get_queue().finish(); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/mvn_layer.cu b/src/caffe/layers/mvn_layer.cu index 5264c912c8a..517c7793446 100644 --- a/src/caffe/layers/mvn_layer.cu +++ b/src/caffe/layers/mvn_layer.cu @@ -18,89 +18,36 @@ void MVNLayer::Forward_gpu(const vector*>& bottom, int_tp dim = bottom[0]->count() / num; - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - // subtract mean - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, + // subtract mean + caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, + sum_multiplier_.gpu_data(), 0., + mean_.mutable_gpu_data()); // EX + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + // X-EX + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + + if (this->layer_param_.mvn_param().normalize_variance()) { + // compute variance using var(X) = E((X-EX)^2) + caffe_gpu_powx(bottom[0]->count(), top_data, Dtype(2), + temp_.mutable_gpu_data()); // (X-EX)^2 + caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(), sum_multiplier_.gpu_data(), 0., - mean_.mutable_gpu_data()); // EX - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - // X-EX - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - - if (this->layer_param_.mvn_param().normalize_variance()) { - // compute variance using var(X) = E((X-EX)^2) - caffe_gpu_powx(bottom[0]->count(), top_data, Dtype(2), - temp_.mutable_gpu_data()); // (X-EX)^2 - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(), - sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E((X-EX)^2) - - // normalize variance - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - caffe_gpu_add_scalar(variance_.count(), eps_, - variance_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), - 0., temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - // subtract mean - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, num, dim, - 1. 
/ dim, (cl_mem) (bottom_data), 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (mean_.mutable_gpu_data()), 0); // EX - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - num, dim, 1, -1., (cl_mem) (mean_.gpu_data()), 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (temp_.mutable_gpu_data()), 0); - greentea_gpu_add(this->device_->id(), temp_.count(), - (cl_mem) (bottom_data), 0, - (cl_mem) (temp_.gpu_data()), 0, (cl_mem) (top_data), - 0); // X-EX - - if (this->layer_param_.mvn_param().normalize_variance()) { - // compute variance using var(X) = E((X-EX)^2) - // (X-EX)^2 - greentea_gpu_powx(this->device_->id(), bottom[0]->count(), - (cl_mem) (top_data), 0, Dtype(2), - (cl_mem) (temp_.mutable_gpu_data()), 0); - // E((X-EX)^2) - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, num, dim, - 1. / dim, (cl_mem) (temp_.gpu_data()), 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (variance_.mutable_gpu_data()), 0); - - // normalize variance - greentea_gpu_powx(this->device_->id(), variance_.count(), - (cl_mem) (variance_.gpu_data()), 0, Dtype(0.5), - (cl_mem) (variance_.mutable_gpu_data()), 0); - - greentea_gpu_add_scalar(this->device_->id(), variance_.count(), - eps_, - (cl_mem) (variance_.mutable_gpu_data()), - 0); - - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, CblasNoTrans, - num, dim, 1, 1., (cl_mem) (variance_.gpu_data()), - 0, (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (temp_.mutable_gpu_data()), 0); - - greentea_gpu_div(this->device_->id(), temp_.count(), - (cl_mem) (top_data), 0, - (cl_mem) (temp_.gpu_data()), 0, - (cl_mem) (top_data), 0); - } -#endif // USE_GREENTEA + variance_.mutable_gpu_data()); // E((X-EX)^2) + + // normalize variance + caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + variance_.mutable_gpu_data()); + + caffe_gpu_add_scalar(variance_.count(), eps_, + variance_.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + variance_.gpu_data(), sum_multiplier_.gpu_data(), + 0., temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); } } @@ -121,111 +68,44 @@ void MVNLayer::Backward_gpu(const vector*>& top, int_tp dim = bottom[0]->count() / num; - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.gpu_data(), 0., - mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - bottom_diff); - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.gpu_data(), 0., - mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., - bottom_diff); - - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), - 0., temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); - } else { - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, top_diff, - sum_multiplier_.gpu_data(), 0., - mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), - bottom_diff); - } -#endif // USE_CUDA + if (this->layer_param_.mvn_param().normalize_variance()) { + caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_gpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, + sum_multiplier_.gpu_data(), 0., + mean_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + bottom_diff); + caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + + caffe_gpu_gemv(CblasNoTrans, num, dim, 1., top_diff, + sum_multiplier_.gpu_data(), 0., + mean_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., + bottom_diff); + + caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim), + bottom_diff); + + // put the squares of bottom into temp_ + caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., + variance_.gpu_data(), sum_multiplier_.gpu_data(), + 0., temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); } else { -#ifdef USE_GREENTEA - if (this->layer_param_.mvn_param().normalize_variance()) { - greentea_gpu_mul(this->device_->id(), temp_.count(), - (cl_mem) top_data, 0, (cl_mem) top_diff, 0, - (cl_mem) bottom_diff, 0); - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, num, - dim, 1., (cl_mem) bottom_diff, 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (mean_.mutable_gpu_data()), 0); - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - CblasNoTrans, num, dim, 1, 1., - (cl_mem) (mean_.gpu_data()), 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) bottom_diff, 0); - greentea_gpu_mul(this->device_->id(), temp_.count(), - (cl_mem) top_data, 0, (cl_mem) bottom_diff, 0, - (cl_mem) bottom_diff, 0); - - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, num, - dim, 1., (cl_mem) top_diff, 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (mean_.mutable_gpu_data()), 0); - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - CblasNoTrans, num, dim, 1, 1., - (cl_mem) (mean_.gpu_data()), 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 1., - (cl_mem) bottom_diff, 0); - - greentea_gpu_axpby(this->device_->id(), temp_.count(), - Dtype(1), (cl_mem) top_diff, 0, - Dtype(-1. / dim), (cl_mem) bottom_diff, 0); - - // put the squares of bottom into temp_ - greentea_gpu_powx(this->device_->id(), temp_.count(), - (cl_mem) bottom_data, 0, Dtype(2), - (cl_mem) (temp_.mutable_gpu_data()), 0); - - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - CblasNoTrans, num, dim, 1, 1., - (cl_mem) (variance_.gpu_data()), 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (temp_.mutable_gpu_data()), 0); - - greentea_gpu_div(this->device_->id(), temp_.count(), - (cl_mem) bottom_diff, 0, - (cl_mem) (temp_.gpu_data()), 0, - (cl_mem) bottom_diff, 0); - } else { - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, num, - dim, 1. 
/ dim, (cl_mem) top_diff, 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (mean_.mutable_gpu_data()), 0); - greentea_gpu_gemm(this->device_->id(), CblasNoTrans, - CblasNoTrans, num, dim, 1, -1., - (cl_mem) (mean_.gpu_data()), 0, - (cl_mem) (sum_multiplier_.gpu_data()), 0, 0., - (cl_mem) (temp_.mutable_gpu_data()), 0); - greentea_gpu_add(this->device_->id(), temp_.count(), - (cl_mem) top_diff, 0, (cl_mem) (temp_.gpu_data()), - 0, (cl_mem) (bottom_diff), 0); - } -#endif // USE_GREENTEA + caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, top_diff, + sum_multiplier_.gpu_data(), 0., + mean_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., + temp_.mutable_gpu_data()); + caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), + bottom_diff); } } diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu index bc8bf677871..b2eb542ee3b 100644 --- a/src/caffe/layers/pooling_layer.cu +++ b/src/caffe/layers/pooling_layer.cu @@ -7,7 +7,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif // USE_GREENTEA namespace caffe { @@ -914,65 +913,86 @@ void PoolingLayer::Forward_gpu(const vector*>& bottom, } viennacl::ocl::kernel &oclk_max_pool_forward = program.get_kernel( CL_KERNEL_SELECT("max_pool_forward_sk")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_mask = clState.get_buffer_mem(mask); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top_mask = clState.get_buffer_mem(top_mask); + ClMemOff buf_top_data = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( oclk_max_pool_forward(count, - WrapHandle((cl_mem) bottom_data, &ctx), + WrapHandle(buf_bottom.memobj, &ctx), bottom[0]->shape(0), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, ext_kernel_h, ext_kernel_w, stride_h_, stride_w_, dilation_h_, dilation_w_, pad_h_, pad_w_, - WrapHandle((cl_mem) top_data, &ctx), + WrapHandle(buf_top_data.memobj, &ctx), mask == NULL ? 0 : 1, - WrapHandle((cl_mem) mask, &ctx), - WrapHandle((cl_mem) top_mask, &ctx)), + WrapHandle(buf_mask.memobj, &ctx), + WrapHandle(buf_top_mask.memobj, &ctx)), ctx.get_queue()); } break; case PoolingParameter_PoolMethod_AVE: { viennacl::ocl::kernel &oclk_ave_pool_forward = program.get_kernel( CL_KERNEL_SELECT("ave_pool_forward_sk")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( oclk_ave_pool_forward(count, - WrapHandle((cl_mem) bottom_data, &ctx), + WrapHandle(buf_bottom.memobj, &ctx), bottom[0]->shape(0), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, ext_kernel_h, ext_kernel_w, stride_h_, stride_w_, dilation_h_, dilation_w_, - pad_h_, pad_w_, WrapHandle((cl_mem)top_data, &ctx)), + pad_h_, pad_w_, WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); } break; case PoolingParameter_PoolMethod_STOCHASTIC: { if (this->phase_ == caffe::TRAIN) { // We need to create the random index as well. 
-        greentea_gpu_rng_uniform(this->device_->id(), count,
-                                 Dtype(0), Dtype(1),
-                                 (cl_mem)(rand_idx_.mutable_gpu_data()), 0);
+        caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
+                              rand_idx_.mutable_gpu_data());
         viennacl::ocl::kernel &oclk_sto_pool_forward = program.get_kernel(
             CL_KERNEL_SELECT("sto_pool_forward_train_sk"));
+
+        ClState& clState = Caffe::cl_state();
+        ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_data);
+        ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_data);
+        ClMemOff<Dtype> buf_rand =
+            clState.get_buffer_mem(rand_idx_.mutable_gpu_data());
+
         viennacl::ocl::enqueue(
             oclk_sto_pool_forward(count,
-                WrapHandle((cl_mem)bottom_data, &ctx),
+                WrapHandle(buf_bottom.memobj, &ctx),
                 bottom[0]->shape(0), channels_, height_, width_,
                 pooled_height_, pooled_width_, kernel_h_, kernel_w_,
                 ext_kernel_h, ext_kernel_w, stride_h_, stride_w_,
                 dilation_h_, dilation_w_,
-                WrapHandle((cl_mem)(rand_idx_.mutable_gpu_data()), &ctx),
-                WrapHandle((cl_mem)(top_data), &ctx)),
+                WrapHandle(buf_rand.memobj, &ctx),
+                WrapHandle(buf_top.memobj, &ctx)),
             ctx.get_queue());
       } else {
         viennacl::ocl::kernel &oclk_sto_pool_forward = program.get_kernel(
             CL_KERNEL_SELECT("sto_pool_forward_test_sk"));
+        ClState& clState = Caffe::cl_state();
+        ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_data);
+        ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_data);
+
         viennacl::ocl::enqueue(
             oclk_sto_pool_forward(count,
-                WrapHandle((cl_mem)bottom_data, &ctx),
+                WrapHandle(buf_bottom.memobj, &ctx),
                 bottom[0]->shape(0), channels_, height_, width_,
                 pooled_height_, pooled_width_, kernel_h_, kernel_w_,
                 ext_kernel_h, ext_kernel_w, stride_h_, stride_w_,
                 dilation_h_, dilation_w_,
-                WrapHandle((cl_mem)top_data, &ctx)),
+                WrapHandle(buf_top.memobj, &ctx)),
             ctx.get_queue());
       }
     }
@@ -992,61 +1012,80 @@ void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       }
       viennacl::ocl::kernel &oclk_max_pool_forward = program.get_kernel(
           CL_KERNEL_SELECT("max_pool_forward"));
+      ClState& clState = Caffe::cl_state();
+      ClMemOff<Dtype> buf_bottom = clState.get_buffer_mem(bottom_data);
+      ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_data);
+      ClMemOff<int_tp> buf_mask = clState.get_buffer_mem(mask);
+      ClMemOff<Dtype> buf_top_mask = clState.get_buffer_mem(top_mask);
+
      viennacl::ocl::enqueue(
          oclk_max_pool_forward(count,
-              WrapHandle((cl_mem) bottom_data, &ctx),
+              WrapHandle(buf_bottom.memobj, &ctx),
              bottom[0]->shape(0), channels_,
              height_, width_, pooled_height_, pooled_width_, kernel_h_,
              kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
-              WrapHandle((cl_mem) top_data, &ctx),
+              WrapHandle(buf_top.memobj, &ctx),
              mask == NULL ? 
0 : 1, - WrapHandle((cl_mem) mask, &ctx), - WrapHandle((cl_mem) top_mask, &ctx)), + WrapHandle(buf_mask.memobj, &ctx), + WrapHandle(buf_top_mask.memobj, &ctx)), ctx.get_queue()); } break; case PoolingParameter_PoolMethod_AVE: { viennacl::ocl::kernel &oclk_ave_pool_forward = program.get_kernel( CL_KERNEL_SELECT("ave_pool_forward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( oclk_ave_pool_forward(count, - WrapHandle((cl_mem) bottom_data, &ctx), + WrapHandle(buf_bottom.memobj, &ctx), bottom[0]->shape(0), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - WrapHandle((cl_mem)top_data, &ctx)), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); } break; case PoolingParameter_PoolMethod_STOCHASTIC: { if (this->phase_ == caffe::TRAIN) { // We need to create the random index as well. - greentea_gpu_rng_uniform(this->device_->id(), count, - Dtype(0), Dtype(1), - (cl_mem)(rand_idx_.mutable_gpu_data()), 0); + caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), + rand_idx_.mutable_gpu_data()); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + ClMemOff buf_rand = + clState.get_buffer_mem(rand_idx_.mutable_gpu_data()); viennacl::ocl::kernel &oclk_sto_pool_forward = program.get_kernel( CL_KERNEL_SELECT("sto_pool_forward_train")); viennacl::ocl::enqueue( oclk_sto_pool_forward(count, - WrapHandle((cl_mem)bottom_data, &ctx), + WrapHandle(buf_bottom.memobj, &ctx), bottom[0]->shape(0), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - WrapHandle((cl_mem)(rand_idx_.mutable_gpu_data()), &ctx), - WrapHandle((cl_mem)top_data, &ctx)), + WrapHandle(buf_rand.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); } else { viennacl::ocl::kernel &oclk_sto_pool_forward = program.get_kernel( CL_KERNEL_SELECT("sto_pool_forward_test")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( oclk_sto_pool_forward(count, - WrapHandle((cl_mem)bottom_data, &ctx), + WrapHandle(buf_bottom.memobj, &ctx), bottom[0]->shape(0), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, - stride_h_, stride_w_, WrapHandle((cl_mem)top_data, &ctx)), + stride_h_, stride_w_, WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); } } @@ -1066,21 +1105,38 @@ void PoolingLayer::Forward_gpu(const vector*>& bottom, } viennacl::ocl::kernel &oclk_max_pool_forward = program.get_kernel( CL_KERNEL_SELECT("max_pool_forward_nd")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_size = clState.get_buffer_mem(size_.gpu_data()); + ClMemOff buf_pooled = + clState.get_buffer_mem(pooled_size_.gpu_data()); + ClMemOff buf_kernel = + clState.get_buffer_mem(kernel_shape_.gpu_data()); + ClMemOff buf_ext_kernel = + clState.get_buffer_mem(ext_kernel_shape_.gpu_data()); + ClMemOff buf_stride = clState.get_buffer_mem(stride_.gpu_data()); + ClMemOff buf_dilation = + clState.get_buffer_mem(dilation_.gpu_data()); + ClMemOff buf_pad = clState.get_buffer_mem(pad_.gpu_data()); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + ClMemOff buf_mask = 
clState.get_buffer_mem(mask); + ClMemOff buf_top_mask = clState.get_buffer_mem(top_mask); + viennacl::ocl::enqueue( oclk_max_pool_forward(count, num_spatial_axes_, - WrapHandle((cl_mem)bottom_data, &ctx), + WrapHandle(buf_bottom.memobj, &ctx), channels_, - WrapHandle((cl_mem)(size_.gpu_data()), &ctx), - WrapHandle((cl_mem)(pooled_size_.gpu_data()), &ctx), - WrapHandle((cl_mem)(kernel_shape_.gpu_data()), &ctx), - WrapHandle((cl_mem)(ext_kernel_shape_.gpu_data()), &ctx), - WrapHandle((cl_mem)(stride_.gpu_data()), &ctx), - WrapHandle((cl_mem)(dilation_.gpu_data()), &ctx), - WrapHandle((cl_mem)(pad_.gpu_data()), &ctx), - WrapHandle((cl_mem)top_data, &ctx), + WrapHandle(buf_size.memobj, &ctx), + WrapHandle(buf_pooled.memobj, &ctx), + WrapHandle(buf_kernel.memobj, &ctx), + WrapHandle(buf_ext_kernel.memobj, &ctx), + WrapHandle(buf_stride.memobj, &ctx), + WrapHandle(buf_dilation.memobj, &ctx), + WrapHandle(buf_pad.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx), mask == NULL ? 0 : 1, - WrapHandle((cl_mem)mask, &ctx), - WrapHandle((cl_mem)top_mask, &ctx)), + WrapHandle(buf_mask.memobj, &ctx), + WrapHandle(buf_top_mask.memobj, &ctx)), ctx.get_queue()); } break; @@ -1218,8 +1274,7 @@ void PoolingLayer::Backward_gpu(const vector*>& top, this->device_->id()); viennacl::ocl::program &program = this->device_->program(); - greentea_gpu_set(this->device_->id(), count, Dtype(0.), - (cl_mem) bottom_diff, 0); + caffe_gpu_set(count, Dtype(0.), bottom_diff); if (num_spatial_axes_ == 2) { int_tp kernel_h_ = kernel_shape_.cpu_data()[0]; @@ -1248,18 +1303,25 @@ void PoolingLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_max_pool_backward = program.get_kernel( CL_KERNEL_SELECT("max_pool_backward_sk")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + ClMemOff buf_mask = clState.get_buffer_mem(mask); + ClMemOff buf_top_mask = clState.get_buffer_mem(top_mask); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + viennacl::ocl::enqueue( oclk_max_pool_backward(count, - WrapHandle((cl_mem) top_diff, &ctx), + WrapHandle(buf_top.memobj, &ctx), mask == NULL ? 0 : 1, - WrapHandle((cl_mem) mask, &ctx), - WrapHandle((cl_mem) top_mask, &ctx), + WrapHandle(buf_mask.memobj, &ctx), + WrapHandle(buf_top_mask.memobj, &ctx), top[0]->shape(0), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, ext_kernel_h, ext_kernel_w, stride_h_, stride_w_, dilation_h_, dilation_w_, pad_h_, pad_w_, - WrapHandle((cl_mem) bottom_diff, &ctx)), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); } break; @@ -1278,17 +1340,23 @@ void PoolingLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_max_pool_backward = program.get_kernel( CL_KERNEL_SELECT("max_pool_backward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + ClMemOff buf_mask = clState.get_buffer_mem(mask); + ClMemOff buf_top_mask = clState.get_buffer_mem(top_mask); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + viennacl::ocl::enqueue( oclk_max_pool_backward(count, - WrapHandle((cl_mem) top_diff, &ctx), + WrapHandle(buf_top.memobj, &ctx), mask == NULL ? 
0 : 1, - WrapHandle((cl_mem) mask, &ctx), - WrapHandle((cl_mem) top_mask, &ctx), + WrapHandle(buf_mask.memobj, &ctx), + WrapHandle(buf_top_mask.memobj, &ctx), top[0]->shape(0), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - WrapHandle((cl_mem) bottom_diff, &ctx)), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); } break; @@ -1296,14 +1364,18 @@ void PoolingLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_ave_pool_backward = program.get_kernel( CL_KERNEL_SELECT("ave_pool_backward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + viennacl::ocl::enqueue( oclk_ave_pool_backward(count, - WrapHandle((cl_mem) top_diff, &ctx), + WrapHandle(buf_top.memobj, &ctx), top[0]->shape(0), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - WrapHandle((cl_mem) bottom_diff, &ctx)), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); } break; @@ -1311,14 +1383,20 @@ void PoolingLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_sto_pool_backward = program.get_kernel( CL_KERNEL_SELECT("sto_pool_backward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_rand = + clState.get_buffer_mem(rand_idx_.gpu_data()); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + viennacl::ocl::enqueue( oclk_sto_pool_backward( - count, WrapHandle((cl_mem) (rand_idx_.gpu_data()), &ctx), - WrapHandle((cl_mem) top_diff, &ctx), top[0]->shape(0), + count, WrapHandle(buf_rand.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx), top[0]->shape(0), channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - WrapHandle((cl_mem) bottom_diff, &ctx)), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); } break; @@ -1337,20 +1415,38 @@ void PoolingLayer::Backward_gpu(const vector*>& top, } viennacl::ocl::kernel &oclk_max_pool_backward = program.get_kernel( CL_KERNEL_SELECT("max_pool_backward_nd")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + ClMemOff buf_mask = clState.get_buffer_mem(mask); + ClMemOff buf_top_mask = clState.get_buffer_mem(top_mask); + ClMemOff buf_size = clState.get_buffer_mem(size_.gpu_data()); + ClMemOff buf_pooled = + clState.get_buffer_mem(pooled_size_.gpu_data()); + ClMemOff buf_kernel = + clState.get_buffer_mem(kernel_shape_.gpu_data()); + ClMemOff buf_ext_kernel = + clState.get_buffer_mem(ext_kernel_shape_.gpu_data()); + ClMemOff buf_stride = + clState.get_buffer_mem(stride_.gpu_data()); + ClMemOff buf_dilation = + clState.get_buffer_mem(dilation_.gpu_data()); + ClMemOff buf_pad = clState.get_buffer_mem(pad_.gpu_data()); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + viennacl::ocl::enqueue( oclk_max_pool_backward( count, num_spatial_axes_, - WrapHandle((cl_mem) top_diff, &ctx), - mask == NULL ? 
0 : 1, WrapHandle((cl_mem) mask, &ctx), - WrapHandle((cl_mem) top_mask, &ctx), channels_, - WrapHandle((cl_mem) (size_.gpu_data()), &ctx), - WrapHandle((cl_mem) (pooled_size_.gpu_data()), &ctx), - WrapHandle((cl_mem) (kernel_shape_.gpu_data()), &ctx), - WrapHandle((cl_mem) (ext_kernel_shape_.gpu_data()), &ctx), - WrapHandle((cl_mem) (stride_.gpu_data()), &ctx), - WrapHandle((cl_mem) (dilation_.gpu_data()), &ctx), - WrapHandle((cl_mem) (pad_.gpu_data()), &ctx), - WrapHandle((cl_mem) bottom_diff, &ctx)), + WrapHandle(buf_top.memobj, &ctx), + mask == NULL ? 0 : 1, WrapHandle(buf_mask.memobj, &ctx), + WrapHandle(buf_top_mask.memobj, &ctx), channels_, + WrapHandle(buf_size.memobj, &ctx), + WrapHandle(buf_pooled.memobj, &ctx), + WrapHandle(buf_kernel.memobj, &ctx), + WrapHandle(buf_ext_kernel.memobj, &ctx), + WrapHandle(buf_stride.memobj, &ctx), + WrapHandle(buf_dilation.memobj, &ctx), + WrapHandle(buf_pad.memobj, &ctx), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); } break; diff --git a/src/caffe/layers/power_layer.cu b/src/caffe/layers/power_layer.cu index 73396ac8096..26ca2ae8063 100644 --- a/src/caffe/layers/power_layer.cu +++ b/src/caffe/layers/power_layer.cu @@ -5,7 +5,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -16,54 +15,21 @@ void PowerLayer::Forward_gpu(const vector*>& bottom, Dtype* top_data = top[0]->mutable_gpu_data(); const int_tp count = bottom[0]->count(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - caffe_gpu_set(count, value, top_data); - return; - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); - greentea_gpu_set(this->device_->id(), count, value, - (cl_mem) top_data, 0); - return; - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - greentea_copy(count, (cl_mem) bottom_data, 0, (cl_mem) top_data, 0, - &ctx); - if (scale_ != Dtype(1)) { - greentea_gpu_scal(this->device_->id(), count, scale_, - (cl_mem) top_data, 0); - } - if (shift_ != Dtype(0)) { - greentea_gpu_add_scalar(this->device_->id(), count, shift_, - (cl_mem) top_data, 0); - } - if (power_ != Dtype(1)) { - greentea_gpu_powx(this->device_->id(), count, - (cl_mem) top_data, 0, power_, (cl_mem) top_data, - 0); - } -#endif // USE_GREENTEA + if (diff_scale_ == Dtype(0)) { + Dtype value = (power_ == 0) ? 
Dtype(1) : pow(shift_, power_); + caffe_gpu_set(count, value, top_data); + return; + } + const Dtype* bottom_data = bottom[0]->gpu_data(); + caffe_copy(count, bottom_data, top_data); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, top_data); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, top_data); + } + if (power_ != Dtype(1)) { + caffe_gpu_powx(count, top_data, power_, top_data); } } @@ -76,109 +42,45 @@ void PowerLayer::Backward_gpu(const vector*>& top, const int_tp count = bottom[0]->count(); const Dtype* top_diff = top[0]->gpu_diff(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_gpu_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), - bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); -#endif // USE_CUDA + if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { + caffe_gpu_set(count, diff_scale_, bottom_diff); } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - greentea_gpu_set(this->device_->id(), count, diff_scale_, - (cl_mem) bottom_diff, 0); + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) + // = diff_scale * y / (shift + scale * x) + if (power_ == Dtype(2)) { + // Special case for y = (shift + scale * x)^2 + // -> dy/dx = 2 * scale * (shift + scale * x) + // = diff_scale * shift + diff_scale * scale * x + caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), + bottom_diff); + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); + } + } else if (shift_ == Dtype(0)) { + // Special case for y = (scale * x)^power + // -> dy/dx = scale * power * (scale * x)^(power - 1) + // = scale * power * (scale * x)^power * (scale * x)^(-1) + // = power * y / x + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + caffe_gpu_scal(count, power_, bottom_diff); } else { - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = 
diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - greentea_gpu_axpby(this->device_->id(), count, - diff_scale_ * scale_, (cl_mem) bottom_data, 0, - Dtype(0), (cl_mem) bottom_diff, 0); - if (shift_ != Dtype(0)) { - greentea_gpu_add_scalar(this->device_->id(), count, - diff_scale_ * shift_, (cl_mem) bottom_diff, - 0); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - greentea_gpu_div(this->device_->id(), count, - (cl_mem) top_data, 0, (cl_mem) bottom_data, 0, - (cl_mem) bottom_diff, 0); - greentea_gpu_scal(this->device_->id(), count, power_, - (cl_mem) bottom_diff, 0); - } else { - greentea_copy(count, (cl_mem) bottom_data, 0, - (cl_mem) bottom_diff, 0, &ctx); - if (scale_ != Dtype(1)) { - greentea_gpu_scal(this->device_->id(), count, scale_, - (cl_mem) bottom_diff, 0); - } - if (shift_ != Dtype(0)) { - greentea_gpu_add_scalar(this->device_->id(), count, shift_, - (cl_mem) bottom_diff, 0); - } - const Dtype* top_data = top[0]->gpu_data(); - greentea_gpu_div(this->device_->id(), count, - (cl_mem) top_data, 0, (cl_mem) bottom_diff, 0, - (cl_mem) bottom_diff, 0); - if (diff_scale_ != Dtype(1)) { - greentea_gpu_scal(this->device_->id(), count, diff_scale_, - (cl_mem) bottom_diff, 0); - } + caffe_copy(count, bottom_data, bottom_diff); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, bottom_diff); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, bottom_diff); + } + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); + if (diff_scale_ != Dtype(1)) { + caffe_gpu_scal(count, diff_scale_, bottom_diff); } } - greentea_gpu_mul(this->device_->id(), count, - (cl_mem) top_diff, 0, (cl_mem) bottom_diff, 0, - (cl_mem) bottom_diff, 0); -#endif // USE_GREENTEA } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); } } diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/prelu_layer.cu index 2ddec5574bd..e91fd74e274 100644 --- a/src/caffe/layers/prelu_layer.cu +++ b/src/caffe/layers/prelu_layer.cu @@ -6,7 +6,7 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/math_functions.hpp" #endif namespace caffe { @@ -79,24 +79,28 @@ void PReLULayer::Forward_gpu(const vector*>& bottom, #endif // USE_CUDA } else { #ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); - - if (top[0] == bottom[0]) { - greentea_copy(count, (cl_mem) bottom_data, 0, - (cl_mem) (bottom_memory_.mutable_gpu_data()), 0, - &ctx); - } - - viennacl::ocl::kernel &oclk_prelu = program.get_kernel( - CL_KERNEL_SELECT("prelu_forward")); - viennacl::ocl::enqueue( - oclk_prelu(count, channels, dim, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx), - WrapHandle((cl_mem) slope_data, &ctx), div_factor), - ctx.get_queue()); + // For in-place computation + if (top[0] == bottom[0]) { + caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); + } + // NOLINT_NEXT_LINE(whitespace/operators) + viennacl::ocl::context &ctx = viennacl::ocl::get_context( + 
this->device_->id()); + viennacl::ocl::program &program = this->device_->program(); + viennacl::ocl::kernel &oclk_prelu = program.get_kernel( + CL_KERNEL_SELECT("prelu_forward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_slope = clState.get_buffer_mem(slope_data); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + + viennacl::ocl::enqueue( + oclk_prelu(count, channels, dim, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_slope.memobj, &ctx), div_factor), + ctx.get_queue()); #endif // USE_GREENTEA } } @@ -176,26 +180,28 @@ void PReLULayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_prelu = program.get_kernel( CL_KERNEL_SELECT("prelu_param_backward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_backward_buff = + clState.get_buffer_mem(backward_buff_.mutable_gpu_diff()); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + viennacl::ocl::enqueue( oclk_prelu(cdim, bottom[0]->num(), top[0]->offset(1), - WrapHandle((cl_mem)top_diff, &ctx), - WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) (backward_buff_.mutable_gpu_diff()), &ctx)), + WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_backward_buff.memobj, &ctx)), ctx.get_queue()); if (channel_shared_) { Dtype dsum; - greentea_gpu_dot(this->device_->id(), channels * dim, - (cl_mem) (backward_buff_.gpu_diff()), 0, - (cl_mem) (multiplier_.gpu_data()), 0, &dsum); - greentea_gpu_add_scalar(this->device_->id(), - this->blobs_[0]->count(), Dtype(dsum), - (cl_mem) slope_diff, 0); + caffe_gpu_dot(channels * dim, backward_buff_.gpu_diff(), + multiplier_.gpu_data(), &dsum); + caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); } else { - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, channels, - dim, 1., (cl_mem) (backward_buff_.gpu_diff()), - 0, (cl_mem) (multiplier_.gpu_data()), 0, 1., - (cl_mem) slope_diff, 0); + caffe_gpu_gemv(CblasNoTrans, channels, dim, 1., + backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., + slope_diff); } } // Propagate to bottom @@ -205,11 +211,18 @@ void PReLULayer::Backward_gpu(const vector*>& top, int_tp div_factor = channel_shared_ ? 
channels : 1;
     viennacl::ocl::kernel &oclk_prelu = program.get_kernel(
         CL_KERNEL_SELECT("prelu_backward"));
+
+    ClState& clState = Caffe::cl_state();
+    ClMemOff<Dtype> buf_slope = clState.get_buffer_mem(slope_data);
+    ClMemOff<Dtype> buf_bottom_data = clState.get_buffer_mem(bottom_data);
+    ClMemOff<Dtype> buf_bottom_diff = clState.get_buffer_mem(bottom_diff);
+    ClMemOff<Dtype> buf_top = clState.get_buffer_mem(top_diff);
+
     viennacl::ocl::enqueue(
-        oclk_prelu(count, channels, dim, WrapHandle((cl_mem) top_diff, &ctx),
-                   WrapHandle((cl_mem) bottom_data, &ctx),
-                   WrapHandle((cl_mem) bottom_diff, &ctx),
-                   WrapHandle((cl_mem) slope_data, &ctx), div_factor),
+        oclk_prelu(count, channels, dim, WrapHandle(buf_top.memobj, &ctx),
+                   WrapHandle(buf_bottom_data.memobj, &ctx),
+                   WrapHandle(buf_bottom_diff.memobj, &ctx),
+                   WrapHandle(buf_slope.memobj, &ctx), div_factor),
         ctx.get_queue());
   }
#endif  // USE_GREENTEA
diff --git a/src/caffe/layers/recurrent_layer.cu b/src/caffe/layers/recurrent_layer.cu
index e0fb0219a55..4dd2b0e2165 100644
--- a/src/caffe/layers/recurrent_layer.cu
+++ b/src/caffe/layers/recurrent_layer.cu
@@ -7,10 +7,6 @@
 #include "caffe/layers/recurrent_layer.hpp"
 #include "caffe/util/math_functions.hpp"
 
-#ifdef USE_GREENTEA
-#include "caffe/greentea/greentea_math_functions.hpp"
-#endif  // USE_GREENTEA
-
 namespace caffe {
 
 template <typename Dtype>
@@ -29,18 +25,7 @@ void RecurrentLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       DCHECK_EQ(count, recur_output_blobs_[i]->count());
       const Dtype* timestep_T_data = recur_output_blobs_[i]->gpu_data();
       Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data();
-      if (this->device_->backend() == BACKEND_CUDA) {
-#ifdef USE_CUDA
-        caffe_copy(count, timestep_T_data, timestep_0_data);
-#endif  // USE_CUDA
-      } else {
-#ifdef USE_GREENTEA
-        viennacl::ocl::context &ctx = viennacl::ocl::get_context(
-            this->device_->id());
-        greentea_copy<Dtype>(count, (cl_mem)timestep_T_data, 0,
-                             (cl_mem)timestep_0_data, 0, &ctx);
-#endif  // USE_GREENTEA
-      }
+      caffe_copy(count, timestep_T_data, timestep_0_data);
     }
   }
 
diff --git a/src/caffe/layers/reduction_layer.cu b/src/caffe/layers/reduction_layer.cu
index d3376ef0aa5..a2c06ca0918 100644
--- a/src/caffe/layers/reduction_layer.cu
+++ b/src/caffe/layers/reduction_layer.cu
@@ -14,79 +14,36 @@ void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   int_tp bottom_data_off = 0;
   int_tp top_data_off = 0;
 
-  if (this->device_->backend() == BACKEND_CUDA) {
-#ifdef USE_CUDA
-    if (sum_multiplier_.count() > 0) {
-      mult_data = sum_multiplier_.gpu_data();
-    }
-    Dtype* top_data = top[0]->mutable_cpu_data();
-    for (int_tp i = 0; i < num_; ++i) {
-      switch (op_) {
-        case ReductionParameter_ReductionOp_SUM:
-        case ReductionParameter_ReductionOp_MEAN:
-          caffe_gpu_dot(dim_, mult_data, bottom_data + bottom_data_off,
-                        top_data + top_data_off);
-          break;
-        case ReductionParameter_ReductionOp_ASUM:
-          caffe_gpu_asum(dim_, bottom_data + bottom_data_off,
-                         top_data + top_data_off);
-          break;
-        case ReductionParameter_ReductionOp_SUMSQ:
-          caffe_gpu_dot(dim_, bottom_data + bottom_data_off,
-                        bottom_data + bottom_data_off, top_data + top_data_off);
-          break;
-        default:
-          LOG(FATAL)<< "Unknown reduction op: "
-              << ReductionParameter_ReductionOp_Name(op_);
-      }
-      bottom_data_off += dim_;
-      ++top_data_off;
-    }
-    if (coeff_ != Dtype(1)) {
-      // Reset the top_data pointer. 
- top_data = top[0]->mutable_gpu_data(); - caffe_gpu_scal(num_, coeff_, top_data); - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.gpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int_tp i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - greentea_gpu_dot(this->device_->id(), dim_, - (cl_mem) mult_data, 0, (cl_mem) bottom_data, - bottom_data_off, top_data + top_data_off); - break; - case ReductionParameter_ReductionOp_ASUM: - greentea_gpu_asum(this->device_->id(), dim_, - (cl_mem) bottom_data, bottom_data_off, - top_data + top_data_off); - break; - case ReductionParameter_ReductionOp_SUMSQ: - greentea_gpu_dot(this->device_->id(), dim_, - (cl_mem) bottom_data, bottom_data_off, - (cl_mem) bottom_data, bottom_data_off, - top_data + top_data_off); - break; - default: - LOG(FATAL)<< "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data_off += dim_; - ++top_data_off; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. - top_data = top[0]->mutable_gpu_data(); - greentea_gpu_scal(this->device_->id(), num_, coeff_, - (cl_mem) top_data, 0); + if (sum_multiplier_.count() > 0) { + mult_data = sum_multiplier_.gpu_data(); + } + Dtype* top_data = top[0]->mutable_cpu_data(); + for (int_tp i = 0; i < num_; ++i) { + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_dot(dim_, mult_data, bottom_data + bottom_data_off, + top_data + top_data_off); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_asum(dim_, bottom_data + bottom_data_off, + top_data + top_data_off); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_dot(dim_, bottom_data + bottom_data_off, + bottom_data + bottom_data_off, top_data + top_data_off); + break; + default: + LOG(FATAL)<< "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); } -#endif // USE_GREENTEA + bottom_data_off += dim_; + ++top_data_off; + } + if (coeff_ != Dtype(1)) { + // Reset the top_data pointer. 
+ top_data = top[0]->mutable_gpu_data(); + caffe_gpu_scal(num_, coeff_, top_data); } } @@ -120,67 +77,29 @@ void ReductionLayer::Backward_gpu(const vector*>& top, int_tp bottom_diff_off = 0; int_tp top_diff_off = 0; - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - for (int_tp i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*(top_diff + top_diff_off)) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_set(dim_, bottom_coeff, bottom_diff + bottom_diff_off); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_sign(dim_, bottom_data + bottom_data_off, - bottom_diff + bottom_diff_off); - caffe_gpu_scal(dim_, bottom_coeff, bottom_diff + bottom_diff_off); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data + bottom_data_off, - bottom_diff + bottom_diff_off); - break; - default: - LOG(FATAL)<< "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data_off += dim_; - bottom_diff_off += dim_; - ++top_diff_off; - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - for (int_tp i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*(top_diff + top_diff_off)) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - greentea_gpu_set(this->device_->id(), dim_, - bottom_coeff, (cl_mem) bottom_diff, - bottom_diff_off); - break; - case ReductionParameter_ReductionOp_ASUM: - greentea_gpu_sign(this->device_->id(), dim_, - (cl_mem) bottom_data, bottom_data_off, - (cl_mem) bottom_diff, bottom_diff_off); - greentea_gpu_scal(this->device_->id(), dim_, - bottom_coeff, (cl_mem) bottom_diff, - bottom_diff_off); - break; - case ReductionParameter_ReductionOp_SUMSQ: - greentea_gpu_scale(this->device_->id(), dim_, - 2 * bottom_coeff, (cl_mem) bottom_data, - bottom_data_off, (cl_mem) bottom_diff, - bottom_diff_off); - break; - default: - LOG(FATAL)<< "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data_off += dim_; - bottom_diff_off += dim_; - ++top_diff_off; + for (int_tp i = 0; i < num_; ++i) { + const Dtype bottom_coeff = (*(top_diff + top_diff_off)) * coeff_; + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_set(dim_, bottom_coeff, bottom_diff + bottom_diff_off); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_sign(dim_, bottom_data + bottom_data_off, + bottom_diff + bottom_diff_off); + caffe_gpu_scal(dim_, bottom_coeff, bottom_diff + bottom_diff_off); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data + bottom_data_off, + bottom_diff + bottom_diff_off); + break; + default: + LOG(FATAL)<< "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); } -#endif // USE_GREENTEA + bottom_data_off += dim_; + bottom_diff_off += dim_; + ++top_diff_off; } } diff --git a/src/caffe/layers/relu_layer.cu b/src/caffe/layers/relu_layer.cu index 4afd35ed6a3..48bb63f0e64 100644 --- a/src/caffe/layers/relu_layer.cu +++ b/src/caffe/layers/relu_layer.cu @@ -5,7 +5,7 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/math_functions.hpp" #endif namespace caffe { @@ -42,9 +42,12 @@ void ReLULayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::program &program = this->device_->program(); 
viennacl::ocl::kernel &oclk_relu_forward = program.get_kernel( CL_KERNEL_SELECT("relu_forward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); viennacl::ocl::enqueue( - oclk_relu_forward(count, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx), negative_slope), + oclk_relu_forward(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx), negative_slope), ctx.get_queue()); ctx.get_queue().finish(); #endif // USE_GREENTEA @@ -93,10 +96,15 @@ void ReLULayer::Backward_gpu(const vector*>& top, viennacl::ocl::program &program = this->device_->program(); viennacl::ocl::kernel &oclk_relu_backward = program.get_kernel( CL_KERNEL_SELECT("relu_backward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + ClMemOff buf_bottomdata = clState.get_buffer_mem(bottom_data); + ClMemOff buf_bottomdiff = clState.get_buffer_mem(bottom_diff); + viennacl::ocl::enqueue( - oclk_relu_backward(count, WrapHandle((cl_mem) top_diff, &ctx), - WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) bottom_diff, &ctx), + oclk_relu_backward(count, WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_bottomdata.memobj, &ctx), + WrapHandle(buf_bottomdiff.memobj, &ctx), negative_slope), ctx.get_queue()); ctx.get_queue().finish(); diff --git a/src/caffe/layers/scale_layer.cu b/src/caffe/layers/scale_layer.cu index 02c10dbf8e6..4872900d53a 100644 --- a/src/caffe/layers/scale_layer.cu +++ b/src/caffe/layers/scale_layer.cu @@ -7,7 +7,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -40,61 +39,44 @@ void ScaleLayer::Forward_gpu(const vector*>& bottom, const int_tp count = top[0]->count(); const Dtype* bottom_data = bottom[0]->gpu_data(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (bottom[0] == top[0]) { - caffe_copy(bottom[0]->count(), bottom[0]->gpu_data(), - temp_.mutable_gpu_data()); - } - const Dtype* scale_data = ( - (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (bias_layer_) { - const Dtype* bias_data = this->blobs_[bias_param_id_]->gpu_data(); - ScaleBiasForward // NOLINT_NEXT_LINE(whitespace/operators) - CUDA_KERNEL(CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS)( - count, bottom_data, scale_data, bias_data, scale_dim_, inner_dim_, - top_data); - } else { - ScaleForward // NOLINT_NEXT_LINE(whitespace/operators) - CUDA_KERNEL(CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS)( - count, bottom_data, scale_data, scale_dim_, inner_dim_, top_data); - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( + if (bottom[0] == top[0]) { + caffe_copy(bottom[0]->count(), bottom[0]->gpu_data(), + temp_.mutable_gpu_data()); + } + const Dtype* scale_data = ( + (bottom.size() > 1) ? 
bottom[1] : this->blobs_[0].get())->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_scale = clState.get_buffer_mem(scale_data); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + + viennacl::ocl::context &ctx = viennacl::ocl::get_context( this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); + viennacl::ocl::program &program = this->device_->program(); - if (bottom[0] == top[0]) { - greentea_copy(bottom[0]->count(), (cl_mem) (bottom[0]->gpu_data()), - 0, (cl_mem) (temp_.mutable_gpu_data()), 0, &ctx); - } - const Dtype* scale_data = ( - (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (bias_layer_) { - const Dtype* bias_data = this->blobs_[bias_param_id_]->gpu_data(); - viennacl::ocl::kernel &oclk_scale_bias_forward = program.get_kernel( - CL_KERNEL_SELECT("scale_bias_forward")); - viennacl::ocl::enqueue( - oclk_scale_bias_forward(count, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) scale_data, &ctx), - WrapHandle((cl_mem) bias_data, &ctx), - scale_dim_, inner_dim_, - WrapHandle((cl_mem) top_data, &ctx)), - ctx.get_queue()); - } else { - viennacl::ocl::kernel &oclk_scale_forward = program.get_kernel( - CL_KERNEL_SELECT("scale_forward")); - viennacl::ocl::enqueue( - oclk_scale_forward(count, WrapHandle((cl_mem)bottom_data, &ctx), - WrapHandle((cl_mem)scale_data, &ctx), scale_dim_, - inner_dim_, WrapHandle((cl_mem)top_data, &ctx)), - ctx.get_queue()); - } -#endif // USE_GREENTEA + if (bias_layer_) { + const Dtype* bias_data = this->blobs_[bias_param_id_]->gpu_data(); + viennacl::ocl::kernel &oclk_scale_bias_forward = program.get_kernel( + CL_KERNEL_SELECT("scale_bias_forward")); + + ClMemOff buf_bias = clState.get_buffer_mem(bias_data); + viennacl::ocl::enqueue( + oclk_scale_bias_forward(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_scale.memobj, &ctx), + WrapHandle(buf_bias.memobj, &ctx), + scale_dim_, inner_dim_, + WrapHandle(buf_top.memobj, &ctx)), + ctx.get_queue()); + } else { + viennacl::ocl::kernel &oclk_scale_forward = program.get_kernel( + CL_KERNEL_SELECT("scale_forward")); + viennacl::ocl::enqueue( + oclk_scale_forward(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_scale.memobj, &ctx), scale_dim_, + inner_dim_, WrapHandle(buf_top.memobj, &ctx)), + ctx.get_queue()); } } @@ -109,161 +91,83 @@ void ScaleLayer::Backward_gpu(const vector*>& top, const bool scale_param = (bottom.size() == 1); Blob* scale = scale_param ? this->blobs_[0].get() : bottom[1]; - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if ((!scale_param && propagate_down[1]) - || (scale_param && this->param_propagate_down_[0])) { - const Dtype* top_diff = top[0]->gpu_diff(); - const bool in_place = (bottom[0] == top[0]); - const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->gpu_data(); - const bool is_eltwise = (bottom[0]->count() == scale->count()); - Dtype* product = ( - is_eltwise ? - scale->mutable_gpu_diff() : - (in_place ? 
- temp_.mutable_gpu_data() : bottom[0]->mutable_gpu_diff())); - caffe_gpu_mul(top[0]->count(), top_diff, bottom_data, product); - if (!is_eltwise) { - Dtype* sum_result = NULL; - if (inner_dim_ == 1) { - sum_result = product; - } else if (sum_result_.count() == 1) { - const Dtype* sum_mult = sum_multiplier_.gpu_data(); - Dtype* scale_diff = scale->mutable_cpu_diff(); - if (scale_param) { - Dtype result; - caffe_gpu_dot(inner_dim_, product, sum_mult, &result); - *scale_diff += result; - } else { - caffe_gpu_dot(inner_dim_, product, sum_mult, scale_diff); - } + viennacl::ocl::context &ctx = viennacl::ocl::get_context( + this->device_->id()); + viennacl::ocl::program &program = this->device_->program(); + + if ((!scale_param && propagate_down[1]) + || (scale_param && this->param_propagate_down_[0])) { + const Dtype* top_diff = top[0]->gpu_diff(); + const bool in_place = (bottom[0] == top[0]); + const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->gpu_data(); + const bool is_eltwise = (bottom[0]->count() == scale->count()); + Dtype* product = ( + is_eltwise ? + scale->mutable_gpu_diff() : + (in_place ? + temp_.mutable_gpu_data() : bottom[0]->mutable_gpu_diff())); + caffe_gpu_mul(top[0]->count(), top_diff, bottom_data, product); + if (!is_eltwise) { + Dtype* sum_result = NULL; + if (inner_dim_ == 1) { + sum_result = product; + } else if (sum_result_.count() == 1) { + const Dtype* sum_mult = sum_multiplier_.gpu_data(); + Dtype* scale_diff = scale->mutable_cpu_diff(); + if (scale_param) { + Dtype result; + caffe_gpu_dot(inner_dim_, product, sum_mult, &result); + *scale_diff += result; } else { - const Dtype* sum_mult = sum_multiplier_.gpu_data(); - sum_result = - (outer_dim_ == 1) ? - scale->mutable_gpu_diff() : sum_result_.mutable_gpu_data(); - caffe_gpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_, - Dtype(1), product, sum_mult, Dtype(0), sum_result); - } - if (outer_dim_ != 1) { - const Dtype* sum_mult = sum_multiplier_.gpu_data(); - if (scale_dim_ == 1) { - Dtype* scale_diff = scale->mutable_cpu_diff(); - if (scale_param) { - Dtype result; - caffe_gpu_dot(outer_dim_, sum_mult, sum_result, &result); - *scale_diff += result; - } else { - caffe_gpu_dot(outer_dim_, sum_mult, sum_result, scale_diff); - } - } else { - Dtype* scale_diff = scale->mutable_gpu_diff(); - caffe_gpu_gemv(CblasTrans, outer_dim_, scale_dim_, Dtype(1), - sum_result, sum_mult, Dtype(scale_param), - scale_diff); - } + caffe_gpu_dot(inner_dim_, product, sum_mult, scale_diff); } + } else { + const Dtype* sum_mult = sum_multiplier_.gpu_data(); + sum_result = + (outer_dim_ == 1) ? 
+ scale->mutable_gpu_diff() : sum_result_.mutable_gpu_data(); + caffe_gpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_, + Dtype(1), product, sum_mult, Dtype(0), sum_result); } - } - if (propagate_down[0]) { - const int_tp count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* scale_data = scale->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - ScaleForward // NOLINT_NEXT_LINE(whitespace/operators) - CUDA_KERNEL(CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS)( - count, top_diff, scale_data, scale_dim_, inner_dim_, bottom_diff); - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); - - if ((!scale_param && propagate_down[1]) - || (scale_param && this->param_propagate_down_[0])) { - const Dtype* top_diff = top[0]->gpu_diff(); - const bool in_place = (bottom[0] == top[0]); - const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->gpu_data(); - const bool is_eltwise = (bottom[0]->count() == scale->count()); - Dtype* product = ( - is_eltwise ? - scale->mutable_gpu_diff() : - (in_place ? - temp_.mutable_gpu_data() : bottom[0]->mutable_gpu_diff())); - greentea_gpu_mul(this->device_->id(), top[0]->count(), - (cl_mem) top_diff, 0, (cl_mem) bottom_data, 0, - (cl_mem) product, 0); - if (!is_eltwise) { - Dtype* sum_result = NULL; - if (inner_dim_ == 1) { - sum_result = product; - } else if (sum_result_.count() == 1) { - const Dtype* sum_mult = sum_multiplier_.gpu_data(); + if (outer_dim_ != 1) { + const Dtype* sum_mult = sum_multiplier_.gpu_data(); + if (scale_dim_ == 1) { Dtype* scale_diff = scale->mutable_cpu_diff(); if (scale_param) { Dtype result; - greentea_gpu_dot(this->device_->id(), inner_dim_, - (cl_mem) product, 0, (cl_mem) sum_mult, 0, - &result); + caffe_gpu_dot(outer_dim_, sum_mult, sum_result, &result); *scale_diff += result; } else { - greentea_gpu_dot(this->device_->id(), inner_dim_, - (cl_mem) product, 0, (cl_mem) sum_mult, 0, - scale_diff); + caffe_gpu_dot(outer_dim_, sum_mult, sum_result, scale_diff); } } else { - const Dtype* sum_mult = sum_multiplier_.gpu_data(); - sum_result = - (outer_dim_ == 1) ? 
- scale->mutable_gpu_diff() : sum_result_.mutable_gpu_data(); - greentea_gpu_gemv(this->device_->id(), CblasNoTrans, - sum_result_.count(), inner_dim_, Dtype(1), - (cl_mem) product, 0, (cl_mem) sum_mult, 0, - Dtype(0), (cl_mem) sum_result, 0); - } - if (outer_dim_ != 1) { - const Dtype* sum_mult = sum_multiplier_.gpu_data(); - if (scale_dim_ == 1) { - Dtype* scale_diff = scale->mutable_cpu_diff(); - if (scale_param) { - Dtype result; - greentea_gpu_dot(this->device_->id(), outer_dim_, - (cl_mem) sum_mult, 0, (cl_mem) sum_result, - 0, &result); - *scale_diff += result; - } else { - greentea_gpu_dot(this->device_->id(), outer_dim_, - (cl_mem) sum_mult, 0, (cl_mem) sum_result, - 0, scale_diff); - } - } else { - Dtype* scale_diff = scale->mutable_gpu_diff(); - greentea_gpu_gemv(this->device_->id(), CblasTrans, - outer_dim_, scale_dim_, Dtype(1), - (cl_mem) sum_result, 0, (cl_mem) sum_mult, - 0, Dtype(scale_param), (cl_mem) scale_diff, - 0); - } + Dtype* scale_diff = scale->mutable_gpu_diff(); + caffe_gpu_gemv(CblasTrans, outer_dim_, scale_dim_, Dtype(1), + sum_result, sum_mult, Dtype(scale_param), scale_diff); } } } - if (propagate_down[0]) { - const int_tp count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* scale_data = scale->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - viennacl::ocl::kernel &oclk_scale_forward = program.get_kernel( - CL_KERNEL_SELECT("scale_forward")); - viennacl::ocl::enqueue( - oclk_scale_forward(count, WrapHandle((cl_mem) top_diff, &ctx), - WrapHandle((cl_mem) scale_data, &ctx), scale_dim_, - inner_dim_, - WrapHandle((cl_mem) bottom_diff, &ctx)), - ctx.get_queue()); - } -#endif // USE_GREENTEA + } + if (propagate_down[0]) { + const int_tp count = top[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* scale_data = scale->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + + viennacl::ocl::kernel &oclk_scale_forward = program.get_kernel( + CL_KERNEL_SELECT("scale_forward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom_diff = clState.get_buffer_mem(bottom_diff); + ClMemOff buf_top_diff = clState.get_buffer_mem(top_diff); + ClMemOff buf_scale = clState.get_buffer_mem(scale_data); + + viennacl::ocl::enqueue( + oclk_scale_forward(count, WrapHandle(buf_top_diff.memobj, &ctx), + WrapHandle(buf_scale.memobj, &ctx), scale_dim_, + inner_dim_, + WrapHandle(buf_bottom_diff.memobj, &ctx)), + ctx.get_queue()); } } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu index 7e33af2081d..1cd62003a2a 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu @@ -5,7 +5,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -25,32 +24,12 @@ void SigmoidCrossEntropyLossLayer::Backward_gpu( const Dtype* target = bottom[1]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - // First, compute the diff - caffe_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - 
// First, compute the diff - greentea_copy(count, (cl_mem)sigmoid_output_data, 0, - (cl_mem)bottom_diff, 0, &ctx); - greentea_gpu_axpy(this->device_->id(), count, - Dtype(-1), (cl_mem)target, 0, - (cl_mem)bottom_diff, 0); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - greentea_gpu_scal(this->device_->id(), count, loss_weight / num, - (cl_mem)bottom_diff, 0); -#endif // USE_GREENTEA - } + // First, compute the diff + caffe_copy(count, sigmoid_output_data, bottom_diff); + caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); + // Scale down gradient + const Dtype loss_weight = top[0]->cpu_diff()[0]; + caffe_gpu_scal(count, loss_weight / num, bottom_diff); } } diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu index 2d54e4f71e4..8e85858da5c 100644 --- a/src/caffe/layers/sigmoid_layer.cu +++ b/src/caffe/layers/sigmoid_layer.cu @@ -37,9 +37,13 @@ void SigmoidLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_sigmoid = program.get_kernel( CL_KERNEL_SELECT("sigmoid_forward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_sigmoid(count, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx)), + oclk_sigmoid(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -85,13 +89,17 @@ void SigmoidLayer::Backward_gpu(const vector*>& top, viennacl::ocl::context &ctx = viennacl::ocl::get_context( this->device_->id()); viennacl::ocl::program &program = this->device_->program(); - viennacl::ocl::kernel &oclk_sigmoid = program.get_kernel( CL_KERNEL_SELECT("sigmoid_backward")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_topdiff = clState.get_buffer_mem(top_diff); + ClMemOff buf_topdata = clState.get_buffer_mem(top_data); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + viennacl::ocl::enqueue( - oclk_sigmoid(count, WrapHandle((cl_mem) top_diff, &ctx), - WrapHandle((cl_mem) top_data, &ctx), - WrapHandle((cl_mem) bottom_diff, &ctx)), + oclk_sigmoid(count, WrapHandle(buf_topdiff.memobj, &ctx), + WrapHandle(buf_topdata.memobj, &ctx), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/silence_layer.cu b/src/caffe/layers/silence_layer.cu index c7b5b3e261d..2815b4cf464 100644 --- a/src/caffe/layers/silence_layer.cu +++ b/src/caffe/layers/silence_layer.cu @@ -5,7 +5,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -22,27 +21,8 @@ void SilenceLayer::Backward_gpu(const vector*>& top, const vector*>& bottom) { for (int_tp i = 0; i < bottom.size(); ++i) { if (propagate_down[i]) { - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_gpu_diff()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); - - viennacl::ocl::kernel &oclk_gpu_set = program.get_kernel( - CL_KERNEL_SELECT("gpu_set")); - viennacl::ocl::enqueue( - oclk_gpu_set( - bottom[i]->count(), Dtype(0), - WrapHandle((cl_mem) bottom[i]->mutable_gpu_diff(), &ctx)), - ctx.get_queue()); - ctx.get_queue().finish(); -#endif - } + 
caffe_gpu_set(bottom[i]->count(), Dtype(0), + bottom[i]->mutable_gpu_diff()); } } } diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/slice_layer.cu index fe4a334ce02..d1d8e3571eb 100644 --- a/src/caffe/layers/slice_layer.cu +++ b/src/caffe/layers/slice_layer.cu @@ -57,11 +57,15 @@ void SliceLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_slice = program.get_kernel( CL_KERNEL_SELECT("slice")); + + cl_mem bottom_mem = Caffe::cl_state().get_buffer_mem(bottom_data).memobj; + cl_mem top_mem = Caffe::cl_state().get_buffer_mem(top_data).memobj; + viennacl::ocl::enqueue( - oclk_slice(nthreads, WrapHandle((cl_mem) bottom_data, &ctx), + oclk_slice(nthreads, WrapHandle(bottom_mem, &ctx), kForward ? 1 : 0, num_slices_, slice_size_, bottom_slice_axis, top_slice_axis, offset_slice_axis, - WrapHandle((cl_mem) top_data, &ctx)), + WrapHandle(top_mem, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -99,11 +103,15 @@ void SliceLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_slice = program.get_kernel( CL_KERNEL_SELECT("slice")); + + cl_mem bottom_mem = Caffe::cl_state().get_buffer_mem(bottom_diff).memobj; + cl_mem top_mem = Caffe::cl_state().get_buffer_mem(top_diff).memobj; + viennacl::ocl::enqueue( - oclk_slice(nthreads, WrapHandle((cl_mem) top_diff, &ctx), + oclk_slice(nthreads, WrapHandle(top_mem, &ctx), kForward ? 1 : 0, num_slices_, slice_size_, bottom_slice_axis, top_slice_axis, offset_slice_axis, - WrapHandle((cl_mem) bottom_diff, &ctx)), + WrapHandle(bottom_mem, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/softmax_layer.cu index 4d701c8fe07..06f5d1d47e0 100644 --- a/src/caffe/layers/softmax_layer.cu +++ b/src/caffe/layers/softmax_layer.cu @@ -10,8 +10,7 @@ #include "caffe/util/math_functions.hpp" #ifdef USE_GREENTEA -#include "caffe/greentea/greentea_im2col.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/im2col.hpp" #endif namespace caffe { @@ -146,49 +145,52 @@ void SoftmaxLayer::Forward_gpu(const vector*>& bottom, this->device_->id()); viennacl::ocl::program &program = this->device_->program(); - greentea_copy(count, (cl_mem) bottom_data, 0, (cl_mem) top_data, 0, - &ctx); + caffe_copy(count, bottom_data, top_data); viennacl::ocl::kernel &oclk_channel_max = program.get_kernel( CL_KERNEL_SELECT("kernel_channel_max")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + ClMemOff buf_scale = clState.get_buffer_mem(scale_data); + viennacl::ocl::enqueue( oclk_channel_max(outer_num_, channels, inner_num_, - WrapHandle((cl_mem) top_data, &ctx), - WrapHandle((cl_mem) scale_data, &ctx)), + WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_scale.memobj, &ctx)), ctx.get_queue()); viennacl::ocl::kernel &oclk_channel_subtract = program.get_kernel( CL_KERNEL_SELECT("kernel_channel_subtract")); viennacl::ocl::enqueue( oclk_channel_subtract(count, outer_num_, channels, inner_num_, - WrapHandle((cl_mem) scale_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx)), + WrapHandle(buf_scale.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); viennacl::ocl::kernel &oclk_exp = program.get_kernel( CL_KERNEL_SELECT("kernel_exp")); viennacl::ocl::enqueue( oclk_exp(count, - WrapHandle((cl_mem) top_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx)), + WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); viennacl::ocl::kernel &oclk_channel_sum = 
program.get_kernel( CL_KERNEL_SELECT("kernel_channel_sum")); viennacl::ocl::enqueue( oclk_channel_sum(outer_num_, channels, inner_num_, - WrapHandle((cl_mem) top_data, &ctx), - WrapHandle((cl_mem) scale_data, &ctx)), + WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_scale.memobj, &ctx)), ctx.get_queue()); viennacl::ocl::kernel &oclk_channel_div = program.get_kernel( CL_KERNEL_SELECT("kernel_channel_div")); viennacl::ocl::enqueue( oclk_channel_div(count, outer_num_, channels, inner_num_, - WrapHandle((cl_mem) scale_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx)), + WrapHandle(buf_scale.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); - #endif } } @@ -228,30 +230,33 @@ void SoftmaxLayer::Backward_gpu(const vector*>& top, this->device_->id()); viennacl::ocl::program &program = this->device_->program(); - greentea_copy(top[0]->count(), (cl_mem)top_diff, - 0, (cl_mem)bottom_diff, 0, &ctx); + caffe_copy(top[0]->count(), top_diff, bottom_diff); viennacl::ocl::kernel &oclk_channel_dot = program.get_kernel( CL_KERNEL_SELECT("kernel_channel_dot")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_top_diff = clState.get_buffer_mem(top_diff); + ClMemOff buf_top_data = clState.get_buffer_mem(top_data); + ClMemOff buf_scale = clState.get_buffer_mem(scale_data); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + viennacl::ocl::enqueue( oclk_channel_dot(outer_num_, channels, inner_num_, - WrapHandle((cl_mem)top_diff, &ctx), - WrapHandle((cl_mem)top_data, &ctx), - WrapHandle((cl_mem)scale_data, &ctx)), + WrapHandle(buf_top_diff.memobj, &ctx), + WrapHandle(buf_top_data.memobj, &ctx), + WrapHandle(buf_scale.memobj, &ctx)), ctx.get_queue()); viennacl::ocl::kernel &oclk_channel_subtract = program.get_kernel( CL_KERNEL_SELECT("kernel_channel_subtract")); viennacl::ocl::enqueue( oclk_channel_subtract(count, outer_num_, channels, inner_num_, - WrapHandle((cl_mem)scale_data, &ctx), - WrapHandle((cl_mem)bottom_diff, &ctx)), + WrapHandle(buf_scale.memobj, &ctx), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); - greentea_gpu_mul(this->device_->id(), top[0]->count(), - (cl_mem)bottom_diff, 0, - (cl_mem)top_data, 0, (cl_mem)bottom_diff, 0); - + caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); #endif } } diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu index e58484d9a19..a24ad9bf01c 100644 --- a/src/caffe/layers/softmax_loss_layer.cu +++ b/src/caffe/layers/softmax_loss_layer.cu @@ -6,8 +6,7 @@ #include "caffe/util/math_functions.hpp" #ifdef USE_GREENTEA -#include "caffe/greentea/greentea_im2col.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/im2col.hpp" #endif namespace caffe { @@ -43,6 +42,7 @@ template void SoftmaxWithLossLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + if (this->device_->backend() == BACKEND_CUDA) { #ifdef USE_CUDA const Dtype* prob_data = prob_.gpu_data(); @@ -79,43 +79,47 @@ void SoftmaxWithLossLayer::Forward_gpu( #endif // USE_CUDA } else { #ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); + viennacl::ocl::context &ctx = viennacl::ocl::get_context( + this->device_->id()); + viennacl::ocl::program &program = this->device_->program(); - cl_mem prob_data = (cl_mem) (prob_.gpu_data()); - cl_mem label = (cl_mem) (bottom[1]->gpu_data()); - const int_tp 
dim = prob_.count() / outer_num_; - const int_tp nthreads = outer_num_ * inner_num_; - cl_mem loss_data = (cl_mem) (bottom[0]->mutable_gpu_diff()); - cl_mem counts = (cl_mem) (prob_.mutable_gpu_diff()); - - viennacl::ocl::kernel &oclk_softmax_loss_forward = program.get_kernel( - CL_KERNEL_SELECT("softmax_loss_forward")); - viennacl::ocl::enqueue( - oclk_softmax_loss_forward(nthreads, WrapHandle(prob_data, &ctx), - WrapHandle(label, &ctx), - WrapHandle(loss_data, &ctx), outer_num_, dim, - inner_num_, has_ignore_label_ ? 1 : 0, - ignore_label_, WrapHandle(counts, &ctx)), - ctx.get_queue()); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_prob = clState.get_buffer_mem(prob_.gpu_data()); + ClMemOff buf_label = clState.get_buffer_mem(bottom[1]->gpu_data()); + ClMemOff buf_loss = + clState.get_buffer_mem(bottom[0]->mutable_gpu_diff()); + ClMemOff buf_counts = + clState.get_buffer_mem(prob_.mutable_gpu_diff()); - Dtype loss; - greentea_gpu_asum(this->device_->id(), nthreads, loss_data, 0, - &loss); - Dtype valid_count = -1; - // Only launch another CUDA kernel if we actually need the count of valid - // outputs. - if (normalization_ == LossParameter_NormalizationMode_VALID - && has_ignore_label_) { - greentea_gpu_asum(this->device_->id(), nthreads, counts, 0, - &valid_count); - } - top[0]->mutable_cpu_data()[0] = loss - / get_normalizer(normalization_, valid_count); - if (top.size() >= 2) { - top[1]->ShareData(prob_); - } + Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + Dtype* counts = prob_.mutable_gpu_diff(); + const int_tp dim = prob_.count() / outer_num_; + const int_tp nthreads = outer_num_ * inner_num_; + + viennacl::ocl::kernel &oclk_softmax_loss_forward = program.get_kernel( + CL_KERNEL_SELECT("softmax_loss_forward")); + viennacl::ocl::enqueue( + oclk_softmax_loss_forward(nthreads, WrapHandle(buf_prob.memobj, &ctx), + WrapHandle(buf_label.memobj, &ctx), + WrapHandle(buf_loss.memobj, &ctx), + outer_num_, dim, + inner_num_, has_ignore_label_ ? 
1 : 0, + ignore_label_, + WrapHandle(buf_counts.memobj, &ctx)), + ctx.get_queue()); + + Dtype loss; + caffe_gpu_asum(nthreads, loss_data, &loss); + Dtype valid_count = -1; + if (normalization_ == LossParameter_NormalizationMode_VALID + && has_ignore_label_) { + caffe_gpu_asum(nthreads, counts, &valid_count); + } + top[0]->mutable_cpu_data()[0] = loss + / get_normalizer(normalization_, valid_count); + if (top.size() >= 2) { + top[1]->ShareData(prob_); + } #endif // USE_GREENTEA } } @@ -187,39 +191,45 @@ void SoftmaxWithLossLayer::Backward_gpu( #endif // USE_CUDA } else { #ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - viennacl::ocl::program &program = this->device_->program(); - - cl_mem bottom_diff = (cl_mem)(bottom[0]->mutable_gpu_diff()); - cl_mem prob_data = (cl_mem)(prob_.gpu_data()); - cl_mem top_data = (cl_mem)(top[0]->gpu_data()); - greentea_gpu_memcpy(prob_.count() * sizeof(Dtype), - prob_data, 0, bottom_diff, 0, &ctx); - cl_mem label = (cl_mem)(bottom[1]->gpu_data()); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); + const Dtype* label = bottom[1]->gpu_data(); const int_tp dim = prob_.count() / outer_num_; const int_tp nthreads = outer_num_ * inner_num_; - cl_mem counts = (cl_mem)(prob_.mutable_gpu_diff()); - + // Since this memory is never used for anything else, + // we use to to avoid allocating new GPU memory. + Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + viennacl::ocl::context &ctx = viennacl::ocl::get_context( + this->device_->id()); + viennacl::ocl::program &program = this->device_->program(); viennacl::ocl::kernel &oclk_softmax_loss_backward = program.get_kernel( CL_KERNEL_SELECT("softmax_loss_backward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_label = clState.get_buffer_mem(label); + ClMemOff buf_counts = clState.get_buffer_mem(counts); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_softmax_loss_backward(nthreads, WrapHandle(top_data, &ctx), - WrapHandle(label, &ctx), WrapHandle(bottom_diff, &ctx), + oclk_softmax_loss_backward(nthreads, WrapHandle(buf_top.memobj, &ctx), + WrapHandle(buf_label.memobj, &ctx), + WrapHandle(buf_bottom.memobj, &ctx), outer_num_, dim, inner_num_, has_ignore_label_ ? 
1 : 0, - ignore_label_, WrapHandle(counts, &ctx)), + ignore_label_, WrapHandle(buf_counts.memobj, &ctx)), ctx.get_queue()); Dtype valid_count = -1; if (normalization_ == LossParameter_NormalizationMode_VALID && has_ignore_label_) { - greentea_gpu_asum(this->device_->id(), - nthreads, counts, 0, &valid_count); + caffe_gpu_asum(nthreads, counts, &valid_count); } const Dtype loss_weight = top[0]->cpu_diff()[0] / get_normalizer(normalization_, valid_count); - greentea_gpu_scal(this->device_->id(), - prob_.count(), loss_weight, bottom_diff, 0); + caffe_gpu_scal(prob_.count(), loss_weight , bottom_diff); #endif // USE_GREENTEA } } diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/split_layer.cu index 8503e2a82c0..ed73e866c43 100644 --- a/src/caffe/layers/split_layer.cu +++ b/src/caffe/layers/split_layer.cu @@ -21,43 +21,17 @@ void SplitLayer::Backward_gpu(const vector*>& top, return; } - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (top.size() == 1) { + if (top.size() == 1) { caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); return; - } - caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), - bottom[0]->mutable_gpu_diff()); - // Add remaining top blob diffs. - for (int_tp i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - this->device_->id()); - - if (top.size() == 1) { - greentea_copy(count_, (cl_mem) (top[0]->gpu_diff()), 0, - (cl_mem) (bottom[0]->mutable_gpu_diff()), 0, &ctx); - return; - } - greentea_gpu_add(this->device_->id(), count_, - (cl_mem) (top[0]->gpu_diff()), 0, - (cl_mem) (top[1]->gpu_diff()), 0, - (cl_mem) (bottom[0]->mutable_gpu_diff()), 0); - // Add remaining top blob diffs. - for (int_tp i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - greentea_gpu_axpy(this->device_->id(), count_, Dtype(1.), - (cl_mem) top_diff, 0, (cl_mem) bottom_diff, 0); - } -#endif // USE_GREENTEA + } + caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), + bottom[0]->mutable_gpu_diff()); + // Add remaining top blob diffs. 
+ for (int_tp i = 2; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); } } diff --git a/src/caffe/layers/tanh_layer.cu b/src/caffe/layers/tanh_layer.cu index eeebf81745c..4c67c10687e 100644 --- a/src/caffe/layers/tanh_layer.cu +++ b/src/caffe/layers/tanh_layer.cu @@ -39,9 +39,14 @@ void TanHLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_tanh = program.get_kernel( CL_KERNEL_SELECT("tanh_forward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_tanh(count, WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx)), + oclk_tanh(count, WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -84,10 +89,16 @@ void TanHLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_tanh = program.get_kernel( CL_KERNEL_SELECT("tanh_backward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + ClMemOff buf_top_diff = clState.get_buffer_mem(top_diff); + ClMemOff buf_top_data = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_tanh(count, WrapHandle((cl_mem) top_diff, &ctx), - WrapHandle((cl_mem) top_data, &ctx), - WrapHandle((cl_mem) bottom_diff, &ctx)), + oclk_tanh(count, WrapHandle(buf_top_diff.memobj, &ctx), + WrapHandle(buf_top_data.memobj, &ctx), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/layers/threshold_layer.cu b/src/caffe/layers/threshold_layer.cu index b3486f4c318..ca420b1d3d0 100644 --- a/src/caffe/layers/threshold_layer.cu +++ b/src/caffe/layers/threshold_layer.cu @@ -4,7 +4,7 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/math_functions.hpp" #endif namespace caffe { @@ -42,10 +42,14 @@ void ThresholdLayer::Forward_gpu(const vector*>& bottom, viennacl::ocl::kernel &oclk_threshold = program.get_kernel( CL_KERNEL_SELECT("threshold")); + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( oclk_threshold(count, threshold_, - WrapHandle((cl_mem) bottom_data, &ctx), - WrapHandle((cl_mem) top_data, &ctx)), + WrapHandle(buf_bottom.memobj, &ctx), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); ctx.get_queue().finish(); #endif // USE_GREENTEA diff --git a/src/caffe/layers/tile_layer.cu b/src/caffe/layers/tile_layer.cu index 15d0a114135..df618902b2a 100644 --- a/src/caffe/layers/tile_layer.cu +++ b/src/caffe/layers/tile_layer.cu @@ -5,7 +5,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif @@ -47,10 +46,15 @@ void TileLayer::Forward_gpu( viennacl::ocl::kernel &oclk_tile = program.get_kernel( CL_KERNEL_SELECT("tile")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_data); + ClMemOff buf_top = clState.get_buffer_mem(top_data); + viennacl::ocl::enqueue( - oclk_tile(nthreads, WrapHandle((cl_mem) bottom_data, &ctx), inner_dim_, + oclk_tile(nthreads, WrapHandle(buf_bottom.memobj, &ctx), inner_dim_, tiles_, bottom_tile_axis, - 
WrapHandle((cl_mem) top_data, &ctx)), + WrapHandle(buf_top.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } @@ -100,10 +104,15 @@ void TileLayer::Backward_gpu(const vector*>& top, viennacl::ocl::kernel &oclk_tile = program.get_kernel( CL_KERNEL_SELECT("tile_backward")); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_bottom = clState.get_buffer_mem(bottom_diff); + ClMemOff buf_top = clState.get_buffer_mem(top_diff); + viennacl::ocl::enqueue( - oclk_tile(nthreads, WrapHandle((cl_mem) top_diff, &ctx), tile_size, + oclk_tile(nthreads, WrapHandle(buf_top.memobj, &ctx), tile_size, tiles_, bottom_tile_axis, - WrapHandle((cl_mem) bottom_diff, &ctx)), + WrapHandle(buf_bottom.memobj, &ctx)), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 70c66c10ff2..24634f3867c 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -984,18 +984,8 @@ void Net::ClearParamDiffs() { break; case Caffe::GPU: #ifndef CPU_ONLY - if (device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA caffe_gpu_set(blob->count(), static_cast(0), blob->mutable_gpu_diff()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_set(device_->id(), - blob->count(), static_cast(0), - (cl_mem)(blob->mutable_gpu_diff()), 0); -#endif // USE_GREENTEA - } #else NO_GPU; #endif diff --git a/src/caffe/solvers/adadelta_solver.cu b/src/caffe/solvers/adadelta_solver.cu index 97daecf41a9..b153d608ce2 100644 --- a/src/caffe/solvers/adadelta_solver.cu +++ b/src/caffe/solvers/adadelta_solver.cu @@ -3,7 +3,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -38,10 +37,15 @@ void adadelta_update_gpu(device* dev, int_tp N, Dtype* g, Dtype* h, Dtype* h2, viennacl::ocl::program &program = dev->program(); viennacl::ocl::kernel &oclk_ada_delta_update = program.get_kernel( CL_KERNEL_SELECT("ada_delta_update")); + ClState& clState = Caffe::cl_state(); + ClMemOff bufg = clState.get_buffer_mem(g); + ClMemOff bufh = clState.get_buffer_mem(h); + ClMemOff bufh2 = clState.get_buffer_mem(h2); + viennacl::ocl::enqueue( - oclk_ada_delta_update(N, WrapHandle((cl_mem) g, &ctx), - WrapHandle((cl_mem) h, &ctx), - WrapHandle((cl_mem) h2, &ctx), momentum, delta, + oclk_ada_delta_update(N, WrapHandle(bufg.memobj, &ctx), + WrapHandle(bufh.memobj, &ctx), + WrapHandle(bufh2.memobj, &ctx), momentum, delta, local_rate), ctx.get_queue()); #endif // USE_GREENTEA diff --git a/src/caffe/solvers/adagrad_solver.cu b/src/caffe/solvers/adagrad_solver.cu index 347285807c7..64695d62dc4 100644 --- a/src/caffe/solvers/adagrad_solver.cu +++ b/src/caffe/solvers/adagrad_solver.cu @@ -3,7 +3,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -36,9 +35,14 @@ void adagrad_update_gpu(device* dev, int_tp N, Dtype* g, Dtype* h, Dtype delta, viennacl::ocl::program &program = dev->program(); viennacl::ocl::kernel &oclk_ada_grad_update = program.get_kernel( CL_KERNEL_SELECT("ada_grad_update")); + + ClState& clState = Caffe::cl_state(); + ClMemOff bufg = clState.get_buffer_mem(g); + ClMemOff bufh = clState.get_buffer_mem(h); + viennacl::ocl::enqueue( - oclk_ada_grad_update(N, WrapHandle((cl_mem) g, &ctx), - WrapHandle((cl_mem) h, &ctx), delta, local_rate), + oclk_ada_grad_update(N, WrapHandle(bufg.memobj, &ctx), + WrapHandle(bufh.memobj, &ctx), delta, local_rate), ctx.get_queue()); #endif // USE_GREENTEA } diff --git 
a/src/caffe/solvers/adam_solver.cu b/src/caffe/solvers/adam_solver.cu index 5fc35918ad5..01ba5d47413 100644 --- a/src/caffe/solvers/adam_solver.cu +++ b/src/caffe/solvers/adam_solver.cu @@ -3,7 +3,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -39,10 +38,15 @@ void adam_update_gpu(device* dev, int_tp N, Dtype* g, Dtype* m, Dtype* v, viennacl::ocl::program &program = dev->program(); viennacl::ocl::kernel &oclk_adam_update = program.get_kernel( CL_KERNEL_SELECT("adam_update")); + ClState& clState = Caffe::cl_state(); + ClMemOff bufg = clState.get_buffer_mem(g); + ClMemOff bufm = clState.get_buffer_mem(m); + ClMemOff bufv = clState.get_buffer_mem(v); + viennacl::ocl::enqueue( - oclk_adam_update(N, WrapHandle((cl_mem) g, &ctx), - WrapHandle((cl_mem) m, &ctx), - WrapHandle((cl_mem) v, &ctx), beta1, beta2, eps_hat, + oclk_adam_update(N, WrapHandle(bufg.memobj, &ctx), + WrapHandle(bufm.memobj, &ctx), + WrapHandle(bufv.memobj, &ctx), beta1, beta2, eps_hat, corrected_local_rate), ctx.get_queue()); #endif // USE_GREENTEA diff --git a/src/caffe/solvers/nesterov_solver.cu b/src/caffe/solvers/nesterov_solver.cu index 9a0d491a59e..a5ee5180bcc 100644 --- a/src/caffe/solvers/nesterov_solver.cu +++ b/src/caffe/solvers/nesterov_solver.cu @@ -3,7 +3,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -36,9 +35,13 @@ void nesterov_update_gpu(device* dev, int_tp N, Dtype* g, Dtype* h, viennacl::ocl::program &program = dev->program(); viennacl::ocl::kernel &oclk_nesterov_update = program.get_kernel( CL_KERNEL_SELECT("nesterov_update")); + ClState& clState = Caffe::cl_state(); + ClMemOff bufg = clState.get_buffer_mem(g); + ClMemOff bufh = clState.get_buffer_mem(h); + viennacl::ocl::enqueue( - oclk_nesterov_update(N, WrapHandle((cl_mem) g, &ctx), - WrapHandle((cl_mem) h, &ctx), momentum, + oclk_nesterov_update(N, WrapHandle(bufg.memobj, &ctx), + WrapHandle(bufh.memobj, &ctx), momentum, local_rate), ctx.get_queue()); #endif // USE_GREENTEA diff --git a/src/caffe/solvers/rmsprop_solver.cu b/src/caffe/solvers/rmsprop_solver.cu index dc62df571f0..22adb782f24 100644 --- a/src/caffe/solvers/rmsprop_solver.cu +++ b/src/caffe/solvers/rmsprop_solver.cu @@ -3,7 +3,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -36,11 +35,14 @@ void rmsprop_update_gpu(device* dev, int_tp N, Dtype* g, Dtype* h, viennacl::ocl::program &program = dev->program(); viennacl::ocl::kernel &oclk_rms_prop_update = program.get_kernel( CL_KERNEL_SELECT("rms_prop_update")); + ClState& clState = Caffe::cl_state(); + ClMemOff bufg = clState.get_buffer_mem(g); + ClMemOff bufh = clState.get_buffer_mem(h); viennacl::ocl::enqueue( - oclk_rms_prop_update(N, WrapHandle((cl_mem) g, &ctx), - WrapHandle((cl_mem) h, &ctx), - rms_decay, delta, - local_rate), + oclk_rms_prop_update(N, WrapHandle(bufg.memobj, &ctx), + WrapHandle(bufh.memobj, &ctx), + rms_decay, delta, + local_rate), ctx.get_queue()); #endif // USE_GREENTEA } diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 86dfa2b3ebe..0e07f8988b9 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -137,19 +137,8 @@ void SGDSolver::Normalize(int param_id) { } case Caffe::GPU: { #ifndef CPU_ONLY - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef 
USE_CUDA - caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, - net_params[param_id]->mutable_gpu_diff()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_scal(this->device_->id(), - net_params[param_id]->count(), accum_normalization, - (cl_mem) (net_params[param_id]->mutable_gpu_diff()), - 0); -#endif // USE_GREENTEA - } + caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, + net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif @@ -191,55 +180,25 @@ void SGDSolver::Regularize(int param_id) { } case Caffe::GPU: { #ifndef CPU_ONLY - if (this->device_->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else if (regularization_type == "L1") { - caffe_gpu_sign(net_params[param_id]->count(), - net_params[param_id]->gpu_data(), - temp_[param_id]->mutable_gpu_data()); - caffe_gpu_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->gpu_data(), - net_params[param_id]->mutable_gpu_diff()); - } else { - LOG(FATAL)<< "Unknown regularization type: " - << regularization_type; - } - } -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - if (local_decay) { - if (regularization_type == "L2") { - // add weight decay - greentea_gpu_axpy(this->device_->id(), - net_params[param_id]->count(), - local_decay, - (cl_mem)(net_params[param_id]->gpu_data()), 0, - (cl_mem)(net_params[param_id]->mutable_gpu_diff()), 0); - } else if (regularization_type == "L1") { - greentea_gpu_sign(this->device_->id(), - net_params[param_id]->count(), - (cl_mem)(net_params[param_id]->gpu_data()), 0, - (cl_mem)(temp_[param_id]->mutable_gpu_data()), 0); - greentea_gpu_axpy(this->device_->id(), - net_params[param_id]->count(), - local_decay, - (cl_mem)(temp_[param_id]->gpu_data()), 0, - (cl_mem)(net_params[param_id]->mutable_gpu_diff()), 0); - } else { - LOG(FATAL)<< "Unknown regularization type: " - << regularization_type; - } + if (local_decay) { + if (regularization_type == "L2") { + // add weight decay + caffe_gpu_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else if (regularization_type == "L1") { + caffe_gpu_sign(net_params[param_id]->count(), + net_params[param_id]->gpu_data(), + temp_[param_id]->mutable_gpu_data()); + caffe_gpu_axpy(net_params[param_id]->count(), + local_decay, + temp_[param_id]->gpu_data(), + net_params[param_id]->mutable_gpu_diff()); + } else { + LOG(FATAL)<< "Unknown regularization type: " + << regularization_type; } -#endif // USE_GREENTEA } #else NO_GPU; diff --git a/src/caffe/solvers/sgd_solver.cu b/src/caffe/solvers/sgd_solver.cu index d0cd2cb26f0..ef615f7889e 100644 --- a/src/caffe/solvers/sgd_solver.cu +++ b/src/caffe/solvers/sgd_solver.cu @@ -3,7 +3,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -34,10 +33,16 @@ void sgd_update_gpu(device* dev, int_tp N, Dtype* g, Dtype* h, Dtype momentum, viennacl::ocl::program &program = dev->program(); viennacl::ocl::kernel &oclk_sgd_update = program.get_kernel( CL_KERNEL_SELECT("sgd_update")); + + ClState& clState = Caffe::cl_state(); + ClMemOff bufg = clState.get_buffer_mem(g); + ClMemOff bufh = clState.get_buffer_mem(h); + viennacl::ocl::enqueue( - oclk_sgd_update(N, 
WrapHandle((cl_mem) g, &ctx), - WrapHandle((cl_mem) h, &ctx), momentum, local_rate), + oclk_sgd_update(N, WrapHandle(bufg.memobj, &ctx), + WrapHandle(bufh.memobj, &ctx), momentum, local_rate), ctx.get_queue()); + #endif // USE_GREENTEA } } diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 532d6ad406d..1d69581cbcc 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -6,8 +6,7 @@ #include "caffe/util/math_functions.hpp" #ifdef USE_GREENTEA -#include "caffe/greentea/greentea_im2col.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/im2col.hpp" #define ZEROCOPY_SUPPORTED(device, ptr, size) \ (device->is_host_unified() &&\ @@ -83,8 +82,9 @@ SyncedMemory::~SyncedMemory() { viennacl::ocl::context &ctx = viennacl::ocl::get_context( device_->id()); ctx.get_queue().finish(); - CHECK_EQ(CL_SUCCESS, clReleaseMemObject(cl_gpu_mem_)) - << "OpenCL memory corruption"; + + Caffe::cl_state().destroy_buffer(gpu_ptr_); + gpu_ptr_ = nullptr; cl_gpu_mem_ = nullptr; ctx.get_queue().finish(); @@ -133,10 +133,13 @@ inline void SyncedMemory::to_cpu() { viennacl::ocl::context &ctx = viennacl::ocl::get_context( device_->id()); if (!own_zero_copy_data_) { - greentea_gpu_memcpy(size_, (cl_mem) gpu_ptr_, 0, cpu_ptr_, &ctx); + caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_); } else { + ClState& clState = Caffe::cl_state(); + ClMemOff buf_gpu = clState.get_buffer_mem(gpu_ptr_); + void *mapped_ptr = clEnqueueMapBuffer(ctx.get_queue().handle().get(), - (cl_mem) gpu_ptr_, + buf_gpu.memobj, true, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); @@ -144,7 +147,7 @@ inline void SyncedMemory::to_cpu() { << "Device claims it support zero copy" << " but failed to create correct user ptr buffer"; clEnqueueUnmapMemObject(ctx.get_queue().handle().get(), - (cl_mem) gpu_ptr_, + buf_gpu.memobj, mapped_ptr, 0, NULL, NULL); } ctx.get_queue().finish(); @@ -158,7 +161,7 @@ inline void SyncedMemory::to_cpu() { } case HEAD_AT_CPU: case SYNCED: - break; + break; } } @@ -179,10 +182,11 @@ inline void SyncedMemory::to_gpu() { device_->id()); ctx.get_queue().finish(); cl_int err; + if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { - cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(), - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - size_, nullptr, &err); + gpu_ptr_ = Caffe::cl_state().create_buffer(device_->id(), + CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size_, NULL, &err); + cl_gpu_mem_ = Caffe::cl_state().get_buffer_mem(gpu_ptr_).memobj; } else if (device_->is_host_unified()) { // auto saved_mode = Caffe::mode(); // Caffe::set_mode(Caffe::GPU); @@ -190,9 +194,11 @@ inline void SyncedMemory::to_gpu() { // Caffe::set_mode(saved_mode); caffe_memset(size_, 0, cpu_ptr_); own_cpu_data_ = true; - cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(), - CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, - size_, cpu_ptr_, &err); + + gpu_ptr_ = Caffe::cl_state().create_buffer(device_->id(), + CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size_, cpu_ptr_, &err); + cl_gpu_mem_ = Caffe::cl_state().get_buffer_mem(gpu_ptr_).memobj; + void *mapped_ptr = clEnqueueMapBuffer( ctx.get_queue().handle().get(), cl_gpu_mem_, @@ -208,10 +214,11 @@ inline void SyncedMemory::to_gpu() { own_zero_copy_data_ = true; } - if (cl_gpu_mem_ == nullptr) - cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(), - CL_MEM_READ_WRITE, - size_, nullptr, &err); + if (cl_gpu_mem_ == nullptr) { + gpu_ptr_ = Caffe::cl_state().create_buffer(device_->id(), + CL_MEM_READ_WRITE, size_, NULL, &err); + cl_gpu_mem_ = 
Caffe::cl_state().get_buffer_mem(gpu_ptr_).memobj; + } CHECK_EQ(0, err) << "OpenCL buffer allocation of size " << size_ << " failed."; @@ -219,9 +226,8 @@ inline void SyncedMemory::to_gpu() { device_->IncreaseMemoryUsage(size_); if (!own_zero_copy_data_) { int_tp alpha = 0; - greentea_memset(device_->id(), size_, alpha, cl_gpu_mem_, 0); + caffe_gpu_memset(size_, alpha, gpu_ptr_); } - gpu_ptr_ = reinterpret_cast(cl_gpu_mem_); ctx.get_queue().finish(); own_gpu_data_ = true; #endif // USE_GREENTEA @@ -244,19 +250,21 @@ inline void SyncedMemory::to_gpu() { viennacl::ocl::context &ctx = viennacl::ocl::get_context( device_->id()); ctx.get_queue().finish(); + if (gpu_ptr_ == nullptr) { cl_int err; if (ctx.devices()[0].type() == CL_DEVICE_TYPE_CPU) { - cl_gpu_mem_ = clCreateBuffer( - ctx.handle().get(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - size_, nullptr, &err); + gpu_ptr_ = Caffe::cl_state().create_buffer(device_->id(), + CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size_, NULL, &err); + cl_gpu_mem_ = Caffe::cl_state().get_buffer_mem(gpu_ptr_).memobj; } else if (ZEROCOPY_SUPPORTED(device_, cpu_ptr_, size_)) { - cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(), - CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, - size_, cpu_ptr_, &err); + gpu_ptr_ = Caffe::cl_state().create_buffer(device_->id(), + CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size_, cpu_ptr_, &err); + cl_gpu_mem_ = Caffe::cl_state().get_buffer_mem(gpu_ptr_).memobj; + void *mapped_ptr = clEnqueueMapBuffer( ctx.get_queue().handle().get(), - (cl_mem) cl_gpu_mem_, + cl_gpu_mem_, true, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL); @@ -268,17 +276,19 @@ inline void SyncedMemory::to_gpu() { mapped_ptr, 0, NULL, NULL); own_zero_copy_data_ = true; } - if (cl_gpu_mem_ == nullptr) - cl_gpu_mem_ = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE, - size_, nullptr, &err); + if (cl_gpu_mem_ == nullptr) { + gpu_ptr_ = Caffe::cl_state().create_buffer(device_->id(), + CL_MEM_READ_WRITE, size_, NULL, &err); + cl_gpu_mem_ = Caffe::cl_state().get_buffer_mem(gpu_ptr_).memobj; + } CHECK_EQ(0, err) << "OpenCL buffer allocation of size " << size_ << " failed."; device_->IncreaseMemoryUsage(size_); - gpu_ptr_ = reinterpret_cast(cl_gpu_mem_); ctx.get_queue().finish(); } - if (!own_zero_copy_data_) - greentea_gpu_memcpy(size_, cpu_ptr_, (cl_mem) gpu_ptr_, 0, &ctx); + if (!own_zero_copy_data_) { + caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_); + } ctx.get_queue().finish(); own_gpu_data_ = true; #endif // USE_GREENTEA diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp index 384e3ea8e56..695f6619af2 100644 --- a/src/caffe/test/test_math_functions.cpp +++ b/src/caffe/test/test_math_functions.cpp @@ -13,7 +13,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -141,38 +140,16 @@ TYPED_TEST(GPUMathFunctionsTest, TestAsum) { } TypeParam gpu_asum; - device *dc = Caffe::GetDefaultDevice(); + caffe_gpu_asum(n, this->blob_bottom_->gpu_data(), &gpu_asum); - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_asum(n, this->blob_bottom_->gpu_data(), &gpu_asum); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_asum(dc->id(), n, - (cl_mem)(this->blob_bottom_->gpu_data()), 0, &gpu_asum); -#endif // USE_GREENTEA - } EXPECT_LT((gpu_asum - std_asum) / std_asum, 1e-2); } TYPED_TEST(GPUMathFunctionsTest, TestSign) { int_tp n = this->blob_bottom_->count(); - device *dc = Caffe::GetDefaultDevice(); - - if 
(dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_sign(n, this->blob_bottom_->gpu_data(), + caffe_gpu_sign(n, this->blob_bottom_->gpu_data(), this->blob_bottom_->mutable_gpu_diff()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_sign(dc->id(), n, - (cl_mem)(this->blob_bottom_->gpu_data()), 0, - (cl_mem)(this->blob_bottom_->mutable_gpu_diff()), 0); -#endif // USE_GREENTEA - } const TypeParam* signs = this->blob_bottom_->cpu_diff(); const TypeParam* x = this->blob_bottom_->cpu_data(); @@ -184,20 +161,8 @@ TYPED_TEST(GPUMathFunctionsTest, TestSign) { TYPED_TEST(GPUMathFunctionsTest, TestSgnbit) { int_tp n = this->blob_bottom_->count(); - device *dc = Caffe::GetDefaultDevice(); - - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_sgnbit(n, this->blob_bottom_->gpu_data(), - this->blob_bottom_->mutable_gpu_diff()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_sgnbit(dc->id(), n, - (cl_mem)(this->blob_bottom_->gpu_data()), 0, - (cl_mem)(this->blob_bottom_->mutable_gpu_diff()), 0); -#endif // USE_GREENTEA - } + caffe_gpu_sgnbit(n, this->blob_bottom_->gpu_data(), + this->blob_bottom_->mutable_gpu_diff()); const TypeParam* signbits = this->blob_bottom_->cpu_diff(); const TypeParam* x = this->blob_bottom_->cpu_data(); @@ -209,20 +174,8 @@ TYPED_TEST(GPUMathFunctionsTest, TestSgnbit) { TYPED_TEST(GPUMathFunctionsTest, TestFabs) { int_tp n = this->blob_bottom_->count(); - device *dc = Caffe::GetDefaultDevice(); - - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_abs(n, this->blob_bottom_->gpu_data(), - this->blob_bottom_->mutable_gpu_diff()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_abs(dc->id(), n, - (cl_mem)(this->blob_bottom_->gpu_data()), 0, - (cl_mem)(this->blob_bottom_->mutable_gpu_diff()), 0); -#endif // USE_GREENTEA - } + caffe_gpu_abs(n, this->blob_bottom_->gpu_data(), + this->blob_bottom_->mutable_gpu_diff()); const TypeParam* abs_val = this->blob_bottom_->cpu_diff(); const TypeParam* x = this->blob_bottom_->cpu_data(); @@ -236,19 +189,8 @@ TYPED_TEST(GPUMathFunctionsTest, TestScale) { TypeParam alpha = this->blob_bottom_->cpu_diff()[caffe_rng_rand() % this->blob_bottom_->count()]; - device *dc = Caffe::GetDefaultDevice(); - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_scale(n, alpha, this->blob_bottom_->gpu_data(), + caffe_gpu_scale(n, alpha, this->blob_bottom_->gpu_data(), this->blob_bottom_->mutable_gpu_diff()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_scale(dc->id(), n, alpha, - (cl_mem)(this->blob_bottom_->gpu_data()), 0, - (cl_mem)(this->blob_bottom_->mutable_gpu_diff()), 0); -#endif // USE_GREENTEA - } const TypeParam* scaled = this->blob_bottom_->cpu_diff(); const TypeParam* x = this->blob_bottom_->cpu_data(); @@ -262,20 +204,7 @@ TYPED_TEST(GPUMathFunctionsTest, TestCopy) { const TypeParam* bottom_data = this->blob_bottom_->gpu_data(); TypeParam* top_data = this->blob_top_->mutable_gpu_data(); - device *dc = Caffe::GetDefaultDevice(); - if (dc->backend() == BACKEND_CUDA) { - #ifdef USE_CUDA - caffe_copy(n, bottom_data, top_data); - #endif // USE_CUDA - } else { - #ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context( - dc->id()); - - greentea_copy(n, (cl_mem)bottom_data, 0, - (cl_mem)top_data, 0, &ctx); - #endif // USE_GREENTEA - } + caffe_copy(n, bottom_data, top_data); bottom_data = this->blob_bottom_->cpu_data(); top_data = this->blob_top_->mutable_cpu_data(); diff --git 
a/src/caffe/test/test_ocl_kernel_compile.cpp b/src/caffe/test/test_ocl_kernel_compile.cpp index 0fb25a9bb82..0ab85bec3fd 100644 --- a/src/caffe/test/test_ocl_kernel_compile.cpp +++ b/src/caffe/test/test_ocl_kernel_compile.cpp @@ -14,7 +14,7 @@ #include "caffe/greentea/cl_kernels.hpp" #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" +#include "caffe/util/math_functions.hpp" namespace caffe { diff --git a/src/caffe/test/test_random_number_generator.cpp b/src/caffe/test/test_random_number_generator.cpp index 803f803b991..a4ce67dc893 100644 --- a/src/caffe/test/test_random_number_generator.cpp +++ b/src/caffe/test/test_random_number_generator.cpp @@ -10,7 +10,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -182,53 +181,22 @@ class RandomNumberGeneratorTest : public ::testing::Test { void RngGaussianFillGPU(const Dtype mu, const Dtype sigma, void* gpu_data) { Dtype* rng_data = static_cast(gpu_data); - device *dc = Caffe::GetDefaultDevice(); - - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_rng_gaussian(sample_size_, mu, sigma, rng_data); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_rng_gaussian(dc->id(), sample_size_, - mu, sigma, (cl_mem)rng_data, 0); -#endif // USE_GREENTEA - } + caffe_gpu_rng_gaussian(sample_size_, mu, sigma, rng_data); } void RngUniformFillGPU(const Dtype lower, const Dtype upper, void* gpu_data) { CHECK_GE(upper, lower); Dtype* rng_data = static_cast(gpu_data); - device *dc = Caffe::GetDefaultDevice(); - - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_rng_uniform(sample_size_, lower, upper, rng_data); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_rng_uniform(dc->id(), sample_size_, - lower, upper, (cl_mem)rng_data, 0); -#endif // USE_GREENTEA - } + caffe_gpu_rng_uniform(sample_size_, lower, upper, rng_data); } // Fills with uniform integers in [0, UINT_MAX] using 2 argument form of // caffe_gpu_rng_uniform. 
void RngUniformIntFillGPU(void* gpu_data) { uint_tp* rng_data = static_cast(gpu_data); - device *dc = Caffe::GetDefaultDevice(); - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_rng_uniform(sample_size_, (uint_tpc*)rng_data); // NOLINT -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_rng_uniform(dc->id(), sample_size_, (cl_mem)rng_data, 0); -#endif // USE_GREENTEA - } + caffe_gpu_rng_uniform(sample_size_, reinterpret_cast(rng_data)); } #endif diff --git a/src/caffe/test/test_syncedmem.cpp b/src/caffe/test/test_syncedmem.cpp index 1b7d6dd53ed..ff57698488a 100644 --- a/src/caffe/test/test_syncedmem.cpp +++ b/src/caffe/test/test_syncedmem.cpp @@ -11,7 +11,6 @@ #ifdef USE_GREENTEA #include "caffe/greentea/greentea.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #endif namespace caffe { @@ -86,18 +85,7 @@ TEST_F(SyncedMemoryTest, TestGPURead) { // check if values are the same char* recovered_value = new char[10]; - device *dc = Caffe::GetDefaultDevice(); - - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_memcpy(10, gpu_data, recovered_value); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context(dc->id()); - greentea_gpu_memcpy(10, (cl_mem) gpu_data, 0, recovered_value, &ctx); -#endif // USE_GREENTEA - } + caffe_gpu_memcpy(10, gpu_data, recovered_value); for (int_tp i = 0; i < mem.size(); ++i) { EXPECT_EQ((static_cast(recovered_value))[i], 1); @@ -113,16 +101,7 @@ TEST_F(SyncedMemoryTest, TestGPURead) { EXPECT_EQ(mem.head(), SyncedMemory::SYNCED); // check if values are the same - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_memcpy(10, gpu_data, recovered_value); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - viennacl::ocl::context &ctx = viennacl::ocl::get_context(dc->id()); - greentea_gpu_memcpy(10, (cl_mem) gpu_data, 0, recovered_value, &ctx); -#endif // USE_GREENTEA - } + caffe_gpu_memcpy(10, gpu_data, recovered_value); for (int_tp i = 0; i < mem.size(); ++i) { EXPECT_EQ((static_cast(recovered_value))[i], 2); @@ -135,17 +114,7 @@ TEST_F(SyncedMemoryTest, TestGPUWrite) { void* gpu_data = mem.mutable_gpu_data(); EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_GPU); - device *dc = Caffe::GetDefaultDevice(); - - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_memset(mem.size(), 1, gpu_data); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_memset(dc->id(), mem.size(), 1, (cl_mem) gpu_data, 0); -#endif // USE_GREENTEA - } + caffe_gpu_memset(mem.size(), 1, gpu_data); const void* cpu_data = mem.cpu_data(); for (int_tp i = 0; i < mem.size(); ++i) { @@ -156,15 +125,7 @@ TEST_F(SyncedMemoryTest, TestGPUWrite) { gpu_data = mem.mutable_gpu_data(); EXPECT_EQ(mem.head(), SyncedMemory::HEAD_AT_GPU); - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_memset(mem.size(), 2, gpu_data); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_memset(dc->id(), mem.size(), 2, (cl_mem) gpu_data, 0); -#endif // USE_GREENTEA - } + caffe_gpu_memset(mem.size(), 2, gpu_data); cpu_data = mem.cpu_data(); for (int_tp i = 0; i < mem.size(); ++i) { diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp index 43699d09860..b0a41933b1d 100644 --- a/src/caffe/test/test_util_blas.cpp +++ b/src/caffe/test/test_util_blas.cpp @@ -16,8 +16,6 @@ class GemmTest : public ::testing::Test {}; TYPED_TEST_CASE(GemmTest, TestDtypes); TYPED_TEST(GemmTest, TestGemmCPUGPU) { - device *dc = 
Caffe::GetDefaultDevice(); - Blob A(1, 1, 2, 3, Caffe::GetDefaultDevice()); Blob B(1, 1, 3, 4, Caffe::GetDefaultDevice()); Blob C(1, 1, 2, 4, Caffe::GetDefaultDevice()); @@ -36,21 +34,8 @@ TYPED_TEST(GemmTest, TestGemmCPUGPU) { EXPECT_EQ(C.cpu_data()[i], result[i]); } - - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1., - A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_gemm(dc->id(), CblasNoTrans, CblasNoTrans, - 2, 4, 3, 1., - (cl_mem)(A.gpu_data()), 0, - (cl_mem)(B.gpu_data()), 0, 0., - (cl_mem)(C.mutable_gpu_data()), 0); -#endif // USE_GREENTEA - } + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1., + A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); for (int_tp i = 0; i < 8; ++i) { EXPECT_EQ(C.cpu_data()[i], result[i]); @@ -65,20 +50,8 @@ TYPED_TEST(GemmTest, TestGemmCPUGPU) { EXPECT_EQ(C.cpu_data()[i], result[i]); } - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA caffe_gpu_gemm(CblasTrans, CblasNoTrans, 2, 4, 3, 1., A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_gemm(dc->id(), CblasTrans, CblasNoTrans, - 2, 4, 3, 1., - (cl_mem)(A.gpu_data()), 0, - (cl_mem)(B.gpu_data()), 0, - 0., (cl_mem)(C.mutable_gpu_data()), 0); -#endif // USE_GREENTEA - } for (int_tp i = 0; i < 8; ++i) { EXPECT_EQ(C.cpu_data()[i], result[i]); @@ -93,20 +66,8 @@ TYPED_TEST(GemmTest, TestGemmCPUGPU) { EXPECT_EQ(C.cpu_data()[i], result[i]); } - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_gemm(CblasTrans, CblasTrans, 2, 4, 3, 1., - A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_gemm(dc->id(), CblasTrans, CblasTrans, - 2, 4, 3, 1., - (cl_mem)(A.gpu_data()), 0, - (cl_mem)(B.gpu_data()), 0, 0., - (cl_mem)(C.mutable_gpu_data()), 0); -#endif // USE_GREENTEA - } + caffe_gpu_gemm(CblasTrans, CblasTrans, 2, 4, 3, 1., + A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); for (int_tp i = 0; i < 8; ++i) { EXPECT_EQ(C.cpu_data()[i], result[i]); @@ -121,20 +82,8 @@ TYPED_TEST(GemmTest, TestGemmCPUGPU) { EXPECT_EQ(C.cpu_data()[i], result[i]); } - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_gemm(CblasNoTrans, CblasTrans, 2, 4, 3, 1., - A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_gemm(dc->id(), CblasNoTrans, CblasTrans, - 2, 4, 3, 1., - (cl_mem)(A.gpu_data()), 0, - (cl_mem)(B.gpu_data()), 0, 0., - (cl_mem)(C.mutable_gpu_data()), 0); -#endif // USE_GREENTEA - } + caffe_gpu_gemm(CblasNoTrans, CblasTrans, 2, 4, 3, 1., + A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data()); for (int_tp i = 0; i < 8; ++i) { EXPECT_EQ(C.cpu_data()[i], result[i]); @@ -143,8 +92,6 @@ TYPED_TEST(GemmTest, TestGemmCPUGPU) { TYPED_TEST(GemmTest, TestGemvCPUGPU) { - device *dc = Caffe::GetDefaultDevice(); - Blob A(1, 1, 2, 3, Caffe::GetDefaultDevice()); Blob x(1, 1, 1, 3, Caffe::GetDefaultDevice()); Blob y(1, 1, 1, 2, Caffe::GetDefaultDevice()); @@ -162,20 +109,8 @@ TYPED_TEST(GemmTest, TestGemvCPUGPU) { EXPECT_EQ(y.cpu_data()[i], result_2[i]); } - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_gemv(CblasNoTrans, 2, 3, 1., A.gpu_data(), - x.gpu_data(), 0., y.mutable_gpu_data()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_gemv(dc->id(), CblasNoTrans, - 2, 3, 1., - (cl_mem)(A.gpu_data()), 0, - 
(cl_mem)(x.gpu_data()), 0, 0., - (cl_mem)(y.mutable_gpu_data()), 0); -#endif // USE_GREENTEA - } + caffe_gpu_gemv(CblasNoTrans, 2, 3, 1., A.gpu_data(), + x.gpu_data(), 0., y.mutable_gpu_data()); for (int_tp i = 0; i < 2; ++i) { EXPECT_EQ(y.cpu_data()[i], result_2[i]); @@ -189,20 +124,8 @@ TYPED_TEST(GemmTest, TestGemvCPUGPU) { EXPECT_EQ(x.cpu_data()[i], result_3[i]); } - if (dc->backend() == BACKEND_CUDA) { -#ifdef USE_CUDA - caffe_gpu_gemv(CblasTrans, 2, 3, 1., A.gpu_data(), - y.gpu_data(), 0., x.mutable_gpu_data()); -#endif // USE_CUDA - } else { -#ifdef USE_GREENTEA - greentea_gpu_gemv(dc->id(), CblasTrans, - 2, 3, 1., - (cl_mem)(A.gpu_data()), 0, - (cl_mem)(y.gpu_data()), 0, 0., - (cl_mem)(x.mutable_gpu_data()), 0); -#endif // USE_GREENTEA - } + caffe_gpu_gemv(CblasTrans, 2, 3, 1., A.gpu_data(), + y.gpu_data(), 0., x.mutable_gpu_data()); for (int_tp i = 0; i < 3; ++i) { EXPECT_EQ(x.cpu_data()[i], result_3[i]); diff --git a/src/caffe/util/cl_fft.cpp b/src/caffe/util/cl_fft.cpp index 9fe87c8b94a..4057a59f0c8 100644 --- a/src/caffe/util/cl_fft.cpp +++ b/src/caffe/util/cl_fft.cpp @@ -7,8 +7,8 @@ #if defined(USE_GREENTEA) && defined(USE_FFT) #include "caffe/device.hpp" #include "caffe/greentea/cl_kernels.hpp" -#include "caffe/greentea/greentea_math_functions.hpp" #include "caffe/util/fft.hpp" +#include "caffe/util/math_functions.hpp" // #define DEBUG_PROFILE @@ -30,7 +30,7 @@ void kernel_execution_time(cl_event* event, const char* kernel_name) { void clear_gpu_fft_buffer(void* data, const int size) { device *dc = Caffe::GetDefaultDevice(); - greentea_memset(dc->id(), size, 0, (cl_mem) data, 0); + caffe_memset(size, 0, data); } // Copy and cyclic-shift 0 padding of weights to FFT real buffer @@ -54,9 +54,14 @@ void fft_gpu_copy2buffer(Dtype* fft_gpu_weights_real, const Dtype* weight, const int complex_width_len = 2*(fft_width/2 + 1); viennacl::ocl::kernel & kernel = ctx.get_kernel("kernel_program", CL_KERNEL_SELECT("copy2buffer_cyclic_shift_in")); - kernel.arg(argIdx++, WrapHandle((cl_mem)fft_gpu_weights_real, &ctx)); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_fft = clState.get_buffer_mem(fft_gpu_weights_real); + ClMemOff buf_weight = clState.get_buffer_mem(weight); + + kernel.arg(argIdx++, WrapHandle(buf_fft.memobj, &ctx)); kernel.arg(argIdx++, offset_offset_fft_gpu_weights_real); - kernel.arg(argIdx++, WrapHandle((cl_mem)weight, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_weight.memobj, &ctx)); kernel.arg(argIdx++, offset_offset_weight); kernel.arg(argIdx++, ker_size); kernel.arg(argIdx++, ch_gr); @@ -112,9 +117,15 @@ void fft_gpu_copy2buffer_in_2D(Dtype* map_out, const Dtype* map_in, CL_KERNEL_SELECT("copy2buffer_left_top_in_2d")); } int argIdx = 0; - kernel.arg(argIdx++, WrapHandle((cl_mem)map_out, &ctx)); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_map_out = clState.get_buffer_mem(map_out); + ClMemOff buf_map_in = clState.get_buffer_mem(map_in); + + kernel.arg(argIdx++, WrapHandle(buf_map_out.memobj, &ctx)); kernel.arg(argIdx++, offset_offset_map_out); - kernel.arg(argIdx++, WrapHandle((cl_mem)map_in, &ctx)); + kernel.arg(argIdx++, WrapHandle(buf_map_in.memobj, &ctx)); + kernel.arg(argIdx++, offset_offset_map_in); kernel.arg(argIdx++, map_out_size); kernel.arg(argIdx++, size); @@ -172,9 +183,14 @@ void fft_gpu_copy2buffer_out_forward_2D(Dtype* map_out, int out_offset, CL_KERNEL_SELECT("copy2buffer_left_top_out_2d")); } int argIdx = 0; - kernel.arg(argIdx++, WrapHandle((cl_mem)map_out, &ctx)); + + ClState& clState = Caffe::cl_state(); + ClMemOff buf_map_out = 
@@ -172,9 +183,14 @@ void fft_gpu_copy2buffer_out_forward_2D(Dtype* map_out, int out_offset,
       CL_KERNEL_SELECT("copy2buffer_left_top_out_2d"));
   }
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)map_out, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<Dtype> buf_map_out = clState.get_buffer_mem(map_out);
+  ClMemOff<Dtype> buf_map_in = clState.get_buffer_mem(map_in);
+
+  kernel.arg(argIdx++, WrapHandle(buf_map_out.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_map_out);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)map_in, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_map_in.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_map_in);
   kernel.arg(argIdx++, size);
   kernel.arg(argIdx++, count);
@@ -227,9 +243,15 @@ void fft_gpu_copy2buffer_out_backward(Dtype* map_out, const Dtype* map_in,
   viennacl::ocl::kernel &kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("copy2buffer_cyclic_shift_out"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)map_out, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<Dtype> buf_map_out = clState.get_buffer_mem(map_out);
+  ClMemOff<Dtype> buf_map_in = clState.get_buffer_mem(map_in);
+
+  kernel.arg(argIdx++, WrapHandle(buf_map_out.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_map_out);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)map_in, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_map_in.memobj, &ctx));
+
   kernel.arg(argIdx++, offset_offset_map_in);
   kernel.arg(argIdx++, width_out);
   kernel.arg(argIdx++, fft_height);
@@ -278,9 +300,15 @@ void fft_gpu_copy2buffer_out_backward_2D(Dtype* map_out, int out_offset,
   viennacl::ocl::kernel &kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("copy2buffer_cyclic_shift_out_2d"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)map_out, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<Dtype> buf_map_out = clState.get_buffer_mem(map_out);
+  ClMemOff<Dtype> buf_map_in = clState.get_buffer_mem(map_in);
+
+  kernel.arg(argIdx++, WrapHandle(buf_map_out.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_map_out);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)map_in, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_map_in.memobj, &ctx));
+
   kernel.arg(argIdx++, offset_offset_map_in);
   kernel.arg(argIdx++, map_out_size);
   kernel.arg(argIdx++, map_in_size);
@@ -330,11 +358,17 @@ void caffe_gpu_elementMulConj_1D(DtypeComplex<Dtype>* dst,
   viennacl::ocl::kernel kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("complex_conjugate_multiplication_1d"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1 = clState.get_buffer_mem(src1);
+  ClMemOff<DtypeComplex<Dtype> > buf_src2 = clState.get_buffer_mem(src2);
+
+  kernel.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_dst << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src1.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src1 << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src2, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src2.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src2 << 1);
   kernel.arg(argIdx++, ch_gr);
 #ifdef DEBUG_PROFILE
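A note on the `<< 1` shifts that recur in the complex kernels above and below: each complex value is an interleaved `(re, im)` pair of `Dtype`, so the shift most plausibly doubles an offset counted in complex elements into plain `Dtype` units before the kernel indexes scalar data. Roughly (names illustrative, not from the patch):

    size_t complex_off = offset_offset_dst;  // counted in DtypeComplex elements
    size_t scalar_off  = complex_off << 1;   // the same position in Dtype units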
@@ -370,8 +404,14 @@ void caffe_gpu_elementMulConj_Reshape(DtypeComplex<Dtype>* dst,
   viennacl::ocl::kernel kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("convert_data_to_channel_major"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1_vec, &ctx));
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_src1_vec = clState.get_buffer_mem(src1_vec);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1 = clState.get_buffer_mem(src1);
+
+  kernel.arg(argIdx++, WrapHandle(buf_src1_vec.memobj, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src1.memobj, &ctx));
+
   kernel.arg(argIdx++, map_size);
   kernel.arg(argIdx++, ch_gr);
 #ifdef DEBUG_PROFILE
@@ -389,9 +429,15 @@
   // Batched complex number dot product
   size_t global_work_size2[2] = { (size_t)map_size, (size_t)out_gr };
   argIdx = 0;
-  kernel_batchedCdotc.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
-  kernel_batchedCdotc.arg(argIdx++, WrapHandle((cl_mem)src1_vec, &ctx));
-  kernel_batchedCdotc.arg(argIdx++, WrapHandle((cl_mem)src2, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1_vec = clState.get_buffer_mem(src1_vec);
+  ClMemOff<DtypeComplex<Dtype> > buf_src2 = clState.get_buffer_mem(src2);
+
+  kernel_batchedCdotc.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
+  kernel_batchedCdotc.arg(argIdx++, WrapHandle(buf_src1_vec.memobj, &ctx));
+  kernel_batchedCdotc.arg(argIdx++, WrapHandle(buf_src2.memobj, &ctx));
   kernel_batchedCdotc.arg(argIdx++, map_size);
   kernel_batchedCdotc.arg(argIdx++, ch_gr);
   kernel_batchedCdotc.arg(argIdx++, out_gr);
@@ -430,11 +476,17 @@ void caffe_gpu_elementMulConj_2D(DtypeComplex<Dtype>* dst, int dst_offset,
   viennacl::ocl::kernel kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("complex_conjugate_multiplication_2d"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1 = clState.get_buffer_mem(src1);
+  ClMemOff<DtypeComplex<Dtype> > buf_src2 = clState.get_buffer_mem(src2);
+
+  kernel.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_dst << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src1.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src1 << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src2, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src2.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src2 << 1);
   kernel.arg(argIdx++, out_gr);
   kernel.arg(argIdx++, map_size >> 1);
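For readers without the kernel source in front of them: the `complex_conjugate_multiplication_*` family is the frequency-domain core of the FFT convolution path; per output map and frequency bin it accumulates `src1 * conj(src2)` over the `ch_gr` channels of a group. A plain C++ sketch of the intended semantics (a reference model under that assumption, not the kernel itself):

    #include <complex>
    // Reference model of one output frequency bin f: accumulate
    // src1[c][f] * conj(src2[c][f]) over the ch_gr channels of a group.
    std::complex<float> conj_mul_bin(const std::complex<float>* src1,
                                     const std::complex<float>* src2,
                                     int ch_gr, int map_size, int f) {
      std::complex<float> acc(0.0f, 0.0f);
      for (int c = 0; c < ch_gr; ++c)
        acc += src1[c * map_size + f] * std::conj(src2[c * map_size + f]);
      return acc;
    }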
@@ -493,13 +545,19 @@ void caffe_gpu_elementMulConj_2D_SLM(DtypeComplex<Dtype>* dst,
   viennacl::ocl::kernel kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("complex_conjugate_multiplication_2d_SLM"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1 = clState.get_buffer_mem(src1);
+  ClMemOff<DtypeComplex<Dtype> > buf_src2 = clState.get_buffer_mem(src2);
+
+  kernel.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_dst << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src1.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src1 << 1);
   kernel.arg(
       argIdx++, ch_gr * local_work_size_x * sizeof(Dtype) * 4);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src2, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src2.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src2 << 1);
   kernel.arg(argIdx++, out_gr);
   kernel.arg(argIdx++, map_float4_size);
@@ -538,11 +596,17 @@ void caffe_gpu_elementMulConj_3D(DtypeComplex<Dtype>* dst,
   viennacl::ocl::kernel kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("complex_conjugate_multiplication_3d"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1 = clState.get_buffer_mem(src1);
+  ClMemOff<DtypeComplex<Dtype> > buf_src2 = clState.get_buffer_mem(src2);
+
+  kernel.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_dst << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src1.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src1 << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src2, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src2.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src2 << 1);
   kernel.arg(argIdx++, out_gr);
   kernel.arg(argIdx++, map_size >> 1);
@@ -599,15 +663,21 @@ void caffe_gpu_elementMulConj_3D_SLM(DtypeComplex<Dtype>* dst,
   viennacl::ocl::kernel kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("complex_conjugate_multiplication_3d_SLM"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1 = clState.get_buffer_mem(src1);
+  ClMemOff<DtypeComplex<Dtype> > buf_src2 = clState.get_buffer_mem(src2);
+
+  kernel.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_dst << 1);
   kernel.arg(
       argIdx++, ch_gr * sizeof(Dtype) * 4);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src1.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src1 << 1);
   kernel.arg(
       argIdx++, ch_gr * local_work_size_x * sizeof(Dtype) * 4);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src2, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src2.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src2 << 1);
   kernel.arg(argIdx++, out_gr);
   kernel.arg(argIdx++, map_float4_size);
@@ -644,11 +714,18 @@ void caffe_gpu_elementMul_1D(DtypeComplex<Dtype>* dst,
   viennacl::ocl::kernel kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("complex_multiplication_1d"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1 = clState.get_buffer_mem(src1);
+  ClMemOff<DtypeComplex<Dtype> > buf_src2 = clState.get_buffer_mem(src2);
+
+  kernel.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_dst << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src1.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src1 << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src2, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src2.memobj, &ctx));
+
   kernel.arg(argIdx++, offset_offset_src2 << 1);
   kernel.arg(argIdx++, size >> 1);
   kernel.arg(argIdx++, ch_gr);
@@ -700,12 +777,18 @@ void caffe_gpu_elementMul_2D_SLM(DtypeComplex<Dtype>* dst,
   viennacl::ocl::kernel kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("complex_multiplication_2d_SLM"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1 = clState.get_buffer_mem(src1);
+  ClMemOff<DtypeComplex<Dtype> > buf_src2 = clState.get_buffer_mem(src2);
+
+  kernel.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_dst << 1);
   kernel.arg(argIdx++, local_mem_size_in_bytes);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src1.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src1 << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src2, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src2.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src2 << 1);
   kernel.arg(argIdx++, num_output);
   kernel.arg(argIdx++, map_size_in_dtype4);
@@ -744,11 +827,18 @@ void caffe_gpu_elementMul_3D(DtypeComplex<Dtype>* dst,
   viennacl::ocl::kernel kernel = ctx.get_kernel("kernel_program",
       CL_KERNEL_SELECT("complex_multiplication_3d"));
   int argIdx = 0;
-  kernel.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src1 = clState.get_buffer_mem(src1);
+  ClMemOff<DtypeComplex<Dtype> > buf_src2 = clState.get_buffer_mem(src2);
+
+  kernel.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_dst << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src1, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src1.memobj, &ctx));
   kernel.arg(argIdx++, offset_offset_src1 << 1);
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src2, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src2.memobj, &ctx));
+
   kernel.arg(argIdx++, offset_offset_src2 << 1);
   kernel.arg(argIdx++, size);
   kernel.arg(argIdx++, ch_gr);
@@ -852,8 +942,14 @@ void reshape_weights(DtypeComplex<Dtype>* dst, DtypeComplex<Dtype>* src,
       CL_KERNEL_SELECT("convert_weight_to_channel_major"));
   int argIdx = 0;
   size_t global_work_size[2] = { (size_t)size, (size_t)num_output };
-  kernel.arg(argIdx++, WrapHandle((cl_mem)dst, &ctx));
-  kernel.arg(argIdx++, WrapHandle((cl_mem)src, &ctx));
+
+  ClState& clState = Caffe::cl_state();
+  ClMemOff<DtypeComplex<Dtype> > buf_dst = clState.get_buffer_mem(dst);
+  ClMemOff<DtypeComplex<Dtype> > buf_src = clState.get_buffer_mem(src);
+
+  kernel.arg(argIdx++, WrapHandle(buf_dst.memobj, &ctx));
+  kernel.arg(argIdx++, WrapHandle(buf_src.memobj, &ctx));
+
   kernel.arg(argIdx++, size);
   kernel.arg(argIdx++, ch_gr);
   kernel.arg(argIdx++, num_output);
diff --git a/src/caffe/util/cl_state.cpp b/src/caffe/util/cl_state.cpp
new file mode 100644
index 00000000000..67cae26246d
--- /dev/null
+++ b/src/caffe/util/cl_state.cpp
@@ -0,0 +1,192 @@
+#include <CL/cl.h>
+
+#include <algorithm>
+#include <map>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/greentea/cl_kernels.hpp"
+#include "caffe/util/cl_state.hpp"
+
+using std::find;
+using std::endl;
+using std::map;
+using std::make_pair;
+using std::pair;
+using std::ostream;
+using std::string;
+using std::vector;
+
+namespace caffe {
+
+struct ClState::Impl {
+  explicit Impl() {
+  }
+
+  void initialize() {
+    free_mem_[NULL] = static_cast<size_t>(1) << (sizeof (size_t) * 8 - 1);
+  }
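+
+  // Bookkeeping: free_mem_ maps the END address of each free region to that
+  // region's size. initialize() seeds it with a single 2^63-byte region that
+  // ends at address 0 (it wraps, covering the top half of the address
+  // space), so the virtual pointers carved from it do not collide with real
+  // host allocations. get_memptr_from_freemem() allocates by slicing the
+  // requested size off the end of the best-fitting free region;
+  // destroy_buffer() returns a range keyed by its end address, and
+  // combine_free_mem() re-merges adjacent regions.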
+
+  void* create_buffer(int dev_id, cl_mem_flags flags, size_t size,
+      void* host_ptr, cl_int *errcode) {
+    viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_id);
+    cl_mem memobj = clCreateBuffer(ctx.handle().get(), flags,
+        size, host_ptr, errcode);
+    void* buffer = get_memptr_from_freemem(size);
+    memobjs_[buffer] = make_pair(memobj, size);
+    memdev_[memobj] = dev_id;
+
+    return buffer;
+  }
+
+  void* get_memptr_from_freemem(size_t size) {
+    map<void*, size_t>::iterator it = find_best_fit_free_mem(size);
+    void* memptr = static_cast<char*>(it->first) - size;
+
+    if (it->second > size)
+      free_mem_[memptr] = it->second - size;
+    free_mem_.erase(it);
+
+    return memptr;
+  }
+
+  map<void*, size_t>::iterator find_best_fit_free_mem(size_t size) {
+    map<void*, size_t>::iterator fit = free_mem_.end();
+    for (map<void*, size_t>::iterator it = free_mem_.begin();
+        it != free_mem_.end(); ++it) {
+      if (it->second >= size &&
+          (fit == free_mem_.end() || it->second < fit->second))
+        fit = it;
+    }
+
+    if (fit == free_mem_.end())
+      LOG(FATAL) << "Unable to find free memory";
+
+    return fit;
+  }
+
+  size_t get_buffer_size(const void* buffer) {
+    map<void*, pair<cl_mem, size_t> >::iterator it =
+        memobjs_.find(const_cast<void*>(buffer));
+
+    if (it == memobjs_.end()) {
+      LOG(FATAL) << "Invalid buffer object";
+    }
+    return it->second.second;
+  }
+
+  ClMemOff<uint8_t> get_buffer_mem(const void* ptr) {
+    const char* cptr = static_cast<const char*>(ptr);
+    for (map<void*, pair<cl_mem, size_t> >::iterator it = memobjs_.begin();
+        it != memobjs_.end(); ++it) {
+      const char* buffer = static_cast<const char*>(it->first);
+      cl_mem mem = it->second.first;
+      size_t size = it->second.second;
+
+      if (cptr >= buffer && static_cast<size_t>(cptr - buffer) < size)
+        return ClMemOff<uint8_t>{mem, static_cast<size_t>(cptr - buffer)};
+    }
+    return ClMemOff<uint8_t>{NULL, 0};
+  }
+
+  int get_mem_dev(cl_mem memobj) {
+    return memdev_[memobj];
+  }
+
+  void destroy_buffer(void* buffer) {
+    map<void*, pair<cl_mem, size_t> >::iterator it1 = memobjs_.find(buffer);
+    if (it1 == memobjs_.end())
+      LOG(FATAL) << "Invalid buffer";
+
+    cl_mem mem = it1->second.first;
+    size_t size = it1->second.second;
+    memobjs_.erase(it1);
+
+    map<cl_mem, int>::iterator it2 = memdev_.find(mem);
+    memdev_.erase(it2);
+
+    free_mem_[static_cast<char*>(buffer) + size] = size;
+
+    combine_free_mem();
+
+    clReleaseMemObject(mem);
+  }
+
+  void combine_free_mem() {
+    for (size_t prevSize = 0; free_mem_.size() != prevSize;) {
+      prevSize = free_mem_.size();
+
+      for (map<void*, size_t>::iterator it = free_mem_.begin();
+          it != free_mem_.end(); ++it) {
+        map<void*, size_t>::iterator it2 = it;
+        ++it2;
+
+        if (it2 == free_mem_.end())
+          break;
+
+        if (it->first == NULL) {
+          if (static_cast<char*>(it2->first) + it->second == NULL) {
+            it->second += it2->second;
+            free_mem_.erase(it2);
+            break;
+          }
+        } else if (static_cast<char*>(it->first) + it2->second == it2->first) {
+          it2->second += it->second;
+          free_mem_.erase(it);
+          break;
+        }
+      }
+    }
+  }
+
+
+  map<void*, pair<cl_mem, size_t> > memobjs_;
+  map<void*, size_t> free_mem_;
+  map<cl_mem, int> memdev_;
+};
+
+ClState::ClState() {
+  impl_ = new Impl();
+  impl_->initialize();
+}
+
+ClState::~ClState() {
+  if (impl_ != NULL)
+    delete impl_;
+}
+
+void* ClState::create_buffer(int dev_id, cl_mem_flags flags, size_t size,
+    void* host_ptr, cl_int *errcode) {
+  return impl_->create_buffer(dev_id, flags, size, host_ptr, errcode);
+}
+
+void ClState::destroy_buffer(void* buffer) {
+  impl_->destroy_buffer(buffer);
+}
+
+size_t ClState::get_buffer_size(const void* buffer) {
+  return impl_->get_buffer_size(buffer);
+}
+
+ClMemOff<uint8_t> ClState::get_buffer_mem(const void* ptr) {
+  return impl_->get_buffer_mem(ptr);
+}
+
+int ClState::get_mem_dev(cl_mem memobj) {
+  return impl_->get_mem_dev(memobj);
+}
+
+cl_mem ClState::create_subbuffer(const void* ptr, size_t offset,
+    cl_mem_flags flags) {
+  ClMemOff<uint8_t> buf = get_buffer_mem(ptr);
+  size_t size = get_buffer_size(static_cast<const char*>(ptr) - buf.offset);
+  cl_buffer_region bufReg = { offset, size - offset };
+  cl_int err;
+  cl_mem sub_buf = clCreateSubBuffer(buf.memobj, flags,
+      CL_BUFFER_CREATE_TYPE_REGION, &bufReg, &err);
+  return sub_buf;
+}
+}  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 6ab7062b542..72607210259 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -117,6 +117,9 @@ void caffe_copy(const int_tp N, const Dtype* X, Dtype* Y) {
     // NOLINT_NEXT_LINE(caffe/alt_fn)
     CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
 #endif  // USE_CUDA
+#ifdef USE_GREENTEA
+    caffe_gpu_memcpy(N * sizeof(Dtype), X, Y);
+#endif
 #else
     NO_GPU;
 #endif
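Taken together, this is what lets `caffe_copy` above issue one unified `caffe_gpu_memcpy` for the OCL backend: both code paths now traffic in plain pointers. A sketch of the full life cycle under the new API, assuming a templated `get_buffer_mem` overload in `cl_state.hpp` that converts byte offsets to element offsets (variable names are illustrative, not from the patch):

    cl_int err;
    ClState& state = Caffe::cl_state();
    // Allocate: the returned "pointer" is a virtual address, not host memory.
    void* base = state.create_buffer(dev_id, CL_MEM_READ_WRITE,
                                     1024 * sizeof(float), NULL, &err);
    // Ordinary pointer arithmetic works on the virtual address...
    float* x = static_cast<float*>(base) + 16;
    // ...and is undone when the pointer is resolved for a kernel or a copy:
    ClMemOff<float> m = state.get_buffer_mem(x);
    // m.memobj is base's cl_mem; m.offset records x's position within it.
    state.destroy_buffer(base);  // returns the virtual range to free_mem_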