diff --git a/lc0@exe/lc0@exe.log b/lc0@exe/lc0@exe.log new file mode 100644 index 0000000000..8a4dd03e76 --- /dev/null +++ b/lc0@exe/lc0@exe.log @@ -0,0 +1,2 @@ + layers.cc + lc0@exe.vcxproj -> C:\Ankan\git\ankan\lc0\build\.\lc0.exe diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index da986c69a3..7415423a32 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -25,6 +25,7 @@ Program grant you additional permission to convey the resulting work. */ +#include #include "cuda_common.h" #include "neural/network.h" @@ -209,10 +210,10 @@ __global__ void expandPlanes_kernel_Fp32_NCHW(float* output, const uint64_t* masks, const float* values, int n) { // Block size of 256, same mask/val for 64 consecutive threads. - constexpr int kNumShmemElments = 256 / 64; + constexpr int kNumShmemElements = 256 / 64; - __shared__ uint64_t shMasks[kNumShmemElments]; - __shared__ float shVals[kNumShmemElments]; + __shared__ uint64_t shMasks[kNumShmemElements]; + __shared__ float shVals[kNumShmemElements]; int index = threadIdx.x + blockDim.x * blockIdx.x; @@ -221,7 +222,7 @@ __global__ void expandPlanes_kernel_Fp32_NCHW(float* output, if (planeIndex >= n) return; // Load inputs to shared memory. - if (threadIdx.x < kNumShmemElments) { + if (threadIdx.x < kNumShmemElements) { shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x]; shVals[threadIdx.x] = values[planeIndex + threadIdx.x]; } @@ -281,10 +282,54 @@ void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks, ReportCUDAErrors(cudaGetLastError()); } -__global__ void globalScale_kernel(float* output, const float* input, - const float* scaleBias, - const float* prevLayerBias, int inputSize, - int C) { +__global__ void expandPlanes_kernel_Fp16_NCHW(half* output, + const uint64_t* masks, + const float* values, int n) { + // block size of 256, same mask/val for 64 consecutive threads + constexpr int kNumShmemElements = 256 / 64; + + __shared__ uint64_t shMasks[kNumShmemElements]; + __shared__ half shVals[kNumShmemElements]; + + int index = threadIdx.x + blockDim.x * blockIdx.x; + + int planeIndex = index >> 6; + + if (planeIndex >= n) return; + + // load inputs to shared memory + if (threadIdx.x < kNumShmemElements) { + shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x]; + shVals[threadIdx.x] = values[planeIndex + threadIdx.x]; + } + __syncthreads(); + + uint64_t mask = shMasks[threadIdx.x >> 6]; + + int sqIndex = index & 0x3F; + half op = 0; + + bool set = !!(mask & (1ull << sqIndex)); + if (set) { + op = (half)shVals[threadIdx.x >> 6]; + } + output[index] = op; +} + +void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks, + const float* values, int n) { + int threads = n * 8 * 8; // each thread writes a single element + const int blockSize = 256; + int blocks = DivUp(threads, blockSize); + expandPlanes_kernel_Fp16_NCHW<<>>(output, masks, values, + n); + ReportCUDAErrors(cudaGetLastError()); +} + +template +__global__ void globalScale_kernel(T* output, const T* input, + const T* scaleBias, const T* prevLayerBias, + int inputSize, int C) { const int kPlaneSize = 64; int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -299,7 +344,7 @@ __global__ void globalScale_kernel(float* output, const float* input, float val2 = output[tid]; // Skip connection to be added directly. if (prevLayerBias) { - val1 += prevLayerBias[c]; + val1 += (float)(prevLayerBias[c]); } int startIdx = n * 2 * C; // Scale and bias interleaved. 
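The new expandPlanes_Fp16_NCHW kernel above mirrors the existing fp32 NCHW path: each input plane arrives as one 64-bit occupancy mask plus one float value, and the kernel expands that into a dense 8x8 plane (the value where a bit is set, zero elsewhere). A minimal CPU sketch of that expansion, under the hypothetical name expandPlanesReferenceNCHW (illustration only, not part of the patch):

#include <cstdint>
#include <vector>

std::vector<float> expandPlanesReferenceNCHW(const std::vector<uint64_t>& masks,
                                             const std::vector<float>& values) {
  const int n = static_cast<int>(masks.size());  // number of planes
  std::vector<float> output(n * 64, 0.0f);       // 8x8 = 64 squares per plane
  for (int plane = 0; plane < n; plane++)
    for (int sq = 0; sq < 64; sq++)
      // A set bit means this square takes the plane's scalar value.
      if (masks[plane] & (1ull << sq))
        output[plane * 64 + sq] = values[plane];
  return output;
}

The GPU kernels do the same work with one thread per output element; the shared-memory staging (kNumShmemElements = 256 / 64 = 4 mask/value pairs per 256-thread block) only exists so that the 64 threads covering a plane do not all re-read the same mask from global memory.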
@@ -311,7 +356,7 @@ __global__ void globalScale_kernel(float* output, const float* input, float op = val1 * s + val2 + b; if (op < 0) op = 0; - output[tid] = op; + output[tid] = (T)op; } __global__ void globalScale_kernel_fp16_nhwc(half* output, const half* input, @@ -374,8 +419,9 @@ __global__ void globalAvgPool_kernel_NHWC_fp16(half* output, const half* input, } // Each thread reads 2 inputs (8x8/32), and each warp writes a single output. -__global__ void globalAvgPool_kernel(float* output, const float* input, - const float* prevLayerBias, int inputSize, +template +__global__ void globalAvgPool_kernel(T* output, const T* input, + const T* prevLayerBias, int inputSize, int outputSize, int C) { const int elementsPerWarp = 64; const int elementsPerThread = 2; @@ -391,7 +437,7 @@ __global__ void globalAvgPool_kernel(float* output, const float* input, #pragma unroll for (int i = 0; i < elementsPerWarp; i += 32) { int index = laneStartIndex + laneId + i; - if (index < inputSize) S += input[index]; + if (index < inputSize) S += (float)(input[index]); } // Compute warp wide sum (for entire plane - elementsPerWarp elements). @@ -406,19 +452,20 @@ __global__ void globalAvgPool_kernel(float* output, const float* input, // First thread in warp has the sum, write it in output. if (laneId == 0) { if (opIndex < outputSize) { - if (prevLayerBias) avg += prevLayerBias[opIndex % C]; - output[opIndex] = avg; + if (prevLayerBias) avg += (float)prevLayerBias[opIndex % C]; + output[opIndex] = (T)avg; } } } template void globalAvgPool(int N, int C, T* output, const T* input, - const T* prevLayerBias) { + const T* prevLayerBias, bool nhwc) { const int kPlaneSize = 64; const bool fp16 = std::is_same::value; - if (fp16) { + if (nhwc) { + assert(fp16); // For NHWC fp16, simply launch N blocks, each with C threads. globalAvgPool_kernel_NHWC_fp16<<>>((half*)output, (half*)input, (half*)prevLayerBias, @@ -433,8 +480,7 @@ void globalAvgPool(int N, int C, T* output, const T* input, const int kBlockSize = kWarpsPerBlock * 32; int blocks = DivUp(kTotalWarps, kWarpsPerBlock); - globalAvgPool_kernel<<>>((float*)output, (float*)input, - (float*)prevLayerBias, + globalAvgPool_kernel<<>>(output, input, prevLayerBias, N * C * kPlaneSize, N * C, C); } ReportCUDAErrors(cudaGetLastError()); @@ -442,21 +488,21 @@ void globalAvgPool(int N, int C, T* output, const T* input, template void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, - const T* prevLayerBias) { + const T* prevLayerBias, bool nhwc) { const bool fp16 = std::is_same::value; // Each thread writes one output. 
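// Note on the templated globalAvgPool_kernel above: one warp reduces one 8x8
// plane, so each of the 32 lanes first accumulates elementsPerThread = 2
// values and the partials are then folded with warp shuffles. A hedged sketch
// of that reduction idiom (warpPlaneSum is illustrative, not part of the
// patch; it assumes a full 32-lane warp, hence mask 0xffffffff):
__device__ float warpPlaneSum(float partial) {
  // After log2(32) = 5 halving steps, lane 0 holds the sum over all 32 lanes.
  for (int offset = 16; offset > 0; offset /= 2)
    partial += __shfl_down_sync(0xffffffffu, partial, offset);
  return partial;  // only lane 0's value is the full sum
}
// Lane 0 then divides by 64 to get the plane average and, as in the kernel
// above, optionally adds the previous layer's per-channel bias before writing.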
const int kBlockSize = 256; const int kBlocks = DivUp(N * 8 * 8 * C, kBlockSize); - if (fp16) { + if (nhwc) { + assert(fp16); globalScale_kernel_fp16_nhwc<<>>( (half*)output, (half*)input, (half*)scaleBias, (half*)prevLayerBias, N * C * 8 * 8, C, 8 * 8 * C); } else { globalScale_kernel<<>>( - (float*)output, (float*)input, (float*)scaleBias, (float*)prevLayerBias, - N * C * 8 * 8, C); + output, input, scaleBias, prevLayerBias, N * C * 8 * 8, C); } ReportCUDAErrors(cudaGetLastError()); } @@ -520,16 +566,16 @@ template void addBias_NCHW(half* c, half* a, half* b, int N, int C, int H, template void globalAvgPool(int N, int C, float* output, const float* input, - const float* prevLayerBias); + const float* prevLayerBias, bool nhwc); template void globalAvgPool(int N, int C, half* output, const half* input, - const half* prevLayerBias); + const half* prevLayerBias, bool nhwc); template void globalScale(int N, int C, float* output, const float* input, const float* scaleBias, - const float* prevLayerBias); + const float* prevLayerBias, bool nhwc); template void globalScale(int N, int C, half* output, const half* input, const half* scaleBias, - const half* prevLayerBias); + const half* prevLayerBias, bool nhwc); template void PolicyMap(int N, float* output, const float* input, const short* indices, int inputSize, diff --git a/src/neural/cuda/kernels.h b/src/neural/cuda/kernels.h index d98936d9ec..6fce88e31f 100644 --- a/src/neural/cuda/kernels.h +++ b/src/neural/cuda/kernels.h @@ -59,15 +59,18 @@ void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks, void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks, const float* values, int n); +void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks, + const float* values, int n); + // Perform global avg pool. template void globalAvgPool(int N, int C, T* output, const T* input, - const T* prevLayerBias); + const T* prevLayerBias, bool nhwc); // Perform global scale. template void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, - const T* prevLayerBias); + const T* prevLayerBias, bool nhwc); // Perform Squeeze-and-Excitation (SE) in a single fused kernel. // Returns false if the fused kernel can't handle the sizes. diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index 0678f068e1..a1f42b3151 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -24,23 +24,27 @@ terms of the respective license agreement, the licensors of this Program grant you additional permission to convey the resulting work. */ +#include "layers.h" #include #include #include #include "cuda_common.h" #include "kernels.h" -#include "layers.h" namespace lczero { namespace cudnn_backend { // Use Single kernel for entire SE operation. -// Right now supported only for fp16 and it's quite a bit faster +// Right now supported only for fp16 with nhwc and it's quite a bit faster // than using multiple passes. The flag can be set to false for debugging. 
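// Background for the nhwc_ flag that this patch threads through the layers:
// NCHW and NHWC differ only in how an (n, c, h, w) element is linearized.
// A small illustrative sketch (these helpers are not part of the codebase):
inline int offsetNCHW(int n, int c, int h, int w, int C, int H, int W) {
  return ((n * C + c) * H + h) * W + w;  // channels outermost within an image
}
inline int offsetNHWC(int n, int c, int h, int w, int C, int H, int W) {
  return ((n * H + h) * W + w) * C + c;  // channels fastest-varying
}
// NHWC is the layout the fp16 tensor-core convolutions prefer, which is why
// the backend enables it only for fp16 on GPUs with usable tensor cores and
// keeps NCHW everywhere else.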
static constexpr bool kUseFusedSELayer = true; +template +BaseLayer::BaseLayer(int c, int h, int w, BaseLayer* ip, bool nhwc) + : input_(ip), C(c), H(h), W(w), nhwc_(nhwc) {} + template BaseLayer::BaseLayer(int c, int h, int w, BaseLayer* ip) - : input_(ip), C(c), H(h), W(w) {} + : input_(ip), C(c), H(h), W(w), nhwc_(ip->nhwc_) {} template SoftMaxLayer::SoftMaxLayer(BaseLayer* ip) @@ -49,8 +53,7 @@ SoftMaxLayer::SoftMaxLayer(BaseLayer* ip) } template -SoftMaxLayer::~SoftMaxLayer() -{ +SoftMaxLayer::~SoftMaxLayer() { cudnnDestroyTensorDescriptor(out_tensor_desc_); } @@ -62,14 +65,15 @@ void SoftMaxLayer::Eval(int N, DataType* output, cublasHandle_t /*cublas*/) { float alpha = 1.0f, beta = 0.0f; + const cudnnDataType_t dataType = + std::is_same::value ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT; + + const cudnnTensorFormat_t layout = + nhwc_ ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; + // Need to call this at Eval as 'N' changes :-/ - if (std::is_same::value) { - cudnnSetTensor4dDescriptor(out_tensor_desc_, CUDNN_TENSOR_NHWC, - CUDNN_DATA_HALF, N, GetC(), GetH(), GetW()); - } else { - cudnnSetTensor4dDescriptor(out_tensor_desc_, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, N, GetC(), GetH(), GetW()); - } + cudnnSetTensor4dDescriptor(out_tensor_desc_, layout, dataType, N, GetC(), + GetH(), GetW()); cudnnSoftmaxForward(cudnn, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, &alpha, out_tensor_desc_, @@ -77,21 +81,21 @@ void SoftMaxLayer::Eval(int N, DataType* output, } template -ConvLayer::ConvLayer(BaseLayer* ip, int C, int H, int W, - int filter, int Cin, bool relu, bool bias) - : BaseLayer(C, H, W, ip), - c_input_(Cin), - filter_size_(filter), - use_relu_(relu), - use_bias_(bias) { +void ConvLayer::init() { // Allocate memory for weights (filter tensor) and biases. - const size_t weight_size = sizeof(DataType) * Cin * C * filter_size_ * filter_size_; + const size_t weight_size = + sizeof(DataType) * c_input_ * C * filter_size_ * filter_size_; ReportCUDAErrors(cudaMalloc(&weights, weight_size)); const size_t blas_size = sizeof(DataType) * C; ReportCUDAErrors(cudaMalloc(&biases, blas_size)); const bool fp16 = std::is_same::value; + const cudnnDataType_t dataType = + std::is_same::value ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT; + + const cudnnTensorFormat_t layout = + nhwc_ ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; // Create cudnn objects for various tensors, algorithms, etc. cudnnCreateFilterDescriptor(&filter_desc_); @@ -101,29 +105,25 @@ ConvLayer::ConvLayer(BaseLayer* ip, int C, int H, int W, cudnnCreateTensorDescriptor(&bias_desc_); cudnnCreateActivationDescriptor(&activation_); - cudnnSetFilter4dDescriptor(filter_desc_, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, - fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - GetC(), Cin, filter_size_, filter_size_); + cudnnSetFilter4dDescriptor(filter_desc_, dataType, layout, GetC(), c_input_, + filter_size_, filter_size_); - ReportCUDNNErrors(cudnnSetTensor4dDescriptor( - bias_desc_, fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, 1, C, 1, 1)); + ReportCUDNNErrors( + cudnnSetTensor4dDescriptor(bias_desc_, layout, dataType, 1, C, 1, 1)); const int padding = filter_size_ / 2; const bool crossCorr = 1; ReportCUDNNErrors(cudnnSetConvolution2dDescriptor( conv_desc_, padding, padding, 1, 1, 1, 1, - crossCorr ? CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT)); + crossCorr ? 
CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION, dataType)); - if (fp16) + if (fp16 && nhwc_) ReportCUDNNErrors( cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH)); // TODO: dynamic selection of algorithm! - if ((C > 32) && (!fp16) && (filter_size_ > 1)) { + if ((C > 32) && (!nhwc_) && (filter_size_ > 1)) { conv_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; } else { conv_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; @@ -141,6 +141,28 @@ ConvLayer::ConvLayer(BaseLayer* ip, int C, int H, int W, #endif } +template +ConvLayer::ConvLayer(BaseLayer* ip, int C, int H, int W, + int filter, int Cin, bool relu, bool bias) + : BaseLayer(C, H, W, ip), + c_input_(Cin), + filter_size_(filter), + use_relu_(relu), + use_bias_(bias) { + init(); +} + +template +ConvLayer::ConvLayer(bool nhwc, int C, int H, int W, int filter, + int Cin, bool relu, bool bias) + : BaseLayer(C, H, W, nullptr, nhwc), + c_input_(Cin), + filter_size_(filter), + use_relu_(relu), + use_bias_(bias) { + init(); +} + template <> void ConvLayer::LoadWeights(float* pfilter, float* pBias, void* scratch) { const size_t weight_size = @@ -152,8 +174,14 @@ void ConvLayer::LoadWeights(float* pfilter, float* pBias, void* scratch) { assert(scratch); ReportCUDAErrors( cudaMemcpy(scratch, pfilter, weight_size, cudaMemcpyHostToDevice)); - fp32NCHWtofp16NHWC((half*)weights, (float*)scratch, C, c_input_, C, c_input_, - filter_size_, filter_size_); + + if (nhwc_) { + fp32NCHWtofp16NHWC((half*)weights, (float*)scratch, C, c_input_, C, + c_input_, filter_size_, filter_size_); + } else { + copyTypeConverted((half*)weights, (float*)scratch, + C * c_input_ * filter_size_ * filter_size_); + } if (pBias) { ReportCUDAErrors( @@ -185,15 +213,17 @@ void ConvLayer::Eval(int N, DataType* output, const DataType* input, const DataType* input2, void* scratch, size_t scratch_size, cudnnHandle_t cudnn, cublasHandle_t /*cublas*/) { - const bool fp16 = std::is_same::value; + const cudnnDataType_t dataType = + std::is_same::value ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT; - ReportCUDNNErrors(cudnnSetTensor4dDescriptor( - out_tensor_desc_, fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, N, C, H, W)); + const cudnnTensorFormat_t layout = + nhwc_ ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; - ReportCUDNNErrors(cudnnSetTensor4dDescriptor( - in_tensor_desc_, fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, N, c_input_, H, W)); + ReportCUDNNErrors(cudnnSetTensor4dDescriptor(out_tensor_desc_, layout, + dataType, N, C, H, W)); + + ReportCUDNNErrors(cudnnSetTensor4dDescriptor(in_tensor_desc_, layout, + dataType, N, c_input_, H, W)); float alpha = 1.0f, beta = 0.0f; @@ -211,10 +241,10 @@ void ConvLayer::Eval(int N, DataType* output, const DataType* input, conv_desc_, conv_algo_, scratch, scratch_size, &alpha, out_tensor_desc_, input2, bias_desc_, biases, activation_, out_tensor_desc_, output)); } else { - // For some reason cudnn doesn't support just Convolution + Bias with fp32 + // For some reason cudnn doesn't support just Convolution + Bias with nchw // (winograd algorithm) it works fine when RELU is also needed which is // somewhat strange. 
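// The (!nhwc_ && !use_relu_) fallback below runs the plain convolution first
// and adds the bias in a separate step. For reference, the same two-step
// pattern expressed purely with stock cuDNN calls (a sketch only, not the
// patch's exact code; it reuses the descriptors set up in init() above):
float alpha = 1.0f, beta = 0.0f;
ReportCUDNNErrors(cudnnConvolutionForward(
    cudnn, &alpha, in_tensor_desc_, input, filter_desc_, weights, conv_desc_,
    conv_algo_, scratch, scratch_size, &beta, out_tensor_desc_, output));
// With beta = 1, cudnnAddTensor accumulates the (1, C, 1, 1) bias tensor into
// the convolution output, broadcasting it over N, H and W.
beta = 1.0f;
ReportCUDNNErrors(cudnnAddTensor(cudnn, &alpha, bias_desc_, biases, &beta,
                                 out_tensor_desc_, output));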
- if ((std::is_same::value) && (!use_relu_)) { + if ((!nhwc_) && (!use_relu_)) { ReportCUDNNErrors(cudnnConvolutionForward( cudnn, &alpha, in_tensor_desc_, input, filter_desc_, weights, conv_desc_, conv_algo_, scratch, scratch_size, &beta, @@ -265,47 +295,6 @@ ConvLayer::~ConvLayer() { cudnnDestroyActivationDescriptor(activation_); } -template -BNLayer::BNLayer(BaseLayer* ip, bool relu) - : BaseLayer(ip->GetC(), ip->GetH(), ip->GetW(), ip), - use_relu_(relu) { - const size_t weight_size = sizeof(float) * C; - - ReportCUDAErrors(cudaMalloc(&means_, weight_size)); - ReportCUDAErrors(cudaMalloc(&variances_, weight_size)); -} - -template -void BNLayer::LoadWeights(float* cpuMeans, float* cpuVar) { - const size_t weight_size = sizeof(float) * C; - ReportCUDAErrors( - cudaMemcpy(means_, cpuMeans, weight_size, cudaMemcpyHostToDevice)); - ReportCUDAErrors( - cudaMemcpy(variances_, cpuVar, weight_size, cudaMemcpyHostToDevice)); -} - -template <> -void BNLayer::Eval(int N, half* output, const half* input, - const half* input2, void* /*scratch*/, - size_t /*scratch_size*/, cudnnHandle_t /*cudnn*/, - cublasHandle_t /*cublas*/) { - batchNorm(output, input, input2, N, C, H, W, means_, variances_, use_relu_); -} - -template <> -void BNLayer::Eval(int N, float* output, const float* input, - const float* input2, void* /*scratch*/, - size_t /*scratch_size*/, cudnnHandle_t /*cudnn*/, - cublasHandle_t /*cublas*/) { - batchNorm(output, input, input2, N, C, H, W, means_, variances_, use_relu_); -} - -template -BNLayer::~BNLayer() { - ReportCUDAErrors(cudaFree(means_)); - ReportCUDAErrors(cudaFree(variances_)); -} - template SELayer::SELayer(BaseLayer* ip, int fc1Outputs, bool addPrevLayerBias) @@ -377,7 +366,7 @@ void SELayer::LoadWeights(float* w1, float* b1, float* w2, float* b2, std::vector temp(weight_size2); // Weight for the first FC layer. - if (kUseFusedSELayer) { + if (kUseFusedSELayer && nhwc_) { cpuTranspose(temp.data(), w1, numFc1Out_, C); ReportCUDAErrors( cudaMemcpy(scratch, temp.data(), weight_size1, cudaMemcpyHostToDevice)); @@ -388,7 +377,7 @@ void SELayer::LoadWeights(float* w1, float* b1, float* w2, float* b2, copyTypeConverted((half*)w1_, (float*)scratch, num_weights1); // Weight for the second FC layer. - if (kUseFusedSELayer) { + if (kUseFusedSELayer && nhwc_) { cpuTranspose(temp.data(), w2, 2 * C, numFc1Out_); ReportCUDAErrors( cudaMemcpy(scratch, temp.data(), weight_size2, cudaMemcpyHostToDevice)); @@ -427,7 +416,7 @@ void SELayer::Eval(int N, float* output, const float* input, // 1. Global avg pooling (also adds previous layer bias before computing // averages). - globalAvgPool(N, C, op2, input, bPrev_); + globalAvgPool(N, C, op2, input, bPrev_, false); // 2. First fully connected layer. float alpha = 1.0f, beta = 0.0f; @@ -445,7 +434,7 @@ void SELayer::Eval(int N, float* output, const float* input, // 4. (Optional prev layer bias add), Global scale, residual add, relu and // bias. - globalScale(N, C, output, input, op2, bPrev_); + globalScale(N, C, output, input, op2, bPrev_, false); } template <> @@ -453,10 +442,10 @@ void SELayer::Eval(int N, half* output, const half* input, const half* input2, void* scratch, size_t scratch_size, cudnnHandle_t /*cudnn*/, cublasHandle_t cublas) { bool se_done = false; - if (kUseFusedSELayer) { + if (kUseFusedSELayer && nhwc_) { se_done = Se_Fp16_NHWC(N, C, numFc1Out_, output, input2, input, w1_, b1_, w2_, b2_, bPrev_); - } + } if (!se_done) { assert(output == input2); // Ping-pong between 'op1' and 'op2' (parts of scratch memory). 
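The SE layer is the main consumer of the globalAvgPool/globalScale helpers, and both Eval specializations follow the numbered steps in the comments: pool each channel to a single value, run two small fully connected layers, then gate and re-bias the residual block's output. Below is a CPU sketch of that final gating step, assuming the standard squeeze-and-excitation formulation in which the second FC layer emits C scales (passed through a sigmoid) followed by C biases per sample; the function name and the omission of the optional previous-layer bias are mine:

#include <cmath>

void seGateReference(float* output, const float* input, const float* skip,
                     const float* fc2Out, int N, int C) {
  for (int n = 0; n < N; n++)
    for (int c = 0; c < C; c++) {
      float s = 1.0f / (1.0f + std::exp(-fc2Out[n * 2 * C + c]));  // sigmoid scale
      float b = fc2Out[n * 2 * C + C + c];                         // per-channel bias
      for (int i = 0; i < 64; i++) {
        int idx = (n * C + c) * 64 + i;
        float v = input[idx] * s + skip[idx] + b;  // scale, add residual, bias
        output[idx] = v < 0.0f ? 0.0f : v;         // ReLU
      }
    }
}

The fused Se_Fp16_NHWC path does all four steps in one kernel, which is why it is only attempted when kUseFusedSELayer is set and the layer is running the fp16 NHWC layout.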
@@ -465,7 +454,7 @@ void SELayer::Eval(int N, half* output, const half* input, // 1. Global avg pooling (also adds previous layer bias before computing // averages). - globalAvgPool(N, C, op2, input, bPrev_); + globalAvgPool(N, C, op2, input, bPrev_, nhwc_); // 2. First fully connected layer. __half_raw one_h{0x3C00}; @@ -486,7 +475,7 @@ void SELayer::Eval(int N, half* output, const half* input, // 4. (Optional prev layer bias add), Global scale, residual add, relu and // bias. - globalScale(N, C, output, input, op2, bPrev_); + globalScale(N, C, output, input, op2, bPrev_, nhwc_); } } @@ -498,9 +487,9 @@ FCLayer::FCLayer(BaseLayer* ip, int C, int H, int W, use_relu_(relu), use_tanh_(tanh), use_sigmoid_(sigmoid) { - const size_t weight_size = + const size_t weight_size = sizeof(DataType) * C * H * W * ip->GetC() * ip->GetH() * ip->GetW(); - const size_t blas_size = sizeof(DataType) * C * H * W; + const size_t blas_size = sizeof(DataType) * C * H * W; ReportCUDAErrors(cudaMalloc(&weights_, weight_size)); if (use_bias_) { ReportCUDAErrors(cudaMalloc(&biases_, blas_size)); @@ -523,9 +512,13 @@ void FCLayer::LoadWeights(float* cpuWeight, float* cpuBias, ReportCUDAErrors( cudaMemcpy(scratch, cpuWeight, weight_size, cudaMemcpyHostToDevice)); - fp32NCHWtofp16NHWC((half*)weights_, (float*)scratch, num_biases, - input_->GetC(), num_biases, input_->GetC(), input_->GetH(), - input_->GetW()); + if (nhwc_) { + fp32NCHWtofp16NHWC((half*)weights_, (float*)scratch, num_biases, + input_->GetC(), num_biases, input_->GetC(), + input_->GetH(), input_->GetW()); + } else { + copyTypeConverted((half*)weights_, (float*)scratch, num_weights); + } if (cpuBias) { ReportCUDAErrors( @@ -616,7 +609,7 @@ void PolicyMapLayer::LoadWeights(const short* cpuWeight, void* /*scratch*/) { size_t weight_size = sizeof(short) * used_size_; - if (std::is_same::value) { + if (nhwc_) { // convert CHW to HWC int C = used_size_ / 64; int Cin = this->input_->GetC(); @@ -712,9 +705,6 @@ template class ConvLayer; template class FCLayer; template class FCLayer; -template class BNLayer; -template class BNLayer; - template class SoftMaxLayer; template class SoftMaxLayer; diff --git a/src/neural/cuda/layers.h b/src/neural/cuda/layers.h index a470c23daa..a4d02a748e 100644 --- a/src/neural/cuda/layers.h +++ b/src/neural/cuda/layers.h @@ -44,6 +44,7 @@ class BaseLayer { int GetW() const { return W; } BaseLayer(int c, int h, int w, BaseLayer* ip); + BaseLayer(int c, int h, int w, BaseLayer* ip, bool nhwc); virtual ~BaseLayer() = default; size_t GetOutputSize(int N) const { return sizeof(DataType) * N * C * H * W; } @@ -58,6 +59,8 @@ class BaseLayer { int C; // Output tensor dimensions. 
int H; int W; + + bool nhwc_; // tensor layout }; template @@ -68,10 +71,15 @@ class ConvLayer : public BaseLayer { using BaseLayer::GetC; using BaseLayer::GetH; using BaseLayer::GetW; + using BaseLayer::nhwc_; public: ConvLayer(BaseLayer* ip, int C, int H, int W, int size, int Cin, bool relu = false, bool bias = false); + + ConvLayer(bool nhwc, int C, int H, int W, int size, int Cin, + bool relu = false, bool bias = false); + ~ConvLayer(); void LoadWeights(float* pfilter, float* pBias, void* scratch); void Eval(int N, DataType* output, const DataType* input, @@ -95,6 +103,8 @@ class ConvLayer : public BaseLayer { cudnnTensorDescriptor_t in_tensor_desc_; cudnnTensorDescriptor_t out_tensor_desc_; cudnnActivationDescriptor_t activation_; + + void init(); }; template @@ -102,6 +112,7 @@ class SoftMaxLayer : public BaseLayer { using BaseLayer::GetC; using BaseLayer::GetH; using BaseLayer::GetW; + using BaseLayer::nhwc_; public: SoftMaxLayer(BaseLayer* ip); @@ -114,30 +125,10 @@ class SoftMaxLayer : public BaseLayer { cudnnTensorDescriptor_t out_tensor_desc_; }; -template -class BNLayer : public BaseLayer { - using BaseLayer::C; - - public: - BNLayer(BaseLayer* ip, bool relu); - ~BNLayer(); - - void LoadWeights(float* cpuMeans, float* cpuVar); - void Eval(int N, DataType* output, const DataType* input, - const DataType* input2, void* scratch, size_t scratch_size, - cudnnHandle_t cudnn, cublasHandle_t cublas) override; - - private: - const bool use_relu_; - - // Weights for BN layer are always in float irrespective of DataType - // as there is not much point in converting these to fp16. - float* means_ = nullptr; - float* variances_ = nullptr; -}; - template class FCLayer : public BaseLayer { + using BaseLayer::nhwc_; + public: FCLayer(BaseLayer* ip, int C, int H, int W, bool relu, bool bias, bool tanh = false, bool sigmoid = false); @@ -159,6 +150,8 @@ class FCLayer : public BaseLayer { template class PolicyMapLayer: public BaseLayer { + using BaseLayer::nhwc_; + public: PolicyMapLayer(BaseLayer* ip, int C, int H, int W, int usedSize); ~PolicyMapLayer(); @@ -181,6 +174,7 @@ class PolicyMapLayer: public BaseLayer { template class SELayer : public BaseLayer { using BaseLayer::C; + using BaseLayer::nhwc_; public: SELayer(BaseLayer* ip, int numFc1Out, diff --git a/src/neural/cuda/network_cudnn.cc b/src/neural/cuda/network_cudnn.cc index 1bfb4c633b..d41cd50d75 100644 --- a/src/neural/cuda/network_cudnn.cc +++ b/src/neural/cuda/network_cudnn.cc @@ -46,6 +46,36 @@ using namespace cudnn_backend; static constexpr int kNumOutputPolicy = 1858; +#if 0 +// debug code to dump allocation in GPU memory +void dumpTensor(void *memory, int elements, char *message, bool fp16 = false) +{ + printf("\n%s\n", message); + int elementSize = (int) (fp16 ? sizeof(half) : sizeof(float)); + int bytes = elements * elementSize; + void *temp = malloc(bytes); + cudaMemcpy(temp, memory, bytes, cudaMemcpyDeviceToHost); + + for (int i = 0; i < elements; i++) + { + float val; + if (fp16) + { + half *arr = (half*)temp; + val = (float)arr[i]; + } + else + { + float *arr = (float *)temp; + val = arr[i]; + } + printf("%10.4f ", val); + } + free(temp); + printf("\n"); +} +#endif + struct InputsOutputs { InputsOutputs(int maxBatchSize) { ReportCUDAErrors(cudaHostAlloc( @@ -184,14 +214,35 @@ class CudnnNetwork : public Network { ReportCUDNNErrors(cudnnCreate(&cudnn_)); ReportCUBLASErrors(cublasCreate(&cublas_)); + // Default layout is nchw. + nhwc_ = false; + if (std::is_same::value) { - // Check if the GPU support fp16 (Volta+). 
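// The layers.h changes above give every layer an nhwc_ flag: only the first
// layer needs it passed explicitly (via the new ConvLayer constructor that
// takes no input-layer pointer); every later layer copies it from its input
// layer through BaseLayer's constructor. A hedged usage sketch, with a
// made-up filter count of 256:
ConvLayer<half> input_conv(/*nhwc=*/true, 256, 8, 8, /*filter=*/3,
                           kInputPlanes, /*relu=*/true, /*bias=*/true);
ConvLayer<half> conv1(&input_conv, 256, 8, 8, 3, 256, true, true);
// conv1 inherits nhwc_ == true from input_conv; no layout flag is repeated.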
- if (deviceProp.major >= 7) { - // Enable Tensor cores! - ReportCUBLASErrors(cublasSetMathMode(cublas_, CUBLAS_TENSOR_OP_MATH)); + // Check if the GPU support FP16. + + if (deviceProp.major == 6 && deviceProp.minor == 0) { + // FP16 without tensor cores supported on GP100 (SM 6.0) + // nhwc_ remains false. + } else if (deviceProp.major >= 7) { + // NHWC layout is faster with Tensor Cores. + // Supported on Volta and Turing (and hopefully future GPUs too). + + // Some GPUs (GTX 16xx) are SM 7.5 but don't have tensor cores + // enabling TENSOR_OP_MATH or nhwc_ layout for them works but is + // very very slow (likely because the system emulates it). + if (!strstr(deviceProp.name, "GTX 16")) { + nhwc_ = true; + } } else { throw Exception("Your GPU doesn't support FP16"); } + + // Override if forced from backend option + if (!options.IsDefault("nhwc")) + nhwc_ = options.Get("nhwc"); + + if (nhwc_) + ReportCUBLASErrors(cublasSetMathMode(cublas_, CUBLAS_TENSOR_OP_MATH)); } const int kNumInputPlanes = kInputPlanes; @@ -218,24 +269,26 @@ class CudnnNetwork : public Network { const int maxChannels = std::max(kInputPlanes, kNumFilters); - const bool fp16 = std::is_same::value; + const cudnnDataType_t datatype = std::is_same::value + ? CUDNN_DATA_HALF + : CUDNN_DATA_FLOAT; + const cudnnTensorFormat_t layout = + nhwc_ ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; + ReportCUDNNErrors(cudnnSetFilter4dDescriptor( - wDesc, fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, - fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, maxChannels, maxChannels, - 3, 3)); + wDesc, datatype, layout, maxChannels, maxChannels, 3, 3)); ReportCUDNNErrors(cudnnSetTensor4dDescriptor( - xDesc, fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, max_batch_size_, maxChannels, - 8, 8)); + xDesc, layout, datatype, max_batch_size_, maxChannels, 8, 8)); ReportCUDNNErrors(cudnnSetConvolution2dDescriptor( - convDesc, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT)); + convDesc, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, datatype)); - if (fp16) { - ReportCUDNNErrors( - cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH)); + // It will fall back to non-tensor math if not supported. + ReportCUDNNErrors( + cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH)); + + if (nhwc_) { conv_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } else { conv_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; @@ -259,7 +312,7 @@ class CudnnNetwork : public Network { // Input. 
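// The branching above derives the layout from the GPU's compute capability.
// Condensed into a hypothetical helper for clarity (not part of the patch;
// Exception is lc0's exception type, strstr comes from <cstring>):
bool chooseNhwcLayout(const cudaDeviceProp& prop, bool fp16) {
  if (!fp16) return false;  // fp32 always runs NCHW.
  if (prop.major == 6 && prop.minor == 0) return false;  // GP100: fp16, no tensor cores.
  if (prop.major >= 7) {
    // GTX 16xx parts report SM 7.5 but have no tensor cores; tensor-op math
    // and NHWC would only be emulated there and run very slowly.
    return strstr(prop.name, "GTX 16") == nullptr;
  }
  throw Exception("Your GPU doesn't support FP16");
}
// The result can still be overridden per run through the new "nhwc" backend
// option, e.g. (assuming lc0's usual backend-option syntax):
//   ./lc0 benchmark --backend=cudnn-fp16 --backend-opts=nhwc=false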
{ auto inputConv = std::make_unique>( - nullptr, kNumFilters, 8, 8, 3, kNumInputPlanes, true, true); + nhwc_, kNumFilters, 8, 8, 3, kNumInputPlanes, true, true); inputConv->LoadWeights(&weights.input.weights[0], &weights.input.biases[0], scratch_mem_); network_.emplace_back(std::move(inputConv)); @@ -411,14 +464,22 @@ class CudnnNetwork : public Network { uint64_t* ipDataMasks = io->input_masks_mem_gpu_; float* ipDataValues = io->input_val_mem_gpu_; - if (std::is_same::value) { - expandPlanes_Fp16_NHWC((half*)(tensor_mem_[0]), ipDataMasks, ipDataValues, - batchSize * kInputPlanes); + bool fp16 = std::is_same::value; + if (fp16) { + if (nhwc_) + expandPlanes_Fp16_NHWC((half*)(tensor_mem_[0]), ipDataMasks, + ipDataValues, batchSize * kInputPlanes); + else + expandPlanes_Fp16_NCHW((half*)(tensor_mem_[0]), ipDataMasks, + ipDataValues, batchSize * kInputPlanes); } else { expandPlanes_Fp32_NCHW((float*)(tensor_mem_[0]), ipDataMasks, ipDataValues, batchSize * kInputPlanes); } + // debug code example + // dumpTensor(tensor_mem_[0], 512, "After expand Planes", fp16); + float* opPol = io->op_policy_mem_gpu_; float* opVal = io->op_value_mem_gpu_; @@ -466,7 +527,7 @@ class CudnnNetwork : public Network { network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_); // pol FC - if (std::is_same::value) { + if (fp16) { // TODO: consider softmax layer that writes directly to fp32 network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, @@ -485,7 +546,7 @@ class CudnnNetwork : public Network { network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_); // pol FC - if (std::is_same::value) { + if (fp16) { // TODO: consider softmax layer that writes directly to fp32. network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, scratch_mem_, scratch_size_, cudnn_, @@ -519,7 +580,7 @@ class CudnnNetwork : public Network { cublas_); // value FC2 // VALUE // Value softmax - if (std::is_same::value) { + if (fp16) { // TODO: consider fusing the bias-add of FC2 with format conversion. network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, @@ -532,7 +593,7 @@ class CudnnNetwork : public Network { cublas_); // value FC2 // VALUE } } else { - if (std::is_same::value) { + if (fp16) { // TODO: consider fusing the bias-add of FC2 with format conversion. network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[1], nullptr, scratch_mem_, scratch_size_, cudnn_, @@ -617,6 +678,9 @@ class CudnnNetwork : public Network { int max_batch_size_; bool wdl_; + bool nhwc_; // do we want to use nhwc layout (fastest with fp16 with tensor + // cores) + // Currently only one NN Eval can happen a time (we can fix this if needed // by allocating more memory). mutable std::mutex lock_;
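Taken together, the backend now has three plane-expansion entry points, selected by precision and layout in the Eval path above. A condensed sketch of that dispatch (pickExpandPlanes is illustrative only; the real code branches inline exactly as the diff shows):

void pickExpandPlanes(bool fp16, bool nhwc, void* dst, const uint64_t* masks,
                      const float* values, int n) {
  if (fp16 && nhwc)
    expandPlanes_Fp16_NHWC((half*)dst, masks, values, n);
  else if (fp16)
    expandPlanes_Fp16_NCHW((half*)dst, masks, values, n);   // new in this change
  else
    expandPlanes_Fp32_NCHW((float*)dst, masks, values, n);  // fp32 stays NCHW
}

The rest of the forward pass is unchanged apart from caching the std::is_same result in a local fp16 flag and passing nhwc_ down to the layers, so fp16 inference can now run in NCHW on GP100 (SM 6.0) and on GTX 16xx cards, where the NHWC tensor-core path was previously either unavailable or very slow.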