diff --git a/lc0@exe/lc0@exe.log b/lc0@exe/lc0@exe.log new file mode 100644 index 0000000000..8a4dd03e76 --- /dev/null +++ b/lc0@exe/lc0@exe.log @@ -0,0 +1,2 @@ + layers.cc + lc0@exe.vcxproj -> C:\Ankan\git\ankan\lc0\build\.\lc0.exe diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index da986c69a3..7415423a32 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -25,6 +25,7 @@ Program grant you additional permission to convey the resulting work. */ +#include #include "cuda_common.h" #include "neural/network.h" @@ -209,10 +210,10 @@ __global__ void expandPlanes_kernel_Fp32_NCHW(float* output, const uint64_t* masks, const float* values, int n) { // Block size of 256, same mask/val for 64 consecutive threads. - constexpr int kNumShmemElments = 256 / 64; + constexpr int kNumShmemElements = 256 / 64; - __shared__ uint64_t shMasks[kNumShmemElments]; - __shared__ float shVals[kNumShmemElments]; + __shared__ uint64_t shMasks[kNumShmemElements]; + __shared__ float shVals[kNumShmemElements]; int index = threadIdx.x + blockDim.x * blockIdx.x; @@ -221,7 +222,7 @@ __global__ void expandPlanes_kernel_Fp32_NCHW(float* output, if (planeIndex >= n) return; // Load inputs to shared memory. - if (threadIdx.x < kNumShmemElments) { + if (threadIdx.x < kNumShmemElements) { shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x]; shVals[threadIdx.x] = values[planeIndex + threadIdx.x]; } @@ -281,10 +282,54 @@ void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks, ReportCUDAErrors(cudaGetLastError()); } -__global__ void globalScale_kernel(float* output, const float* input, - const float* scaleBias, - const float* prevLayerBias, int inputSize, - int C) { +__global__ void expandPlanes_kernel_Fp16_NCHW(half* output, + const uint64_t* masks, + const float* values, int n) { + // block size of 256, same mask/val for 64 consecutive threads + constexpr int kNumShmemElements = 256 / 64; + + __shared__ uint64_t shMasks[kNumShmemElements]; + __shared__ half shVals[kNumShmemElements]; + + int index = threadIdx.x + blockDim.x * blockIdx.x; + + int planeIndex = index >> 6; + + if (planeIndex >= n) return; + + // load inputs to shared memory + if (threadIdx.x < kNumShmemElements) { + shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x]; + shVals[threadIdx.x] = values[planeIndex + threadIdx.x]; + } + __syncthreads(); + + uint64_t mask = shMasks[threadIdx.x >> 6]; + + int sqIndex = index & 0x3F; + half op = 0; + + bool set = !!(mask & (1ull << sqIndex)); + if (set) { + op = (half)shVals[threadIdx.x >> 6]; + } + output[index] = op; +} + +void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks, + const float* values, int n) { + int threads = n * 8 * 8; // each thread writes a single element + const int blockSize = 256; + int blocks = DivUp(threads, blockSize); + expandPlanes_kernel_Fp16_NCHW<<>>(output, masks, values, + n); + ReportCUDAErrors(cudaGetLastError()); +} + +template +__global__ void globalScale_kernel(T* output, const T* input, + const T* scaleBias, const T* prevLayerBias, + int inputSize, int C) { const int kPlaneSize = 64; int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -299,7 +344,7 @@ __global__ void globalScale_kernel(float* output, const float* input, float val2 = output[tid]; // Skip connection to be added directly. if (prevLayerBias) { - val1 += prevLayerBias[c]; + val1 += (float)(prevLayerBias[c]); } int startIdx = n * 2 * C; // Scale and bias interleaved. 
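The new expandPlanes_Fp16_NCHW kernel above mirrors the existing fp32 NCHW path: each input plane arrives as one 64-bit occupancy mask plus one float value, and the kernel expands that into a dense 8x8 plane (the value where a bit is set, zero elsewhere). A minimal CPU sketch of that expansion, under the hypothetical name expandPlanesReferenceNCHW (illustration only, not part of the patch):

#include <cstdint>
#include <vector>

std::vector<float> expandPlanesReferenceNCHW(const std::vector<uint64_t>& masks,
                                             const std::vector<float>& values) {
  const int n = static_cast<int>(masks.size());  // number of planes
  std::vector<float> output(n * 64, 0.0f);       // 8x8 = 64 squares per plane
  for (int plane = 0; plane < n; plane++)
    for (int sq = 0; sq < 64; sq++)
      // A set bit means this square takes the plane's scalar value.
      if (masks[plane] & (1ull << sq))
        output[plane * 64 + sq] = values[plane];
  return output;
}

The GPU kernels do the same work with one thread per output element; the shared-memory staging (kNumShmemElements = 256 / 64 = 4 mask/value pairs per 256-thread block) only exists so that the 64 threads covering a plane do not all re-read the same mask from global memory.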
@@ -311,7 +356,7 @@ __global__ void globalScale_kernel(float* output, const float* input, float op = val1 * s + val2 + b; if (op < 0) op = 0; - output[tid] = op; + output[tid] = (T)op; } __global__ void globalScale_kernel_fp16_nhwc(half* output, const half* input, @@ -374,8 +419,9 @@ __global__ void globalAvgPool_kernel_NHWC_fp16(half* output, const half* input, } // Each thread reads 2 inputs (8x8/32), and each warp writes a single output. -__global__ void globalAvgPool_kernel(float* output, const float* input, - const float* prevLayerBias, int inputSize, +template +__global__ void globalAvgPool_kernel(T* output, const T* input, + const T* prevLayerBias, int inputSize, int outputSize, int C) { const int elementsPerWarp = 64; const int elementsPerThread = 2; @@ -391,7 +437,7 @@ __global__ void globalAvgPool_kernel(float* output, const float* input, #pragma unroll for (int i = 0; i < elementsPerWarp; i += 32) { int index = laneStartIndex + laneId + i; - if (index < inputSize) S += input[index]; + if (index < inputSize) S += (float)(input[index]); } // Compute warp wide sum (for entire plane - elementsPerWarp elements). @@ -406,19 +452,20 @@ __global__ void globalAvgPool_kernel(float* output, const float* input, // First thread in warp has the sum, write it in output. if (laneId == 0) { if (opIndex < outputSize) { - if (prevLayerBias) avg += prevLayerBias[opIndex % C]; - output[opIndex] = avg; + if (prevLayerBias) avg += (float)prevLayerBias[opIndex % C]; + output[opIndex] = (T)avg; } } } template void globalAvgPool(int N, int C, T* output, const T* input, - const T* prevLayerBias) { + const T* prevLayerBias, bool nhwc) { const int kPlaneSize = 64; const bool fp16 = std::is_same::value; - if (fp16) { + if (nhwc) { + assert(fp16); // For NHWC fp16, simply launch N blocks, each with C threads. globalAvgPool_kernel_NHWC_fp16<<>>((half*)output, (half*)input, (half*)prevLayerBias, @@ -433,8 +480,7 @@ void globalAvgPool(int N, int C, T* output, const T* input, const int kBlockSize = kWarpsPerBlock * 32; int blocks = DivUp(kTotalWarps, kWarpsPerBlock); - globalAvgPool_kernel<<>>((float*)output, (float*)input, - (float*)prevLayerBias, + globalAvgPool_kernel<<>>(output, input, prevLayerBias, N * C * kPlaneSize, N * C, C); } ReportCUDAErrors(cudaGetLastError()); @@ -442,21 +488,21 @@ void globalAvgPool(int N, int C, T* output, const T* input, template void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, - const T* prevLayerBias) { + const T* prevLayerBias, bool nhwc) { const bool fp16 = std::is_same::value; // Each thread writes one output. 
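// Note on the templated globalAvgPool_kernel above: one warp reduces one 8x8
// plane, so each of the 32 lanes first accumulates elementsPerThread = 2
// values and the partials are then folded with warp shuffles. A hedged sketch
// of that reduction idiom (warpPlaneSum is illustrative, not part of the
// patch; it assumes a full 32-lane warp, hence mask 0xffffffff):
__device__ float warpPlaneSum(float partial) {
  // After log2(32) = 5 halving steps, lane 0 holds the sum over all 32 lanes.
  for (int offset = 16; offset > 0; offset /= 2)
    partial += __shfl_down_sync(0xffffffffu, partial, offset);
  return partial;  // only lane 0's value is the full sum
}
// Lane 0 then divides by 64 to get the plane average and, as in the kernel
// above, optionally adds the previous layer's per-channel bias before writing.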
const int kBlockSize = 256; const int kBlocks = DivUp(N * 8 * 8 * C, kBlockSize); - if (fp16) { + if (nhwc) { + assert(fp16); globalScale_kernel_fp16_nhwc<<>>( (half*)output, (half*)input, (half*)scaleBias, (half*)prevLayerBias, N * C * 8 * 8, C, 8 * 8 * C); } else { globalScale_kernel<<>>( - (float*)output, (float*)input, (float*)scaleBias, (float*)prevLayerBias, - N * C * 8 * 8, C); + output, input, scaleBias, prevLayerBias, N * C * 8 * 8, C); } ReportCUDAErrors(cudaGetLastError()); } @@ -520,16 +566,16 @@ template void addBias_NCHW(half* c, half* a, half* b, int N, int C, int H, template void globalAvgPool(int N, int C, float* output, const float* input, - const float* prevLayerBias); + const float* prevLayerBias, bool nhwc); template void globalAvgPool(int N, int C, half* output, const half* input, - const half* prevLayerBias); + const half* prevLayerBias, bool nhwc); template void globalScale(int N, int C, float* output, const float* input, const float* scaleBias, - const float* prevLayerBias); + const float* prevLayerBias, bool nhwc); template void globalScale(int N, int C, half* output, const half* input, const half* scaleBias, - const half* prevLayerBias); + const half* prevLayerBias, bool nhwc); template void PolicyMap(int N, float* output, const float* input, const short* indices, int inputSize, diff --git a/src/neural/cuda/kernels.h b/src/neural/cuda/kernels.h index d98936d9ec..6fce88e31f 100644 --- a/src/neural/cuda/kernels.h +++ b/src/neural/cuda/kernels.h @@ -59,15 +59,18 @@ void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks, void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks, const float* values, int n); +void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks, + const float* values, int n); + // Perform global avg pool. template void globalAvgPool(int N, int C, T* output, const T* input, - const T* prevLayerBias); + const T* prevLayerBias, bool nhwc); // Perform global scale. template void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, - const T* prevLayerBias); + const T* prevLayerBias, bool nhwc); // Perform Squeeze-and-Excitation (SE) in a single fused kernel. // Returns false if the fused kernel can't handle the sizes. diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index 0678f068e1..a1f42b3151 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -24,23 +24,27 @@ terms of the respective license agreement, the licensors of this Program grant you additional permission to convey the resulting work. */ +#include "layers.h" #include #include #include #include "cuda_common.h" #include "kernels.h" -#include "layers.h" namespace lczero { namespace cudnn_backend { // Use Single kernel for entire SE operation. -// Right now supported only for fp16 and it's quite a bit faster +// Right now supported only for fp16 with nhwc and it's quite a bit faster // than using multiple passes. The flag can be set to false for debugging. 
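// Background for the nhwc_ flag that this patch threads through the layers:
// NCHW and NHWC differ only in how an (n, c, h, w) element is linearized.
// A small illustrative sketch (these helpers are not part of the codebase):
inline int offsetNCHW(int n, int c, int h, int w, int C, int H, int W) {
  return ((n * C + c) * H + h) * W + w;  // channels outermost within an image
}
inline int offsetNHWC(int n, int c, int h, int w, int C, int H, int W) {
  return ((n * H + h) * W + w) * C + c;  // channels fastest-varying
}
// NHWC is the layout the fp16 tensor-core convolutions prefer, which is why
// the backend enables it only for fp16 on GPUs with usable tensor cores and
// keeps NCHW everywhere else.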
static constexpr bool kUseFusedSELayer = true; +template +BaseLayer::BaseLayer(int c, int h, int w, BaseLayer* ip, bool nhwc) + : input_(ip), C(c), H(h), W(w), nhwc_(nhwc) {} + template BaseLayer::BaseLayer(int c, int h, int w, BaseLayer* ip) - : input_(ip), C(c), H(h), W(w) {} + : input_(ip), C(c), H(h), W(w), nhwc_(ip->nhwc_) {} template SoftMaxLayer::SoftMaxLayer(BaseLayer* ip) @@ -49,8 +53,7 @@ SoftMaxLayer::SoftMaxLayer(BaseLayer* ip) } template -SoftMaxLayer::~SoftMaxLayer() -{ +SoftMaxLayer::~SoftMaxLayer() { cudnnDestroyTensorDescriptor(out_tensor_desc_); } @@ -62,14 +65,15 @@ void SoftMaxLayer::Eval(int N, DataType* output, cublasHandle_t /*cublas*/) { float alpha = 1.0f, beta = 0.0f; + const cudnnDataType_t dataType = + std::is_same::value ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT; + + const cudnnTensorFormat_t layout = + nhwc_ ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; + // Need to call this at Eval as 'N' changes :-/ - if (std::is_same::value) { - cudnnSetTensor4dDescriptor(out_tensor_desc_, CUDNN_TENSOR_NHWC, - CUDNN_DATA_HALF, N, GetC(), GetH(), GetW()); - } else { - cudnnSetTensor4dDescriptor(out_tensor_desc_, CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, N, GetC(), GetH(), GetW()); - } + cudnnSetTensor4dDescriptor(out_tensor_desc_, layout, dataType, N, GetC(), + GetH(), GetW()); cudnnSoftmaxForward(cudnn, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, &alpha, out_tensor_desc_, @@ -77,21 +81,21 @@ void SoftMaxLayer::Eval(int N, DataType* output, } template -ConvLayer::ConvLayer(BaseLayer* ip, int C, int H, int W, - int filter, int Cin, bool relu, bool bias) - : BaseLayer(C, H, W, ip), - c_input_(Cin), - filter_size_(filter), - use_relu_(relu), - use_bias_(bias) { +void ConvLayer::init() { // Allocate memory for weights (filter tensor) and biases. - const size_t weight_size = sizeof(DataType) * Cin * C * filter_size_ * filter_size_; + const size_t weight_size = + sizeof(DataType) * c_input_ * C * filter_size_ * filter_size_; ReportCUDAErrors(cudaMalloc(&weights, weight_size)); const size_t blas_size = sizeof(DataType) * C; ReportCUDAErrors(cudaMalloc(&biases, blas_size)); const bool fp16 = std::is_same::value; + const cudnnDataType_t dataType = + std::is_same::value ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT; + + const cudnnTensorFormat_t layout = + nhwc_ ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; // Create cudnn objects for various tensors, algorithms, etc. cudnnCreateFilterDescriptor(&filter_desc_); @@ -101,29 +105,25 @@ ConvLayer::ConvLayer(BaseLayer* ip, int C, int H, int W, cudnnCreateTensorDescriptor(&bias_desc_); cudnnCreateActivationDescriptor(&activation_); - cudnnSetFilter4dDescriptor(filter_desc_, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, - fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - GetC(), Cin, filter_size_, filter_size_); + cudnnSetFilter4dDescriptor(filter_desc_, dataType, layout, GetC(), c_input_, + filter_size_, filter_size_); - ReportCUDNNErrors(cudnnSetTensor4dDescriptor( - bias_desc_, fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, 1, C, 1, 1)); + ReportCUDNNErrors( + cudnnSetTensor4dDescriptor(bias_desc_, layout, dataType, 1, C, 1, 1)); const int padding = filter_size_ / 2; const bool crossCorr = 1; ReportCUDNNErrors(cudnnSetConvolution2dDescriptor( conv_desc_, padding, padding, 1, 1, 1, 1, - crossCorr ? CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT)); + crossCorr ? 
CUDNN_CROSS_CORRELATION : CUDNN_CONVOLUTION, dataType)); - if (fp16) + if (fp16 && nhwc_) ReportCUDNNErrors( cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH)); // TODO: dynamic selection of algorithm! - if ((C > 32) && (!fp16) && (filter_size_ > 1)) { + if ((C > 32) && (!nhwc_) && (filter_size_ > 1)) { conv_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; } else { conv_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; @@ -141,6 +141,28 @@ ConvLayer::ConvLayer(BaseLayer* ip, int C, int H, int W, #endif } +template +ConvLayer::ConvLayer(BaseLayer* ip, int C, int H, int W, + int filter, int Cin, bool relu, bool bias) + : BaseLayer(C, H, W, ip), + c_input_(Cin), + filter_size_(filter), + use_relu_(relu), + use_bias_(bias) { + init(); +} + +template +ConvLayer::ConvLayer(bool nhwc, int C, int H, int W, int filter, + int Cin, bool relu, bool bias) + : BaseLayer(C, H, W, nullptr, nhwc), + c_input_(Cin), + filter_size_(filter), + use_relu_(relu), + use_bias_(bias) { + init(); +} + template <> void ConvLayer::LoadWeights(float* pfilter, float* pBias, void* scratch) { const size_t weight_size = @@ -152,8 +174,14 @@ void ConvLayer::LoadWeights(float* pfilter, float* pBias, void* scratch) { assert(scratch); ReportCUDAErrors( cudaMemcpy(scratch, pfilter, weight_size, cudaMemcpyHostToDevice)); - fp32NCHWtofp16NHWC((half*)weights, (float*)scratch, C, c_input_, C, c_input_, - filter_size_, filter_size_); + + if (nhwc_) { + fp32NCHWtofp16NHWC((half*)weights, (float*)scratch, C, c_input_, C, + c_input_, filter_size_, filter_size_); + } else { + copyTypeConverted((half*)weights, (float*)scratch, + C * c_input_ * filter_size_ * filter_size_); + } if (pBias) { ReportCUDAErrors( @@ -185,15 +213,17 @@ void ConvLayer::Eval(int N, DataType* output, const DataType* input, const DataType* input2, void* scratch, size_t scratch_size, cudnnHandle_t cudnn, cublasHandle_t /*cublas*/) { - const bool fp16 = std::is_same::value; + const cudnnDataType_t dataType = + std::is_same::value ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT; - ReportCUDNNErrors(cudnnSetTensor4dDescriptor( - out_tensor_desc_, fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, N, C, H, W)); + const cudnnTensorFormat_t layout = + nhwc_ ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; - ReportCUDNNErrors(cudnnSetTensor4dDescriptor( - in_tensor_desc_, fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, N, c_input_, H, W)); + ReportCUDNNErrors(cudnnSetTensor4dDescriptor(out_tensor_desc_, layout, + dataType, N, C, H, W)); + + ReportCUDNNErrors(cudnnSetTensor4dDescriptor(in_tensor_desc_, layout, + dataType, N, c_input_, H, W)); float alpha = 1.0f, beta = 0.0f; @@ -211,10 +241,10 @@ void ConvLayer::Eval(int N, DataType* output, const DataType* input, conv_desc_, conv_algo_, scratch, scratch_size, &alpha, out_tensor_desc_, input2, bias_desc_, biases, activation_, out_tensor_desc_, output)); } else { - // For some reason cudnn doesn't support just Convolution + Bias with fp32 + // For some reason cudnn doesn't support just Convolution + Bias with nchw // (winograd algorithm) it works fine when RELU is also needed which is // somewhat strange. 
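// The (!nhwc_ && !use_relu_) fallback below runs the plain convolution first
// and adds the bias in a separate step. For reference, the same two-step
// pattern expressed purely with stock cuDNN calls (a sketch only, not the
// patch's exact code; it reuses the descriptors set up in init() above):
float alpha = 1.0f, beta = 0.0f;
ReportCUDNNErrors(cudnnConvolutionForward(
    cudnn, &alpha, in_tensor_desc_, input, filter_desc_, weights, conv_desc_,
    conv_algo_, scratch, scratch_size, &beta, out_tensor_desc_, output));
// With beta = 1, cudnnAddTensor accumulates the (1, C, 1, 1) bias tensor into
// the convolution output, broadcasting it over N, H and W.
beta = 1.0f;
ReportCUDNNErrors(cudnnAddTensor(cudnn, &alpha, bias_desc_, biases, &beta,
                                 out_tensor_desc_, output));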
- if ((std::is_same::value) && (!use_relu_)) { + if ((!nhwc_) && (!use_relu_)) { ReportCUDNNErrors(cudnnConvolutionForward( cudnn, &alpha, in_tensor_desc_, input, filter_desc_, weights, conv_desc_, conv_algo_, scratch, scratch_size, &beta, @@ -265,47 +295,6 @@ ConvLayer::~ConvLayer() { cudnnDestroyActivationDescriptor(activation_); } -template -BNLayer::BNLayer(BaseLayer* ip, bool relu) - : BaseLayer(ip->GetC(), ip->GetH(), ip->GetW(), ip), - use_relu_(relu) { - const size_t weight_size = sizeof(float) * C; - - ReportCUDAErrors(cudaMalloc(&means_, weight_size)); - ReportCUDAErrors(cudaMalloc(&variances_, weight_size)); -} - -template -void BNLayer::LoadWeights(float* cpuMeans, float* cpuVar) { - const size_t weight_size = sizeof(float) * C; - ReportCUDAErrors( - cudaMemcpy(means_, cpuMeans, weight_size, cudaMemcpyHostToDevice)); - ReportCUDAErrors( - cudaMemcpy(variances_, cpuVar, weight_size, cudaMemcpyHostToDevice)); -} - -template <> -void BNLayer::Eval(int N, half* output, const half* input, - const half* input2, void* /*scratch*/, - size_t /*scratch_size*/, cudnnHandle_t /*cudnn*/, - cublasHandle_t /*cublas*/) { - batchNorm(output, input, input2, N, C, H, W, means_, variances_, use_relu_); -} - -template <> -void BNLayer::Eval(int N, float* output, const float* input, - const float* input2, void* /*scratch*/, - size_t /*scratch_size*/, cudnnHandle_t /*cudnn*/, - cublasHandle_t /*cublas*/) { - batchNorm(output, input, input2, N, C, H, W, means_, variances_, use_relu_); -} - -template -BNLayer::~BNLayer() { - ReportCUDAErrors(cudaFree(means_)); - ReportCUDAErrors(cudaFree(variances_)); -} - template SELayer::SELayer(BaseLayer* ip, int fc1Outputs, bool addPrevLayerBias) @@ -377,7 +366,7 @@ void SELayer::LoadWeights(float* w1, float* b1, float* w2, float* b2, std::vector temp(weight_size2); // Weight for the first FC layer. - if (kUseFusedSELayer) { + if (kUseFusedSELayer && nhwc_) { cpuTranspose(temp.data(), w1, numFc1Out_, C); ReportCUDAErrors( cudaMemcpy(scratch, temp.data(), weight_size1, cudaMemcpyHostToDevice)); @@ -388,7 +377,7 @@ void SELayer::LoadWeights(float* w1, float* b1, float* w2, float* b2, copyTypeConverted((half*)w1_, (float*)scratch, num_weights1); // Weight for the second FC layer. - if (kUseFusedSELayer) { + if (kUseFusedSELayer && nhwc_) { cpuTranspose(temp.data(), w2, 2 * C, numFc1Out_); ReportCUDAErrors( cudaMemcpy(scratch, temp.data(), weight_size2, cudaMemcpyHostToDevice)); @@ -427,7 +416,7 @@ void SELayer::Eval(int N, float* output, const float* input, // 1. Global avg pooling (also adds previous layer bias before computing // averages). - globalAvgPool(N, C, op2, input, bPrev_); + globalAvgPool(N, C, op2, input, bPrev_, false); // 2. First fully connected layer. float alpha = 1.0f, beta = 0.0f; @@ -445,7 +434,7 @@ void SELayer::Eval(int N, float* output, const float* input, // 4. (Optional prev layer bias add), Global scale, residual add, relu and // bias. - globalScale(N, C, output, input, op2, bPrev_); + globalScale(N, C, output, input, op2, bPrev_, false); } template <> @@ -453,10 +442,10 @@ void SELayer::Eval(int N, half* output, const half* input, const half* input2, void* scratch, size_t scratch_size, cudnnHandle_t /*cudnn*/, cublasHandle_t cublas) { bool se_done = false; - if (kUseFusedSELayer) { + if (kUseFusedSELayer && nhwc_) { se_done = Se_Fp16_NHWC(N, C, numFc1Out_, output, input2, input, w1_, b1_, w2_, b2_, bPrev_); - } + } if (!se_done) { assert(output == input2); // Ping-pong between 'op1' and 'op2' (parts of scratch memory). 
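The SE layer is the main consumer of the globalAvgPool/globalScale helpers, and both Eval specializations follow the numbered steps in the comments: pool each channel to a single value, run two small fully connected layers, then gate and re-bias the residual block's output. Below is a CPU sketch of that final gating step, assuming the standard squeeze-and-excitation formulation in which the second FC layer emits C scales (passed through a sigmoid) followed by C biases per sample; the function name and the omission of the optional previous-layer bias are mine:

#include <cmath>

void seGateReference(float* output, const float* input, const float* skip,
                     const float* fc2Out, int N, int C) {
  for (int n = 0; n < N; n++)
    for (int c = 0; c < C; c++) {
      float s = 1.0f / (1.0f + std::exp(-fc2Out[n * 2 * C + c]));  // sigmoid scale
      float b = fc2Out[n * 2 * C + C + c];                         // per-channel bias
      for (int i = 0; i < 64; i++) {
        int idx = (n * C + c) * 64 + i;
        float v = input[idx] * s + skip[idx] + b;  // scale, add residual, bias
        output[idx] = v < 0.0f ? 0.0f : v;         // ReLU
      }
    }
}

The fused Se_Fp16_NHWC path does all four steps in one kernel, which is why it is only attempted when kUseFusedSELayer is set and the layer is running the fp16 NHWC layout.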
@@ -465,7 +454,7 @@ void SELayer::Eval(int N, half* output, const half* input, // 1. Global avg pooling (also adds previous layer bias before computing // averages). - globalAvgPool(N, C, op2, input, bPrev_); + globalAvgPool(N, C, op2, input, bPrev_, nhwc_); // 2. First fully connected layer. __half_raw one_h{0x3C00}; @@ -486,7 +475,7 @@ void SELayer::Eval(int N, half* output, const half* input, // 4. (Optional prev layer bias add), Global scale, residual add, relu and // bias. - globalScale(N, C, output, input, op2, bPrev_); + globalScale(N, C, output, input, op2, bPrev_, nhwc_); } } @@ -498,9 +487,9 @@ FCLayer::FCLayer(BaseLayer* ip, int C, int H, int W, use_relu_(relu), use_tanh_(tanh), use_sigmoid_(sigmoid) { - const size_t weight_size = + const size_t weight_size = sizeof(DataType) * C * H * W * ip->GetC() * ip->GetH() * ip->GetW(); - const size_t blas_size = sizeof(DataType) * C * H * W; + const size_t blas_size = sizeof(DataType) * C * H * W; ReportCUDAErrors(cudaMalloc(&weights_, weight_size)); if (use_bias_) { ReportCUDAErrors(cudaMalloc(&biases_, blas_size)); @@ -523,9 +512,13 @@ void FCLayer::LoadWeights(float* cpuWeight, float* cpuBias, ReportCUDAErrors( cudaMemcpy(scratch, cpuWeight, weight_size, cudaMemcpyHostToDevice)); - fp32NCHWtofp16NHWC((half*)weights_, (float*)scratch, num_biases, - input_->GetC(), num_biases, input_->GetC(), input_->GetH(), - input_->GetW()); + if (nhwc_) { + fp32NCHWtofp16NHWC((half*)weights_, (float*)scratch, num_biases, + input_->GetC(), num_biases, input_->GetC(), + input_->GetH(), input_->GetW()); + } else { + copyTypeConverted((half*)weights_, (float*)scratch, num_weights); + } if (cpuBias) { ReportCUDAErrors( @@ -616,7 +609,7 @@ void PolicyMapLayer::LoadWeights(const short* cpuWeight, void* /*scratch*/) { size_t weight_size = sizeof(short) * used_size_; - if (std::is_same::value) { + if (nhwc_) { // convert CHW to HWC int C = used_size_ / 64; int Cin = this->input_->GetC(); @@ -712,9 +705,6 @@ template class ConvLayer; template class FCLayer; template class FCLayer; -template class BNLayer; -template class BNLayer; - template class SoftMaxLayer; template class SoftMaxLayer; diff --git a/src/neural/cuda/layers.h b/src/neural/cuda/layers.h index a470c23daa..a4d02a748e 100644 --- a/src/neural/cuda/layers.h +++ b/src/neural/cuda/layers.h @@ -44,6 +44,7 @@ class BaseLayer { int GetW() const { return W; } BaseLayer(int c, int h, int w, BaseLayer* ip); + BaseLayer(int c, int h, int w, BaseLayer* ip, bool nhwc); virtual ~BaseLayer() = default; size_t GetOutputSize(int N) const { return sizeof(DataType) * N * C * H * W; } @@ -58,6 +59,8 @@ class BaseLayer { int C; // Output tensor dimensions. 
int H; int W; + + bool nhwc_; // tensor layout }; template @@ -68,10 +71,15 @@ class ConvLayer : public BaseLayer { using BaseLayer::GetC; using BaseLayer::GetH; using BaseLayer::GetW; + using BaseLayer::nhwc_; public: ConvLayer(BaseLayer* ip, int C, int H, int W, int size, int Cin, bool relu = false, bool bias = false); + + ConvLayer(bool nhwc, int C, int H, int W, int size, int Cin, + bool relu = false, bool bias = false); + ~ConvLayer(); void LoadWeights(float* pfilter, float* pBias, void* scratch); void Eval(int N, DataType* output, const DataType* input, @@ -95,6 +103,8 @@ class ConvLayer : public BaseLayer { cudnnTensorDescriptor_t in_tensor_desc_; cudnnTensorDescriptor_t out_tensor_desc_; cudnnActivationDescriptor_t activation_; + + void init(); }; template @@ -102,6 +112,7 @@ class SoftMaxLayer : public BaseLayer { using BaseLayer::GetC; using BaseLayer::GetH; using BaseLayer::GetW; + using BaseLayer::nhwc_; public: SoftMaxLayer(BaseLayer* ip); @@ -114,30 +125,10 @@ class SoftMaxLayer : public BaseLayer { cudnnTensorDescriptor_t out_tensor_desc_; }; -template -class BNLayer : public BaseLayer { - using BaseLayer::C; - - public: - BNLayer(BaseLayer* ip, bool relu); - ~BNLayer(); - - void LoadWeights(float* cpuMeans, float* cpuVar); - void Eval(int N, DataType* output, const DataType* input, - const DataType* input2, void* scratch, size_t scratch_size, - cudnnHandle_t cudnn, cublasHandle_t cublas) override; - - private: - const bool use_relu_; - - // Weights for BN layer are always in float irrespective of DataType - // as there is not much point in converting these to fp16. - float* means_ = nullptr; - float* variances_ = nullptr; -}; - template class FCLayer : public BaseLayer { + using BaseLayer::nhwc_; + public: FCLayer(BaseLayer* ip, int C, int H, int W, bool relu, bool bias, bool tanh = false, bool sigmoid = false); @@ -159,6 +150,8 @@ class FCLayer : public BaseLayer { template class PolicyMapLayer: public BaseLayer { + using BaseLayer::nhwc_; + public: PolicyMapLayer(BaseLayer* ip, int C, int H, int W, int usedSize); ~PolicyMapLayer(); @@ -181,6 +174,7 @@ class PolicyMapLayer: public BaseLayer { template class SELayer : public BaseLayer { using BaseLayer::C; + using BaseLayer::nhwc_; public: SELayer(BaseLayer* ip, int numFc1Out, diff --git a/src/neural/cuda/network_cudnn.cc b/src/neural/cuda/network_cudnn.cc index 1bfb4c633b..d41cd50d75 100644 --- a/src/neural/cuda/network_cudnn.cc +++ b/src/neural/cuda/network_cudnn.cc @@ -46,6 +46,36 @@ using namespace cudnn_backend; static constexpr int kNumOutputPolicy = 1858; +#if 0 +// debug code to dump allocation in GPU memory +void dumpTensor(void *memory, int elements, char *message, bool fp16 = false) +{ + printf("\n%s\n", message); + int elementSize = (int) (fp16 ? sizeof(half) : sizeof(float)); + int bytes = elements * elementSize; + void *temp = malloc(bytes); + cudaMemcpy(temp, memory, bytes, cudaMemcpyDeviceToHost); + + for (int i = 0; i < elements; i++) + { + float val; + if (fp16) + { + half *arr = (half*)temp; + val = (float)arr[i]; + } + else + { + float *arr = (float *)temp; + val = arr[i]; + } + printf("%10.4f ", val); + } + free(temp); + printf("\n"); +} +#endif + struct InputsOutputs { InputsOutputs(int maxBatchSize) { ReportCUDAErrors(cudaHostAlloc( @@ -184,14 +214,35 @@ class CudnnNetwork : public Network { ReportCUDNNErrors(cudnnCreate(&cudnn_)); ReportCUBLASErrors(cublasCreate(&cublas_)); + // Default layout is nchw. + nhwc_ = false; + if (std::is_same::value) { - // Check if the GPU support fp16 (Volta+). 
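// The layers.h changes above give every layer an nhwc_ flag: only the first
// layer needs it passed explicitly (via the new ConvLayer constructor that
// takes no input-layer pointer); every later layer copies it from its input
// layer through BaseLayer's constructor. A hedged usage sketch, with a
// made-up filter count of 256:
ConvLayer<half> input_conv(/*nhwc=*/true, 256, 8, 8, /*filter=*/3,
                           kInputPlanes, /*relu=*/true, /*bias=*/true);
ConvLayer<half> conv1(&input_conv, 256, 8, 8, 3, 256, true, true);
// conv1 inherits nhwc_ == true from input_conv; no layout flag is repeated.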
- if (deviceProp.major >= 7) { - // Enable Tensor cores! - ReportCUBLASErrors(cublasSetMathMode(cublas_, CUBLAS_TENSOR_OP_MATH)); + // Check if the GPU support FP16. + + if (deviceProp.major == 6 && deviceProp.minor == 0) { + // FP16 without tensor cores supported on GP100 (SM 6.0) + // nhwc_ remains false. + } else if (deviceProp.major >= 7) { + // NHWC layout is faster with Tensor Cores. + // Supported on Volta and Turing (and hopefully future GPUs too). + + // Some GPUs (GTX 16xx) are SM 7.5 but don't have tensor cores + // enabling TENSOR_OP_MATH or nhwc_ layout for them works but is + // very very slow (likely because the system emulates it). + if (!strstr(deviceProp.name, "GTX 16")) { + nhwc_ = true; + } } else { throw Exception("Your GPU doesn't support FP16"); } + + // Override if forced from backend option + if (!options.IsDefault("nhwc")) + nhwc_ = options.Get("nhwc"); + + if (nhwc_) + ReportCUBLASErrors(cublasSetMathMode(cublas_, CUBLAS_TENSOR_OP_MATH)); } const int kNumInputPlanes = kInputPlanes; @@ -218,24 +269,26 @@ class CudnnNetwork : public Network { const int maxChannels = std::max(kInputPlanes, kNumFilters); - const bool fp16 = std::is_same::value; + const cudnnDataType_t datatype = std::is_same::value + ? CUDNN_DATA_HALF + : CUDNN_DATA_FLOAT; + const cudnnTensorFormat_t layout = + nhwc_ ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW; + ReportCUDNNErrors(cudnnSetFilter4dDescriptor( - wDesc, fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, - fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, maxChannels, maxChannels, - 3, 3)); + wDesc, datatype, layout, maxChannels, maxChannels, 3, 3)); ReportCUDNNErrors(cudnnSetTensor4dDescriptor( - xDesc, fp16 ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT, max_batch_size_, maxChannels, - 8, 8)); + xDesc, layout, datatype, max_batch_size_, maxChannels, 8, 8)); ReportCUDNNErrors(cudnnSetConvolution2dDescriptor( - convDesc, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, - fp16 ? CUDNN_DATA_HALF : CUDNN_DATA_FLOAT)); + convDesc, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, datatype)); - if (fp16) { - ReportCUDNNErrors( - cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH)); + // It will fall back to non-tensor math if not supported. + ReportCUDNNErrors( + cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH)); + + if (nhwc_) { conv_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } else { conv_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; @@ -259,7 +312,7 @@ class CudnnNetwork : public Network { // Input. 
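// The branching above derives the layout from the GPU's compute capability.
// Condensed into a hypothetical helper for clarity (not part of the patch;
// Exception is lc0's exception type, strstr comes from <cstring>):
bool chooseNhwcLayout(const cudaDeviceProp& prop, bool fp16) {
  if (!fp16) return false;  // fp32 always runs NCHW.
  if (prop.major == 6 && prop.minor == 0) return false;  // GP100: fp16, no tensor cores.
  if (prop.major >= 7) {
    // GTX 16xx parts report SM 7.5 but have no tensor cores; tensor-op math
    // and NHWC would only be emulated there and run very slowly.
    return strstr(prop.name, "GTX 16") == nullptr;
  }
  throw Exception("Your GPU doesn't support FP16");
}
// The result can still be overridden per run through the new "nhwc" backend
// option, e.g. (assuming lc0's usual backend-option syntax):
//   ./lc0 benchmark --backend=cudnn-fp16 --backend-opts=nhwc=false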
{ auto inputConv = std::make_unique>( - nullptr, kNumFilters, 8, 8, 3, kNumInputPlanes, true, true); + nhwc_, kNumFilters, 8, 8, 3, kNumInputPlanes, true, true); inputConv->LoadWeights(&weights.input.weights[0], &weights.input.biases[0], scratch_mem_); network_.emplace_back(std::move(inputConv)); @@ -411,14 +464,22 @@ class CudnnNetwork : public Network { uint64_t* ipDataMasks = io->input_masks_mem_gpu_; float* ipDataValues = io->input_val_mem_gpu_; - if (std::is_same::value) { - expandPlanes_Fp16_NHWC((half*)(tensor_mem_[0]), ipDataMasks, ipDataValues, - batchSize * kInputPlanes); + bool fp16 = std::is_same::value; + if (fp16) { + if (nhwc_) + expandPlanes_Fp16_NHWC((half*)(tensor_mem_[0]), ipDataMasks, + ipDataValues, batchSize * kInputPlanes); + else + expandPlanes_Fp16_NCHW((half*)(tensor_mem_[0]), ipDataMasks, + ipDataValues, batchSize * kInputPlanes); } else { expandPlanes_Fp32_NCHW((float*)(tensor_mem_[0]), ipDataMasks, ipDataValues, batchSize * kInputPlanes); } + // debug code example + // dumpTensor(tensor_mem_[0], 512, "After expand Planes", fp16); + float* opPol = io->op_policy_mem_gpu_; float* opVal = io->op_value_mem_gpu_; @@ -466,7 +527,7 @@ class CudnnNetwork : public Network { network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_); // pol FC - if (std::is_same::value) { + if (fp16) { // TODO: consider softmax layer that writes directly to fp32 network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, @@ -485,7 +546,7 @@ class CudnnNetwork : public Network { network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_); // pol FC - if (std::is_same::value) { + if (fp16) { // TODO: consider softmax layer that writes directly to fp32. network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, scratch_mem_, scratch_size_, cudnn_, @@ -519,7 +580,7 @@ class CudnnNetwork : public Network { cublas_); // value FC2 // VALUE // Value softmax - if (std::is_same::value) { + if (fp16) { // TODO: consider fusing the bias-add of FC2 with format conversion. network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, @@ -532,7 +593,7 @@ class CudnnNetwork : public Network { cublas_); // value FC2 // VALUE } } else { - if (std::is_same::value) { + if (fp16) { // TODO: consider fusing the bias-add of FC2 with format conversion. network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[1], nullptr, scratch_mem_, scratch_size_, cudnn_, @@ -617,6 +678,9 @@ class CudnnNetwork : public Network { int max_batch_size_; bool wdl_; + bool nhwc_; // do we want to use nhwc layout (fastest with fp16 with tensor + // cores) + // Currently only one NN Eval can happen a time (we can fix this if needed // by allocating more memory). mutable std::mutex lock_;
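Taken together, the backend now has three plane-expansion entry points, selected by precision and layout in the Eval path above. A condensed sketch of that dispatch (pickExpandPlanes is illustrative only; the real code branches inline exactly as the diff shows):

void pickExpandPlanes(bool fp16, bool nhwc, void* dst, const uint64_t* masks,
                      const float* values, int n) {
  if (fp16 && nhwc)
    expandPlanes_Fp16_NHWC((half*)dst, masks, values, n);
  else if (fp16)
    expandPlanes_Fp16_NCHW((half*)dst, masks, values, n);   // new in this change
  else
    expandPlanes_Fp32_NCHW((float*)dst, masks, values, n);  // fp32 stays NCHW
}

The rest of the forward pass is unchanged apart from caching the std::is_same result in a local fp16 flag and passing nhwc_ down to the layers, so fp16 inference can now run in NCHW on GP100 (SM 6.0) and on GTX 16xx cards, where the NHWC tensor-core path was previously either unavailable or very slow.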