Ds-inference Int8 support through ZeroQuant technology #2217

Merged: 25 commits from ds-inference/ZeroQuant-Int8 into master on Aug 30, 2022
Changes from all commits
25 commits
cf2fe01
Fix the layer-past for GPT based models
Aug 8, 2022
c2cf304
add the Int8 support for ds-inference using ZeroQuant technology
Aug 13, 2022
d98f1f9
fixing some issue with loading checkpoint and bias-add
Aug 15, 2022
ebc82bb
adding the logic to store/restore scale for INT8 checkpoint
Aug 15, 2022
43a7023
add empty quantization scale for different models to run with fp16
Aug 15, 2022
00aa188
Empty-Commit
Aug 15, 2022
9bed645
Merge branch 'master' into ds-inference/ZeroQuant-Int8
RezaYazdaniAminabadi Aug 15, 2022
84e0d03
fix several issues after merging with master
Aug 18, 2022
f6cb028
several fixes for generating the INT8 sharded checkpoint
Aug 19, 2022
d47bea6
Merge branch 'master' into ds-inference/ZeroQuant-Int8
RezaYazdaniAminabadi Aug 19, 2022
cb72d9c
move quantizer declaration before inference branch
Aug 20, 2022
32b9322
Merge branch 'master' into ds-inference/ZeroQuant-Int8
RezaYazdaniAminabadi Aug 24, 2022
57779ef
fixing some part to catch up with latest update on HF side
Aug 24, 2022
f4e48e6
Merge branch 'ds-inference/ZeroQuant-Int8' of github.com:microsoft/De…
Aug 24, 2022
dbcb6ec
reducing the CPU memory usage when loading checkpoint (this solves th…
Aug 25, 2022
cd80ecc
some minor modification to the ckpt names
Aug 25, 2022
82a37d6
remove masking and some configuration changes
Aug 26, 2022
9d12656
remove dead code
Aug 26, 2022
4ae356e
Merge branch 'master' into ds-inference/ZeroQuant-Int8
jeffra Aug 26, 2022
d7ff364
Merge branch 'master' into ds-inference/ZeroQuant-Int8
RezaYazdaniAminabadi Aug 28, 2022
b17a3b5
fix some issue with int8 ckpt-loading
Aug 28, 2022
a541e52
Merge branch 'master' into ds-inference/ZeroQuant-Int8
RezaYazdaniAminabadi Aug 29, 2022
2845bad
Merge branch 'master' into ds-inference/ZeroQuant-Int8
RezaYazdaniAminabadi Aug 30, 2022
c77f5e0
Merge branch 'master' into ds-inference/ZeroQuant-Int8
RezaYazdaniAminabadi Aug 30, 2022
f3f4b1d
change the mp_size to tp_size at inference config & add some doc-stri…
Aug 30, 2022
87 changes: 87 additions & 0 deletions csrc/transformer/inference/csrc/dequantize.cu
@@ -108,3 +108,90 @@ template void launch_dequantize<__half>(__half*,
unsigned,
unsigned,
cudaStream_t);

__global__ void dequantize_kernel(float* output,
const int8_t* input,
const float* qscale,
int hidden_dim,
unsigned merge_hidden,
int cnt)
{
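// Empty stub: the FP32 dequantize path is not implemented in this kernel;
// the __half overload below carries the actual INT8 -> FP16 conversion.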
}

__global__ void dequantize_kernel(__half* output,
const int8_t* input,
const float* qscale,
unsigned hidden_dim,
unsigned merge_hidden,
int cnt)
{
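// Each thread reads one 32-bit word (4 packed int8 values), rescales every
// value with this block's group scale, and writes the result back as a
// float2 holding 4 __half values.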
unsigned bid = blockIdx.x * gridDim.y + blockIdx.y;
unsigned tid = threadIdx.x;

float local_scale = qscale[blockIdx.x];

const float* input_cast = reinterpret_cast<const float*>(input);
float2* output_cast = reinterpret_cast<float2*>(output);

input_cast += bid * merge_hidden;
output_cast += bid * merge_hidden;

for (int c = 0; c < cnt; c++) {
if (tid < merge_hidden) {
float q = input_cast[tid];
int8_t* q_int8 = (int8_t*)&q;

float2 q_f;
__half* q_h = (__half*)&q_f;

q_h[0] = __float2half(local_scale * (float)q_int8[0]);
q_h[1] = __float2half(local_scale * (float)q_int8[1]);
q_h[2] = __float2half(local_scale * (float)q_int8[2]);
q_h[3] = __float2half(local_scale * (float)q_int8[3]);
// q_h[4] = __float2half(local_scale * (float)q_int8[4]);
// q_h[5] = __float2half(local_scale * (float)q_int8[5]);
// q_h[6] = __float2half(local_scale * (float)q_int8[6]);
// q_h[7] = __float2half(local_scale * (float)q_int8[7]);
output_cast[tid] = q_f;
tid += blockDim.x;
}
}
}

template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
cudaStream_t stream)
{
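// hidden_dim becomes the number of 32-bit words per row (4 int8 values each);
// each block covers hid_cnt rows (or loops thd_cnt times per thread when a
// row is wider than one block), and grid.x spans the quantization groups.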
unsigned threads = 1024;
hidden_dim /= 4;
unsigned hid_cnt = threads / hidden_dim;
unsigned thd_cnt = (hidden_dim - 1) / threads + 1;
hid_cnt = hid_cnt > 0 ? hid_cnt : 1;

unsigned blocks = output_size / hid_cnt / groups;
dim3 block_dims(threads);
dim3 grid_dims(groups, blocks);

dequantize_kernel<<<grid_dims, block_dims, 0, stream>>>(
output, input, qscale, hidden_dim, hid_cnt * hidden_dim, thd_cnt);
}

template void launch_dequantize<float>(float*,
const int8_t*,
const float*,
unsigned,
unsigned,
unsigned,
cudaStream_t);
template void launch_dequantize<__half>(__half*,
const int8_t*,
const float*,
unsigned,
unsigned,
unsigned,
cudaStream_t);
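
For reference, below is a minimal host-side sketch of how the new launch_dequantize overload might be driven. It is not part of this PR; the buffer shapes are illustrative, and the reading of output_size as the total row count, hidden_dim as elements per row, and groups as the number of quantization groups (one scale each, rows split contiguously across groups) is inferred from the launch logic above.

// sketch_dequantize.cu -- illustrative only, not part of this PR.
// Link against DeepSpeed's inference csrc (dequantize.cu) to resolve launch_dequantize.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>

// Declaration matching the instantiation added in dequantize.cu.
template <typename T>
void launch_dequantize(T* output,
                       const int8_t* input,
                       const float* qscale,
                       unsigned output_size,
                       unsigned hidden_dim,
                       unsigned groups,
                       cudaStream_t stream);

int main()
{
    const unsigned rows = 4096;        // output_size: total rows of the weight
    const unsigned hidden_dim = 4096;  // elements per row (multiple of 4)
    const unsigned groups = 32;        // quantization groups, one scale per group

    int8_t* d_weight_int8;
    float* d_scales;
    __half* d_weight_fp16;
    cudaMalloc(&d_weight_int8, rows * hidden_dim * sizeof(int8_t));
    cudaMalloc(&d_scales, groups * sizeof(float));
    cudaMalloc(&d_weight_fp16, rows * hidden_dim * sizeof(__half));

    // ... copy the INT8 checkpoint weight and its per-group scales to the device here ...

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Expand the INT8 weight back to FP16 using the stored quantization scales.
    launch_dequantize<__half>(
        d_weight_fp16, d_weight_int8, d_scales, rows, hidden_dim, groups, stream);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    cudaFree(d_weight_int8);
    cudaFree(d_scales);
    cudaFree(d_weight_fp16);
    return 0;
}
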
1 change: 1 addition & 0 deletions csrc/transformer/inference/csrc/gelu.cu
@@ -1,5 +1,6 @@
#include "custom_cuda_layers.h"

namespace cg = cooperative_groups;
#define MAX_CAP 4
#define MAX_SEQ 2048
