Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/gg/flash-attn' into flash-deco…
Browse files Browse the repository at this point in the history
…ding
  • Loading branch information
FSSRepo committed Mar 6, 2024
2 parents c42f7cb + 8ad92dc commit 3ac0fd3
Show file tree
Hide file tree
Showing 7 changed files with 1,426 additions and 42 deletions.
195 changes: 194 additions & 1 deletion ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6863,8 +6863,143 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
}

<<<<<<< HEAD
template <bool vals_smem, int ncols_template, int block_size_template>
static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
=======
template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
static __global__ void soft_max_f16(const float * x, const half * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;

const int tid = threadIdx.x;
const int rowx = blockIdx.x;
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension

const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;

extern __shared__ half data_soft_max_f16[];
half * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
// (shared memory) buffer to cache values between iterations:
half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
// if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
// in that case col_smem == col_data must be enforced to avoid race conditions

half2 max_val = make_half2(-INFINITY, -INFINITY);

#pragma unroll
for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
const int col_smem = vals_smem ? col0 + tid : col_data;

const int ix = rowx*ncols_data + col_data;
const int iy = rowy*ncols_data + col_data;

half2 val;
if (need_check && col_data + 0 >= ncols_data) {
val.x = -INFINITY;
} else {
val.x = x[ix + 0]*scale + (y ? __half2float(y[iy + 0]) : 0.0f);
}
if (need_check && col_data + WARP_SIZE >= ncols_data) {
val.y = -INFINITY;
} else {
val.y = x[ix + WARP_SIZE]*scale + (y ? __half2float(y[iy + WARP_SIZE]) : 0.0f);
}
if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
vals[col_smem] = val;
}
max_val = __hmax2(max_val, val);
}

// find the max value in the block
max_val = warp_reduce_max(max_val);
if (block_size > WARP_SIZE) {
if (warp_id == 0) {
buf_iw[lane_id] = -INFINITY;
}
__syncthreads();

if (lane_id == 0) {
buf_iw[warp_id] = __hmax(max_val.x, max_val.y);
}
__syncthreads();

max_val = __half2half2(buf_iw[lane_id]);
max_val = warp_reduce_max(max_val);
} else {
max_val = __half2half2(__hmax(max_val.x, max_val.y));
}

half2 tmp = make_half2(0.0f, 0.0f); // partial sums

#pragma unroll
for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;

if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
break;
}

const half2 val = h2exp(vals[col_smem] - max_val);

tmp += val;
vals[col_smem] = val;
}

// find the sum of exps in the block
tmp = warp_reduce_sum(tmp);
if (block_size > WARP_SIZE) {
if (warp_id == 0) {
buf_iw[lane_id] = 0.0f;
}
__syncthreads();

if (lane_id == 0) {
buf_iw[warp_id] = tmp.x + tmp.y;
}
__syncthreads();

tmp = __half2half2(buf_iw[lane_id]);
tmp = warp_reduce_sum(tmp);
} else {
tmp = __half2half2(tmp.x + tmp.y);
}

const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp;

#pragma unroll
for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
const int col_smem = vals_smem ? col0 + tid : col_data;

const int idst = rowx*ncols_data + col_data;
const half2 result = vals[col_smem] * inv_sum;

if (need_check && col_data + 0 >= ncols_data) {
return;
}
dst[idst] = result.x;

if (need_check && col_data + WARP_SIZE >= ncols_data) {
return;
}

dst[idst + WARP_SIZE] = result.y;
}
#else
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
}

template <bool vals_smem, int ncols_template, int block_size_template>
static __global__ void soft_max_f32(const float * x, const half * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
>>>>>>> upstream/gg/flash-attn
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;

const int tid = threadIdx.x;
Expand Down Expand Up @@ -6906,8 +7041,12 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
const int ix = rowx*ncols + col;
const int iy = rowy*ncols + col;

<<<<<<< HEAD
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);

=======
const float val = x[ix]*scale + (y ? __half2float(y[iy]) : 0.0f);
>>>>>>> upstream/gg/flash-attn
vals[col] = val;
max_val = max(max_val, val);
}
Expand Down Expand Up @@ -8448,7 +8587,54 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
}

<<<<<<< HEAD
static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
=======
static void soft_max_f16_cuda(const float * x, const half * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
int nth = WARP_SIZE;
while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const dim3 block_dims(nth, 1, 1);
const dim3 block_nums(nrows_x, 1, 1);
const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half);
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
if (shmem <= g_device_caps[g_main_device].smpb) {
switch (ncols_x) {
case 32:
soft_max_f16<true, 32, 32, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
break;
case 64:
soft_max_f16<true, 64, 32, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
break;
case 128:
soft_max_f16<true, 128, 64, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
break;
case 256:
soft_max_f16<true, 256, 128, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
break;
case 512:
soft_max_f16<true, 512, 256, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
break;
case 1024:
soft_max_f16<true, 1024, 512, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
break;
case 2048:
soft_max_f16<true, 2048, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
break;
case 4096:
soft_max_f16<true, 4096, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
break;
default:
soft_max_f16<true, 0, 0, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
break;
}
} else {
const size_t shmem_low = WARP_SIZE*sizeof(half);
soft_max_f16<false, 0, 0, true><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
}
}

static void soft_max_f32_cuda(const float * x, const half * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
>>>>>>> upstream/gg/flash-attn
int nth = WARP_SIZE;
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const dim3 block_dims(nth, 1, 1);
Expand Down Expand Up @@ -9982,7 +10168,7 @@ static void ggml_cuda_op_soft_max(
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);

GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16); // src1 contains mask and it is optional

const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0);
Expand All @@ -9994,6 +10180,7 @@ static void ggml_cuda_op_soft_max(
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

<<<<<<< HEAD
// positions tensor
float * src2_dd = nullptr;
cuda_pool_alloc<float> src2_f;
Expand All @@ -10011,6 +10198,12 @@ static void ggml_cuda_op_soft_max(
src2_dd = src2_f.alloc(ggml_nelements(src2));
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
}
=======
if (use_f16_soft_max) {
soft_max_f16_cuda(src0_dd, src1 ? (const half *) src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
} else {
soft_max_f32_cuda(src0_dd, src1 ? (const half *) src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
>>>>>>> upstream/gg/flash-attn
}

soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
Expand Down
Loading

0 comments on commit 3ac0fd3

Please sign in to comment.