Skip to content

Commit

Permalink
Fixed out-of-bounds memory accesses
Browse files Browse the repository at this point in the history
  • Loading branch information
JohannesGaessler committed Jul 15, 2023
1 parent bca8a68 commit d8e2697
Showing 1 changed file with 24 additions and 15 deletions.
39 changes: 24 additions & 15 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1232,19 +1232,23 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
v.y = x[ib + iqs + 1];
}

static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
const int ix = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= k) {
if (ix >= kx_padded) {
return;
}

const int iy = blockDim.y*blockIdx.y + threadIdx.y;

const int i_padded = iy*kx_padded + ix;

block_q8_1 * y = (block_q8_1 *) vy;

const int ib = i / QK8_1; // block index
const int iqs = i % QK8_1; // quant index
const int ib = i_padded / QK8_1; // block index
const int iqs = i_padded % QK8_1; // quant index

const float xi = i < ndata ? x[i] : 0.0f;
const float xi = ix < kx_padded ? x[iy*kx + ix] : 0.0f;
float amax = fabsf(xi);
float sum = xi;

Expand Down Expand Up @@ -1779,12 +1783,14 @@ static __global__ void mul_mat_q(
const int iqsy = sizeof(int) * (tid_x % QI8_1);

for (int i = 0; i < WARP_SIZE; i += 8) {
const block_q8_1 * __restrict__ by0 = &y[(col_y_0 + tid_y + i)*blocks_per_row + ib0 + iby0];
const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses

const block_q8_1 * __restrict__ by0 = &y[col_y_eff*blocks_per_row + ib0 + iby0];

tile_y_qs[(tid_y + i) * (2*WARP_SIZE) + tid_x] = *((int *) &by0->qs[iqsy]);
tile_y_ds[(tid_y + i) * (2*WARP_SIZE/QI8_1) + iby0] = by0->ds;

const block_q8_1 * __restrict__ by1 = &y[(col_y_0 + tid_y + i)*blocks_per_row + ib0 + iby1];
const block_q8_1 * __restrict__ by1 = &y[col_y_eff*blocks_per_row + ib0 + iby1];

tile_y_qs[(tid_y + i) * (2*WARP_SIZE) + tid_x + WARP_SIZE] = *((int *) &by1->qs[iqsy]);
tile_y_ds[(tid_y + i) * (2*WARP_SIZE/QI8_1) + iby1] = by1->ds;
Expand Down Expand Up @@ -2215,9 +2221,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
}

static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, ny, 1);
const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
}

static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
Expand Down Expand Up @@ -2962,6 +2970,7 @@ inline void ggml_cuda_op_mul_mat_q(

const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0);

const int64_t ne0 = dst->ne[0];

Expand All @@ -2974,11 +2983,11 @@ inline void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;

int64_t padded_row_size = ne10*ne11 + MATRIX_ROW_PADDING - 1;
int64_t padded_row_size = ne10 + MATRIX_ROW_PADDING - 1;
padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
size_t as;
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10*ne11, padded_row_size, cudaStream_main);
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);

switch (src0->type) {
case GGML_TYPE_Q4_0:
Expand Down Expand Up @@ -3042,7 +3051,7 @@ inline void ggml_cuda_op_mul_mat_vec(
padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
size_t as;
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);

switch (src0->type) {
case GGML_TYPE_Q4_0:
Expand Down

0 comments on commit d8e2697

Please sign in to comment.