Skip to content

Commit

Permalink
add debug
Browse files Browse the repository at this point in the history
  • Loading branch information
FSSRepo committed Apr 12, 2024
1 parent 53eade5 commit b24e7f7
Showing 1 changed file with 55 additions and 15 deletions.
70 changes: 55 additions & 15 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2201,6 +2201,30 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
// Cumulative per-op wall time in microseconds, indexed by ggml_op
// (accumulated as ggml_time_us() deltas after each op completes).
static long times[GGML_OP_COUNT];
// Number of times each op has been executed, indexed by ggml_op.
static long count[GGML_OP_COUNT];

// Index of the flash-attn op currently being traced; incremented per
// GGML_OP_FLASH_ATTN_EXT and wrapped modulo 31 at graph end —
// presumably one per transformer layer (TODO confirm layer count).
static int layer = 0;
// Debug-dump gate: stays false until the first "result_output" tensor is
// seen (i.e. one full graph evaluation), then flash-attn dumps are printed.
static bool compute = false;

// Debug helper: print a tensor's first three dims and a sample of its data
// (first 8 and last 8 elements) after copying it back from device memory.
// Only GGML_TYPE_F32 is decoded as float; every other type is read as half
// — NOTE(review): wrong for quantized types, acceptable for this debug path.
static void print_gpu_tensor(ggml_tensor* tensor, const char* name) {
    // ne[] is int64_t in ggml — %d here is undefined behavior, use PRId64.
    printf("%s-[%4" PRId64 ", %4" PRId64 ", %4" PRId64 "] -> (",
           name, tensor->ne[0], tensor->ne[1], tensor->ne[2]);
    const size_t nbytes = ggml_nbytes(tensor);
    void* data = malloc(nbytes);
    if (data == NULL) {
        printf("alloc of %zu bytes failed)\n", nbytes);
        return;
    }
    ggml_backend_tensor_get(tensor, data, 0, nbytes);
    const int64_t nelem = ggml_nelements(tensor);
    bool first = true;
    for (int i = 0; i < 16; i++) {
        // i in [0,7]: leading elements; i in [8,15]: trailing elements.
        const int64_t data_index = i > 7 ? nelem + i - 16 : (int64_t) i;
        // Guard small tensors: the original indexed out of bounds when
        // nelem < 16 (data_index went negative).
        if (data_index < 0 || data_index >= nelem) {
            continue;
        }
        const float data_ = tensor->type == GGML_TYPE_F32 ?
            ((float*)data)[data_index] : __half2float(((half*)data)[data_index]);
        if (i == 8) {
            printf(", ...");
        }
        printf(first ? "%3.2f" : ", %3.2f", data_);
        first = false;
    }
    printf(")\n");
    free(data);
}

static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
Expand Down Expand Up @@ -2354,24 +2378,40 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
return false;
}
cudaStreamSynchronize(ctx.streams[ctx.device][0]);
times[dst->op] += ggml_time_us() - start;
count[dst->op]++;
if(strcmp(ggml_get_name(dst), "result_output") == 0) {
printf("============================ cuda timings ==========================\n");
float total_time = 0.0f;
for(int i = 0; i < GGML_OP_COUNT;i++) {
if(count[i] > 0) {
total_time += times[i] / 1000.0f;
}
if(dst->op == GGML_OP_FLASH_ATTN_EXT) {
if(compute) {
printf("==============> flash_attn %i <==============\n\n", layer);
print_gpu_tensor(dst->src[0], "query");
print_gpu_tensor(dst->src[1], "key");
print_gpu_tensor(dst->src[2], "value");
print_gpu_tensor(dst, "kqv");
}
for(int i = 0; i < GGML_OP_COUNT;i++) {
if(count[i] > 0) {
float t = times[i] / 1000.0f;
printf("%3d | %15s | %4.3f ms | %2.2f %%\n", count[i], ggml_op_name((ggml_op)i), t, (t / total_time) * 100.f);
}
layer ++;
}
if(strcmp(ggml_get_name(dst), "result_output") == 0) {
if(!compute) {
compute = true;
}
printf("%3d | %15s | %4.3f ms | %2.2f %%\n", 0, "Total", total_time, 100.f);
layer %= 31;
}
times[dst->op] += ggml_time_us() - start;
count[dst->op]++;
// if(strcmp(ggml_get_name(dst), "result_output") == 0) {
// printf("============================ cuda timings ==========================\n");
// float total_time = 0.0f;
// for(int i = 0; i < GGML_OP_COUNT;i++) {
// if(count[i] > 0) {
// total_time += times[i] / 1000.0f;
// }
// }
// for(int i = 0; i < GGML_OP_COUNT;i++) {
// if(count[i] > 0) {
// float t = times[i] / 1000.0f;
// printf("%3d | %15s | %4.3f ms | %2.2f %%\n", count[i], ggml_op_name((ggml_op)i), t, (t / total_time) * 100.f);
// }
// }
// printf("%3d | %15s | %4.3f ms | %2.2f %%\n", 0, "Total", total_time, 100.f);
// }
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
Expand Down

0 comments on commit b24e7f7

Please sign in to comment.