
ggml : multi-thread ggml_rope() (~3-4 times faster on M1) #781

Merged
merged 1 commit on Apr 5, 2023
ggml.c: 41 changes (37 additions, 4 deletions)
@@ -7238,7 +7238,6 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
     assert(ggml_nelements(src1) == 3);

@@ -7265,11 +7264,28 @@ static void ggml_compute_forward_rope_f32(

     assert(nb0 == sizeof(float));
 
-    // TODO: optimize
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
                     const float theta = powf(10000.0, ((float)-i0)/n_dims);
Collaborator:
theta can be computed incrementally as theta *= factor on each iteration. factor is loop-invariant and can be hoisted out of the loop as factor = powf(10000.0, ((float)-2)/n_dims), with the initial theta = p.
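
A minimal sketch of that suggestion (untested; it assumes the elided loop body computes cosf(p*theta) and sinf(p*theta), and theta_scale is an illustrative name, not from the patch):

    // hoist the pow() out of the inner loop and update theta multiplicatively
    const float theta_scale = powf(10000.0f, -2.0f/n_dims); // loop-invariant factor

    float theta = (float)p; // folds the former p*theta product into theta itself
    for (int i0 = 0; i0 < n_dims; i0 += 2) {
        const float cos_theta = cosf(theta);
        const float sin_theta = sinf(theta);

        // ... rotate the (i0, i0 + 1) pair using cos_theta/sin_theta as before ...

        theta *= theta_scale;
    }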

Owner Author:
Open a PR if you observe a performance improvement.


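For readers skimming the diff: both rope kernels use the same ceil-divide work split. Each of nth threads claims dr = (nr + nth - 1)/nth rows and processes the half-open range [ir0, ir1); the running ir counter then skips rows owned by other threads. A standalone sketch of just the partitioning arithmetic (hypothetical helper name, not part of the patch):

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    // e.g. nr = 10 rows, nth = 4 threads -> dr = 3, and the threads get
    // [0,3), [3,6), [6,9), [9,10)
    static void rope_rows_for_thread(int nr, int nth, int ith, int * ir0, int * ir1) {
        const int dr = (nr + nth - 1)/nth; // rows per thread, rounded up
        *ir0 = dr*ith;                     // first row owned by thread ith
        *ir1 = MIN(*ir0 + dr, nr);         // one past the last row, clamped to nr
    }
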
@@ -7295,7 +7311,6 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
     assert(ggml_nelements(src1) == 3);

@@ -7322,10 +7337,28 @@ static void ggml_compute_forward_rope_f16(
 
     assert(nb0 == sizeof(ggml_fp16_t));
 
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // row index used to determine which thread to use
+    int ir = 0;
+
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
                     const float theta = powf(10000.0, ((float)-i0)/n_dims);

@@ -9424,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_ROPE:
                 {
-                    node->n_tasks = 1;
+                    node->n_tasks = n_threads;
                 } break;
             case GGML_OP_CONV_1D_1S:
            case GGML_OP_CONV_1D_2S:
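
Note on the last hunk: setting node->n_tasks = n_threads is what makes params->nth greater than 1 inside the two rope kernels above; ggml's compute loop invokes the op once per task and hands each call its own task index. A simplified sketch of that handoff (field names taken from ggml.h of this era; not the literal scheduler code):

    // each worker builds a params struct and calls the op with its own index
    struct ggml_compute_params params = {
        /*.type  =*/ GGML_TASK_COMPUTE,
        /*.ith   =*/ thread_id,      // this worker's index, 0 .. nth-1
        /*.nth   =*/ node->n_tasks,  // now n_threads for GGML_OP_ROPE
        /*.wsize =*/ 0,
        /*.wdata =*/ NULL,
    };
    ggml_compute_forward(&params, node);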