LinearInt8 layer for inference of int8-quantized LLMs and Arm intrinsics #5007

Closed · wants to merge 14 commits
3 changes: 3 additions & 0 deletions docs/developer-guide/operation-param-weight-table.md
@@ -151,6 +151,9 @@
||2|width_scale|1.f|
||3|output_height|0|
||4|output_width|0|
|LinearInt8|0|in_dim|0|scale weight|
||1|out_dim|0|
||2|group_size|1|
|Log|0|base|-1.f|
||1|scale|1.f|
||2|shift|0.f|
19 changes: 19 additions & 0 deletions docs/developer-guide/operators.md
@@ -45,6 +45,7 @@
* [InstanceNorm](#instancenorm)
* [Interp](#interp)
* [LayerNorm](#layernorm)
* [LinearInt8](#linearint8)
* [Log](#log)
* [LRN](#lrn)
* [LSTM](#lstm)
@@ -1104,6 +1105,24 @@ y = x * gamma + beta by elementwise
| gamma_data | float | [affine_size] |
| beta_data | float | [affine_size] |

# LinearInt8
```
y = x (W ⊙ S)^T
```
where W is the int8 weight matrix and S is the per-group dequantization scale, broadcast so that each group of group_size consecutive weights shares one float scale.

* one_blob_only

| param id | name | type | default | description |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0 | in_dim | int | 0 | input dimension |
| 1 | out_dim | int | 0 | output dimension |
| 2 | group_size | int | 1 | weights per quantization scale |

| weight | type | shape |
| ------------- | ----- | --------------------- |
| scale | float | [in_dim * out_dim / group_size] |
| weight | int8 | [in_dim, out_dim] |
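
A minimal sketch of the dequantize-on-the-fly semantics, mirroring the scalar loops in `src/layer/linearint8.cpp` (the function name and raw-array interface here are illustrative, not part of the layer API):

```cpp
#include <cstdint>

// y[p] = sum_i x[i] * w[p * in_dim + i] * scales[(p * in_dim + i) / group_size]
void linearint8_ref(const float* x, const int8_t* w, const float* scales,
                    float* y, int in_dim, int out_dim, int group_size)
{
    for (int p = 0; p < out_dim; p++)
    {
        float acc = 0.f;
        for (int i = 0; i < in_dim; i++)
        {
            int index = p * in_dim + i;
            // group_size consecutive int8 weights share one float scale
            acc += x[i] * w[index] * scales[index / group_size];
        }
        y[p] = acc;
    }
}
```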

# Log
```
if base == -1 y = log(shift + x * scale)
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -165,6 +165,7 @@ ncnn_add_layer(CopyTo)
ncnn_add_layer(Erf)
ncnn_add_layer(Diag)
ncnn_add_layer(CELU)
ncnn_add_layer(LinearInt8)

if(NCNN_VULKAN)
ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)
91 changes: 91 additions & 0 deletions src/layer/arm/linearint8_arm.cpp
@@ -0,0 +1,91 @@
#include "linearint8_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

int LinearInt8_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (bottom_blob.dims != 2 || bottom_blob.w != in_dim)
        return -1;

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    top_blob.create(out_dim, h, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int8_t* wt = (const int8_t*)weight;

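    // NEON fast path (AArch64 only, needed for vaddvq_f32): used when each row
    // splits into whole groups and group_size is a multiple of 8, so eight int8
    // weights can be widened, dequantized and accumulated per iteration.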
#if __ARM_NEON && __aarch64__
    if (w % group_size == 0 && group_size % 8 == 0)
    {
        for (int j = 0; j < h; j++)
        {
            const float* m = bottom_blob.row(j);
            float* out = top_blob.row(j);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_dim; p++)
            {
                int base = w * p;
                float32x4_t acc_p0 = vdupq_n_f32(0.f);
                float32x4_t acc_p1 = vdupq_n_f32(0.f);
                for (int k = 0; k < w; k += group_size)
                {
                    int index = base + k;
                    // one float scale is shared by the whole group
                    float32x4_t sc_p = vdupq_n_f32(scales[index / group_size]);
                    for (int i = 0, ind = index; i < group_size; i += 8, ind += 8)
                    {
                        // widen eight int8 weights to two float32x4 vectors
                        int8x8_t i8x8 = vld1_s8(wt + ind);
                        int16x8_t i16x8 = vmovl_s8(i8x8);
                        float32x4_t wt_p0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(i16x8)));
                        float32x4_t wt_p1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(i16x8)));
                        // dequantize and accumulate: acc += x * (w * scale)
                        float32x4_t m_p0 = vld1q_f32(m + k + i);
                        float32x4_t m_p1 = vld1q_f32(m + k + i + 4);
                        acc_p0 = vmlaq_f32(acc_p0, m_p0, vmulq_f32(wt_p0, sc_p));
                        acc_p1 = vmlaq_f32(acc_p1, m_p1, vmulq_f32(wt_p1, sc_p));
                    }
                }
                // horizontal reduction of both accumulators
                out[p] = vaddvq_f32(acc_p0) + vaddvq_f32(acc_p1);
            }
        }
        return 0;
    }
#endif // __ARM_NEON && __aarch64__

    // Scalar path: covers non-AArch64 builds and group sizes the NEON path rejects
    for (int j = 0; j < h; j++)
    {
        const float* m = bottom_blob.row(j);
        float* out = top_blob.row(j);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < out_dim; p++)
        {
            int base = w * p;
            float acc = 0.0f;
            for (int i = 0; i < w; i++)
            {
                int index = base + i;
                acc += m[i] * wt[index] * scales[index / group_size];
            }
            out[p] = acc;
        }
    }

    return 0;
}

} // namespace ncnn
31 changes: 31 additions & 0 deletions src/layer/arm/linearint8_arm.h
@@ -0,0 +1,31 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_LINEARINT8_ARM_H
#define LAYER_LINEARINT8_ARM_H

#include "net.h"
#include "linearint8.h"

namespace ncnn {

class LinearInt8_arm : virtual public LinearInt8
{
public:
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_LINEARINT8_ARM_H
81 changes: 81 additions & 0 deletions src/layer/linearint8.cpp
@@ -0,0 +1,81 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "linearint8.h"

namespace ncnn {

LinearInt8::LinearInt8()
{
    one_blob_only = true;
    support_inplace = false;
}

int LinearInt8::load_param(const ParamDict& pd)
{
    in_dim = pd.get(0, 0);
    out_dim = pd.get(1, 0);
    group_size = pd.get(2, 1);
    // every weight must belong to a complete quantization group
    if (in_dim * out_dim % group_size)
        return -1;
    return 0;
}

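// scales are read as raw float32 (mb.load type 1); the weights are read with
// auto type detection (mb.load type 0) and must come out as int8 storage,
// i.e. elemsize == 1, or the model is rejected.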
int LinearInt8::load_model(const ModelBin& mb)
{
    scales = mb.load(in_dim * out_dim / group_size, 1);
    weight = mb.load(in_dim * out_dim, 0);
    if (weight.elemsize != 1)
        return -1;
    return 0;
}

int LinearInt8::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (bottom_blob.dims != 2 || bottom_blob.w != in_dim)
        return -1;

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    top_blob.create(out_dim, h, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int8_t* wt = (const int8_t*)weight;

    for (int j = 0; j < h; j++)
    {
        const float* m = bottom_blob.row(j);
        float* out = top_blob.row(j);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < out_dim; p++)
        {
            int base = w * p;
            float acc = 0.0f;
            for (int i = 0; i < w; i++)
            {
                int index = base + i;
                // dequantize on the fly: one float scale per group_size weights
                acc += m[i] * wt[index] * scales[index / group_size];
            }
            out[p] = acc;
        }
    }

    return 0;
}

} // namespace ncnn
42 changes: 42 additions & 0 deletions src/layer/linearint8.h
@@ -0,0 +1,42 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_LINEARINT8_H
#define LAYER_LINEARINT8_H

#include "layer.h"

namespace ncnn {

class LinearInt8 : public Layer
{
public:
    LinearInt8();

    virtual int load_param(const ParamDict& pd);

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    int in_dim;
    int out_dim;
    int group_size;
    Mat weight;
    Mat scales;
};

} // namespace ncnn

#endif // LAYER_LINEARINT8_H
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
@@ -114,6 +114,7 @@ ncnn_add_layer_test(InnerProduct)
ncnn_add_layer_test(InstanceNorm)
ncnn_add_layer_test(Interp)
ncnn_add_layer_test(LayerNorm)
ncnn_add_layer_test(LinearInt8)
ncnn_add_layer_test(LRN)
ncnn_add_layer_test(LSTM)
ncnn_add_layer_test(MatMul)
73 changes: 73 additions & 0 deletions tests/test_linearint8.cpp
@@ -0,0 +1,73 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "layer/linearint8.h"
#include "testutil.h"

static int test_linearint8(const ncnn::Mat& a, int in_dim, int out_dim, int group_size)
{
    if (in_dim * out_dim % group_size)
    {
        fprintf(stderr, "malformed test case: in_dim=%d out_dim=%d group_size=%d\n", in_dim, out_dim, group_size);
        return -1;
    }
    if (a.w != in_dim)
    {
        fprintf(stderr, "malformed test case: a.w=%d does not match in_dim=%d\n", a.w, in_dim);
        return -1;
    }

    ncnn::ParamDict pd;
    pd.set(0, in_dim);
    pd.set(1, out_dim);
    pd.set(2, group_size);

    std::vector<ncnn::Mat> weights(2);
    weights[0] = RandomMat(in_dim * out_dim / group_size);
    weights[1] = RandomS8Mat(in_dim * out_dim);

    int ret = test_layer<ncnn::LinearInt8>("LinearInt8", pd, weights, a);
    if (ret != 0)
    {
        fprintf(stderr, "test_linearint8 failed a.dims=%d a=(%d, %d) in_dim=%d out_dim=%d group_size=%d\n", a.dims, a.h, a.w, in_dim, out_dim, group_size);
    }

    return ret;
}

static int test_linearint8_0()
{
    ncnn::Mat a = RandomMat(10, 1);

    return 0
           || test_linearint8(a, 10, 6, 4)
           || test_linearint8(a, 10, 8, 4)
           || test_linearint8(a, 10, 10, 4);
}

static int test_linearint8_1()
{
    ncnn::Mat a = RandomMat(16, 1);

    return 0
           || test_linearint8(a, 16, 6, 16);
}

int main()
{
    SRAND(7767517);

    return 0
           || test_linearint8_0()
           || test_linearint8_1();
}