Add Post-Training Quantization and export function in dygraph mode #50107

Merged on Feb 16, 2023 (9 commits; changes shown from 8 commits)
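This PR adds post-training quantization (PTQ) for dygraph models together with the machinery to export the result using ONNX-style quantize/dequantize operators. As context for the diff below, here is a minimal usage sketch; QuantConfig, PTQ, and AbsmaxObserver are introduced by this PR, but the method names, constructor arguments, calibration loop, and the final convert/export call are assumptions rather than an API confirmed by this excerpt.

import paddle
from paddle.quantization import PTQ, QuantConfig
from paddle.quantization.observers import AbsmaxObserver

model = paddle.nn.Sequential(paddle.nn.Linear(16, 16), paddle.nn.ReLU())

# Observe activations and weights with an abs-max observer (default arguments assumed).
observer = AbsmaxObserver()
q_config = QuantConfig(activation=observer, weight=observer)

ptq = PTQ(q_config)
quant_model = ptq.quantize(model)  # insert observers into the model

# Calibration: feed representative data so the observers can collect value ranges.
for _ in range(8):
    quant_model(paddle.rand([4, 16]))

# Convert observers into ONNX-style quantize/dequantize layers for inference
# (method name assumed; see LinearQuanterDequanter in python/paddle/nn/quant/format.py).
infer_model = ptq.convert(quant_model)
paddle.jit.save(
    infer_model,
    "./ptq_model",
    input_spec=[paddle.static.InputSpec(shape=[None, 16], dtype='float32')],
)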
234 changes: 234 additions & 0 deletions python/paddle/nn/quant/format.py
@@ -0,0 +1,234 @@
"""Define some layers used to export quantization model with ONNX style."""
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
from typing import List, Tuple

import paddle
from paddle import _legacy_C_ops as _C_ops
from paddle.fluid.framework import in_dygraph_mode
from paddle.nn import Layer


class LinearQuanterDequanter(Layer):
def __init__(self, quanter, dequanter):
super(LinearQuanterDequanter, self).__init__()
self._quanter = quanter
self._dequanter = dequanter

def forward(self, input):
out = input
if self._quanter is not None:
out = self._quanter(out)
if self._dequanter is not None:
out = self._dequanter(out)
return out

@staticmethod
def from_quanter(quanter):
return LinearQuanterDequanter(
LinearQuanter.from_quanter(quanter),
LinearDequanter.from_quanter(quanter),
)


class LinearQuanter(Layer):
def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8):
super(LinearQuanter, self).__init__()
self._scales = paddle.to_tensor(scales, dtype="float32")
self._zero_point = (
paddle.zeros([1], dtype="float32")
if zero_point is None
else paddle.to_tensor(zero_point)
)
self._quant_axis = -1 if quant_axis is None else quant_axis
self._bit_length = bit_length

def forward(self, input):
if in_dygraph_mode():
return _C_ops.quantize_linear(
input,
self._scales,
self._zero_point,
"quant_axis",
self._quant_axis,
"bit_length",
self._bit_length,
)
else:
out = self._helper.create_variable_for_type_inference(input.dtype)
self._helper.append_op(
type='quantize_linear',
inputs={
'X': input,
'Scale': self._scales,
'ZeroPoint': self._zero_point,
},
outputs={'Y': out},
attrs={
'quant_axis': self._quant_axis,
'bit_length': self._bit_length,
},
)
return out

@staticmethod
def from_quanter(quanter):

return LinearQuanter(
quanter.scales(),
zero_point=quanter.zero_points(),
quant_axis=quanter.quant_axis(),
bit_length=quanter.bit_length(),
)


class LinearDequanter(Layer):
def __init__(self, scales, zero_point=None, quant_axis=None, bit_length=8):
super(LinearDequanter, self).__init__()
self._scales = paddle.to_tensor(scales, dtype="float32")
self._zero_point = (
paddle.zeros([1], dtype="float32")
if zero_point is None
else paddle.to_tensor(zero_point)
)
self._quant_axis = -1 if quant_axis is None else quant_axis
self._bit_length = bit_length

def forward(self, input):
if in_dygraph_mode():
return _C_ops.dequantize_linear(
input,
self._scales,
self._zero_point,
"quant_axis",
self._quant_axis,
"bit_length",
self._bit_length,
)
else:
out = self._helper.create_variable_for_type_inference(input.dtype)
self._helper.append_op(
type='dequantize_linear',
inputs={
'X': input,
'Scale': self._scales,
'ZeroPoint': self._zero_point,
},
outputs={'Y': out},
attrs={
'quant_axis': self._quant_axis,
'bit_length': self._bit_length,
},
)
return out

@staticmethod
def from_quanter(quanter):
return LinearDequanter(
quanter.scales(),
zero_point=quanter.zero_points(),
quant_axis=quanter.quant_axis(),
bit_length=quanter.bit_length(),
)


class ConvertibleQuantedLayer(Layer, metaclass=abc.ABCMeta):
r"""Abstract class to help convert quantized layer to inference model.
It Defined some function convert quanters and observers to quantize or
dequantize operators who maitains quantization parameters used during
inference.
Examples:
.. code-block:: python

# Given codes in ./customized_quanter.py
class CustomizedQuantedLayer(ConvertibleQuantedLayer):
def __init__(self):
super(CustomizedQuantedLayer, self).__init__()
self.weight_a = paddle.create_parameter(shape=[1], dtype='float32')
self.weight_b = paddle.create_parameter(shape=[1], dtype='float32')
self.quanter_for_weight_a = None
self.activation_weight = None
def forward(self, input):
qweight_a = self.quanter_for_weight_a(self.weight_a)
weight_b = self.weight_b
qinput = self.activation_weight(input)
# compute with qweight_a, weight_b and qinput.
return qweight_a * qinput + weight_b

def weights_to_quanters(self):
return [('weight_a', 'quanter_for_weight_a')]

def activation_quanters(self):
return ['activation_weight']
"""

def __init__(self):
super(ConvertibleQuantedLayer, self).__init__()
self.converted = False

@abc.abstractmethod
def weights_to_quanters(self) -> List[Tuple[str, str]]:
r"""Get the name pairs of weights to be quantized and corresponding quanters.
In convert function of this abstract class, it will call the 'weights_to_quanters' function.
And do something as bellow:
For each pair, the quanter will be converted to quantize operator and
dequantize operator. And then quantize the weight by quantize operator.
Finally, remove the quantize operator and store the weights in integer data type.

Returns: A list of name pairs. Each pair contains two names. The first is name of weight
to be quantized and the second is name of corresponding quanter.
"""
pass

@abc.abstractmethod
def activation_quanters(self) -> List[str]:
r"""Get the names of quanters used to quantize activations.
All the quanters or observers returned by this function will be converted to quantize
and dequantize operators for deployment.
Returns: A list of quanter names.
"""
pass

def _convert_quanter_to_qdq(self, quanter_name) -> LinearQuanterDequanter:
r"""Convert quanter to an instance of LinearQuanterDequanter."""
assert hasattr(
self, quanter_name
), f"{quanter_name} is not attribute of current layer."
quanter = getattr(self, quanter_name)
quanter = LinearQuanterDequanter.from_quanter(quanter)
setattr(self, quanter_name, quanter)
self._sub_layers[quanter_name] = quanter
return quanter

def _quant_weights(self, weight_name, quanter):
r"""Quantize the weight by given quanter."""
weight = getattr(self, weight_name)
qweight = quanter(weight)
weight.set_value(qweight)

def _convert(self):
r"""Convert current layer to onnx style for inference."""
assert not self.converted, "The model should be converted only once."
for weight_name, quanter_name in self.weights_to_quanters():
qdq = self._convert_quanter_to_qdq(quanter_name)
self._quant_weights(weight_name, qdq._quanter)
qdq._quanter = None
qdq._sub_layers['_quanter'] = None

for quanter_name in self.activation_quanters():
self._convert_quanter_to_qdq(quanter_name)

self.converted = True
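To make the ONNX-style building blocks above concrete, here is a small hand-composed sketch of a quantize/dequantize pair. It assumes the convention that the stored scale is the per-tensor abs-max threshold; that convention is not spelled out in this file, so treat it as an assumption.

import paddle
from paddle.nn.quant.format import LinearQuanter, LinearDequanter, LinearQuanterDequanter

x = paddle.uniform([4, 8], min=-1.0, max=1.0)
abs_max = float(paddle.max(paddle.abs(x)))  # per-tensor threshold (scale convention assumed)

qdq = LinearQuanterDequanter(
    LinearQuanter(scales=[abs_max], bit_length=8),
    LinearDequanter(scales=[abs_max], bit_length=8),
)
x_qdq = qdq(x)  # quantize then dequantize: still float32, but snapped onto the int8 grid
print(float(paddle.abs(x - x_qdq).max()))  # the remaining difference is the quantization error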
10 changes: 9 additions & 1 deletion python/paddle/nn/quant/qat/conv.py
@@ -17,8 +17,10 @@
from paddle.nn import Layer
from paddle.nn import functional as F

from ..format import ConvertibleQuantedLayer

class QuantedConv2D(Layer):

class QuantedConv2D(ConvertibleQuantedLayer):
"""
The computational logic of QuantedConv2D is the same as that of Conv2D.
The only difference is that its inputs are all fake-quantized.
@@ -77,3 +79,9 @@ def _conv_forward(self, inputs, weights):
groups=self._groups,
data_format=self._data_format,
)

def weights_to_quanters(self):
return [('weight', 'weight_quanter')]

def activation_quanters(self):
return ['activation_quanter']
11 changes: 10 additions & 1 deletion python/paddle/nn/quant/qat/linear.py
@@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.


from paddle.nn import Layer
from paddle.nn import functional as F

from ..format import ConvertibleQuantedLayer


class QuantedLinear(Layer):
class QuantedLinear(ConvertibleQuantedLayer):
"""
The computational logic of QuantedLinear is the same as that of Linear.
The only difference is that its inputs are all fake-quantized.
@@ -49,3 +52,9 @@ def forward(self, input):
def _linear_forward(self, input, weight):
out = F.linear(x=input, weight=weight, bias=self.bias, name=self.name)
return out

def weights_to_quanters(self):
return [('weight', 'weight_quanter')]

def activation_quanters(self):
return ['activation_quanter']
7 changes: 6 additions & 1 deletion python/paddle/quantization/__init__.py
@@ -1,4 +1,5 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
"""Quantization Module"""
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -49,12 +50,16 @@

from .config import QuantConfig
from .base_quanter import BaseQuanter
from .base_observer import BaseObserver
from .factory import quanter
from .qat import QAT
from .ptq import PTQ

__all__ = [
"QuantConfig",
"BaseQuanter",
"BaseObserver",
"quanter",
"QAT",
"PTQ",
]
32 changes: 32 additions & 0 deletions python/paddle/quantization/base_observer.py
@@ -0,0 +1,32 @@
"""Abstract observer class."""
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc

from .base_quanter import BaseQuanter


class BaseObserver(BaseQuanter, metaclass=abc.ABCMeta):
r"""
Built-in observers and customized observers should extend this base observer
and implement its abstract methods.
"""

def __init__(self):
super(BaseObserver, self).__init__()

@abc.abstractmethod
def cal_thresholds(self):
pass
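For reference, a hedged sketch of what a user-defined observer built on this base class might look like. The forward/cal_thresholds split and the scales/zero_points/quant_axis/bit_length accessors mirror what LinearQuanter.from_quanter reads in format.py; the exact abstract interface of BaseQuanter is not shown in this diff, so treat the signatures as assumptions.

import paddle
from paddle.quantization.base_observer import BaseObserver


class AbsMaxLikeObserver(BaseObserver):
    """Illustrative observer that tracks the running abs-max of the observed tensor."""

    def __init__(self, bit_length=8):
        super().__init__()
        self._bit_length = bit_length
        self._abs_max = 0.0

    def forward(self, x):
        # Observers pass the tensor through unchanged and only record statistics.
        self._abs_max = max(self._abs_max, float(paddle.max(paddle.abs(x))))
        return x

    def cal_thresholds(self):
        # Statistics are updated eagerly in forward, so there is nothing to finalize here.
        pass

    # Accessors consumed by LinearQuanter/LinearDequanter.from_quanter (names taken from format.py).
    def scales(self):
        return paddle.to_tensor([self._abs_max or 1.0], dtype='float32')

    def zero_points(self):
        return None

    def quant_axis(self):
        return None

    def bit_length(self):
        return self._bit_length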
3 changes: 3 additions & 0 deletions python/paddle/quantization/factory.py
@@ -70,6 +70,9 @@ def _instance(self, layer: Layer) -> BaseQuanter:
return self.partial_class(layer)


ObserverFactory = QuanterFactory


def quanter(class_name):
r"""
Annotation to declare a factory class for quanter.
18 changes: 18 additions & 0 deletions python/paddle/quantization/observers/__init__.py
@@ -0,0 +1,18 @@
"""Observers"""
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .abs_max import AbsmaxObserver

__all__ = ["AbsmaxObserver"]