update quantize/dequantize op yaml
csy0225 committed Oct 23, 2023
1 parent ea8907e commit 748bb9d
Showing 10 changed files with 52 additions and 108 deletions.
16 changes: 3 additions & 13 deletions paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc
@@ -57,9 +57,6 @@ PDNode *patterns::DequantQuantXPUAny::operator()() {
auto *dequant_in = pattern->NewNode(dequant_in_repr())
->AsInput()
->assert_is_op_input("dequantize_xpu", "x");
auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr())
->AsInput()
->assert_is_op_input("dequantize_xpu", "max");

auto *dequant_op =
pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu");
@@ -68,9 +65,6 @@ PDNode *patterns::DequantQuantXPUAny::operator()() {
->AsOutput()
->assert_is_op_output("dequantize_xpu", "y");

auto *quant_max_in = pattern->NewNode(quant_max_in_repr())
->assert_is_op_input("quantize_xpu", "max");

auto *quant_op = pattern->NewNode(quant_op_repr())
->assert_is_op("quantize_xpu")
->AsIntermediate();
@@ -81,8 +75,8 @@ PDNode *patterns::DequantQuantXPUAny::operator()() {

auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();

dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out});
quant_op->LinksFrom({dequant_out, quant_max_in}).LinksTo({quant_out});
dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out});
quant_op->LinksFrom({dequant_out}).LinksTo({quant_out});
next_op->LinksFrom({quant_out});

return quant_out;
@@ -92,18 +86,14 @@ PDNode *patterns::OpDequantXPU::operator()() {
auto any_op = pattern->NewNode(any_op_repr())->assert_is_op();
auto *dequant_in = pattern->NewNode(dequant_in_repr())
->assert_is_op_input("dequantize_xpu", "x");

auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr())
->AsInput()
->assert_is_op_input("dequantize_xpu", "max");
auto *dequant_op =
pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu");
auto dequant_out = pattern->NewNode(dequant_out_repr())
->AsOutput()
->assert_is_op_output("dequantize_xpu", "y");

any_op->LinksTo({dequant_in});
dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out});
dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out});
return dequant_out;
}

44 changes: 1 addition & 43 deletions paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc
@@ -66,31 +66,11 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g,
quantize_out_node->Var()->SetDataType(
proto::VarType::Type::VarType_Type_INT8);

// Create quantize max_ptr node
float scale = GetScaleValueForNode(&var_quant_scales_, input);
int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1);
std::string input_max_name = input->Name() + "_quantize_max";
VarDesc input_max_desc(input_max_name);
input_max_desc.SetPersistable(true);
input_max_desc.SetShape({static_cast<int64_t>(max_ptr_size)});
input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32);
Node* input_max_node = g->CreateVarNode(&input_max_desc);
auto input_max_tensor =
scope->Var(input_max_name)->GetMutable<phi::DenseTensor>();
input_max_tensor->set_type(phi::DataType::FLOAT32);
input_max_tensor->Resize({max_ptr_size});
auto* cpu_ctx = static_cast<phi::CPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
std::vector<float> input_scales(max_ptr_size, scale);
memcpy(cpu_ctx->Alloc<float>(input_max_tensor),
input_scales.data(),
max_ptr_size * sizeof(float));

// Create a quantize op node
float scale = GetScaleValueForNode(&var_quant_scales_, input);
OpDesc q_desc;
q_desc.SetType("quantize_xpu");
q_desc.SetInput("x", std::vector<std::string>({input->Name()}));
q_desc.SetInput("max", std::vector<std::string>({input_max_name}));
q_desc.SetOutput("y", std::vector<std::string>({quantize_out_node->Name()}));
q_desc.SetAttr("out_dtype",
static_cast<int>(proto::VarType::Type::VarType_Type_INT8));
@@ -104,7 +84,6 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g,
// Link quantize op
UnlinkNodes(input, op);
IR_NODE_LINK_TO(input, quantize_op);
IR_NODE_LINK_TO(input_max_node, quantize_op);
IR_NODE_LINK_TO(quantize_op, quantize_out_node);
IR_NODE_LINK_TO(quantize_out_node, op);
}
@@ -131,32 +110,12 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g,
dequantize_in_node->Var()->SetDataType(
proto::VarType::Type::VarType_Type_INT8);

// Create dequantize max_ptr node
float scale = GetScaleValueForNode(&var_quant_scales_, output);
int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1);
std::string input_max_name = output->Name() + "_dequantize_max";
VarDesc input_max_desc(input_max_name);
input_max_desc.SetPersistable(true);
input_max_desc.SetShape({static_cast<int64_t>(max_ptr_size)});
input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32);
Node* input_max_node = g->CreateVarNode(&input_max_desc);
auto input_max_tensor =
scope->Var(input_max_name)->GetMutable<phi::DenseTensor>();
input_max_tensor->set_type(phi::DataType::FLOAT32);
input_max_tensor->Resize({max_ptr_size});
auto* cpu_ctx = static_cast<phi::CPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
std::vector<float> input_scales(max_ptr_size, scale);
memcpy(cpu_ctx->Alloc<float>(input_max_tensor),
input_scales.data(),
max_ptr_size * sizeof(float));

// Create a dequantize op node
OpDesc deq_desc;
deq_desc.SetType("dequantize_xpu");
deq_desc.SetInput("x",
std::vector<std::string>({dequantize_in_node->Name()}));
deq_desc.SetInput("max", std::vector<std::string>({input_max_name}));
deq_desc.SetOutput("y", std::vector<std::string>({output->Name()}));
deq_desc.SetAttr("out_dtype", static_cast<int>(output->Var()->GetDataType()));
deq_desc.SetAttr("scale", static_cast<float>(scale));
@@ -170,7 +129,6 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g,
UnlinkNodes(op, output);
IR_NODE_LINK_TO(op, dequantize_in_node);
IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
IR_NODE_LINK_TO(input_max_node, dequantize_op);
IR_NODE_LINK_TO(dequantize_op, output);
}

2 changes: 0 additions & 2 deletions paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc
@@ -59,7 +59,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash(
Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const {
GraphPatternDetector gpd;
LOG(INFO) << "DequantQuantSquash COME IN";
patterns::DequantQuantXPUAny squash_pattern{gpd.mutable_pattern(),
"dequant_quant_xpu_any"};
squash_pattern();
@@ -90,7 +89,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash(
// check if dequantize op should be kept or removed, decrease the counter
bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;

int equal = dequant_scale == quant_scale ? 1 : 0;
if (dequant_scale == quant_scale) {
// squash dequantize-quantize to nothing
auto quant_out_var_name = quant_out->Name();
6 changes: 2 additions & 4 deletions paddle/phi/api/yaml/ops.yaml
@@ -658,14 +658,13 @@
backward : depthwise_conv2d_grad

- op : dequantize_xpu
args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f)
args : (Tensor x, DataType out_dtype, float scale = 1.0f)
output : Tensor(y)
infer_meta :
func : DeQuantizeXPUInferMeta
kernel :
func : dequantize_xpu
data_type: x
optional : max

- op : det
args : (Tensor x)
@@ -2050,14 +2049,13 @@
backward : qr_grad

- op : quantize_xpu
args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f)
args : (Tensor x, DataType out_dtype, float scale = 1.0f)
output : Tensor(y)
infer_meta :
func : QuantizeXPUInferMeta
kernel :
func : quantize_xpu
data_type : x
optional : max

- op : real
args : (Tensor x)
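
For readability, here is a sketch of how the two updated ops.yaml entries read after this change, assembled from the hunks above (indentation approximated to the surrounding file's style; the `Tensor max` argument and the `optional : max` line are dropped, while the scalar `scale` attribute is kept):

- op : dequantize_xpu
  args : (Tensor x, DataType out_dtype, float scale = 1.0f)
  output : Tensor(y)
  infer_meta :
    func : DeQuantizeXPUInferMeta
  kernel :
    func : dequantize_xpu
    data_type : x

- op : quantize_xpu
  args : (Tensor x, DataType out_dtype, float scale = 1.0f)
  output : Tensor(y)
  infer_meta :
    func : QuantizeXPUInferMeta
  kernel :
    func : quantize_xpu
    data_type : x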
20 changes: 0 additions & 20 deletions paddle/phi/infermeta/binary.cc
@@ -978,16 +978,6 @@ void DepthwiseConvInferMeta(const MetaTensor& input,
config);
}

void DeQuantizeXPUInferMeta(const MetaTensor& x,
const MetaTensor& max,
DataType out_dtype,
float scale,
MetaTensor* y) {
auto x_dims = x.dims();
y->set_dims(x_dims);
y->set_dtype(out_dtype);
}

void DistInferMeta(const MetaTensor& x,
const MetaTensor& y,
float p,
@@ -2607,16 +2597,6 @@ void PriorBoxInferMeta(const MetaTensor& input,
var->set_dims(phi::make_ddim(dim_vec));
}

void QuantizeXPUInferMeta(const MetaTensor& x,
const MetaTensor& max,
DataType out_dtype,
float scale,
MetaTensor* y) {
auto x_dims = x.dims();
y->set_dims(x_dims);
y->set_dtype(out_dtype);
}

void RepeatInterleaveWithTensorIndexInferMeta(const MetaTensor& x,
const MetaTensor& repeats,
int dim,
12 changes: 0 additions & 12 deletions paddle/phi/infermeta/binary.h
@@ -155,12 +155,6 @@ void DepthwiseConvInferMeta(const MetaTensor& input,
MetaTensor* out,
MetaConfig config = MetaConfig());

void DeQuantizeXPUInferMeta(const MetaTensor& x,
const MetaTensor& max,
DataType out_dtype,
float scale,
MetaTensor* y);

void DistInferMeta(const MetaTensor& x,
const MetaTensor& y,
float p,
@@ -414,12 +408,6 @@ void PriorBoxInferMeta(const MetaTensor& input,
MetaTensor* out,
MetaTensor* var);

void QuantizeXPUInferMeta(const MetaTensor& x,
const MetaTensor& max,
DataType out_dtype,
float scale,
MetaTensor* y);

void SearchsortedInferMeta(const MetaTensor& sorted_sequence,
const MetaTensor& value,
bool out_int32,
18 changes: 18 additions & 0 deletions paddle/phi/infermeta/unary.cc
@@ -672,6 +672,15 @@ void DecodeJpegInferMeta(const MetaTensor& x,
}
}

void DeQuantizeXPUInferMeta(const MetaTensor& x,
DataType out_dtype,
float scale,
MetaTensor* y) {
auto x_dims = x.dims();
y->set_dims(x_dims);
y->set_dtype(out_dtype);
}

void DiagEmbedInferMeta(
const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out) {
auto x_dims = x.dims();
@@ -3768,6 +3777,15 @@ void FillSplitOutDims(const MetaTensor& x,
}
}

void QuantizeXPUInferMeta(const MetaTensor& x,
DataType out_dtype,
float scale,
MetaTensor* y) {
auto x_dims = x.dims();
y->set_dims(x_dims);
y->set_dtype(out_dtype);
}

void SplitInferMeta(const MetaTensor& x,
const IntArray& sections,
const Scalar& axis,
10 changes: 10 additions & 0 deletions paddle/phi/infermeta/unary.h
@@ -145,6 +145,11 @@ void DecodeJpegInferMeta(const MetaTensor& x,
const std::string& mode,
MetaTensor* out);

void DeQuantizeXPUInferMeta(const MetaTensor& x,
DataType out_dtype,
float scale,
MetaTensor* y);

void DiagEmbedInferMeta(
const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out);

@@ -453,6 +458,11 @@ void QrInferMeta(const MetaTensor& x,
MetaTensor* q,
MetaTensor* r);

void QuantizeXPUInferMeta(const MetaTensor& x,
DataType out_dtype,
float scale,
MetaTensor* y);

void WeightQuantizeInferMeta(const MetaTensor& x,
const std::string& algo,
MetaTensor* out,
16 changes: 9 additions & 7 deletions paddle/phi/kernels/xpu/dequantization_kernel.cc
@@ -19,17 +19,20 @@ namespace phi {
template <typename TX, typename TY, typename Context>
void DeQuantizeKernelImpl(const Context& ctx,
const DenseTensor& x,
const paddle::optional<DenseTensor>& max,
float scale,
DenseTensor* y) {
using XPUInX = typename XPUTypeTrait<TX>::Type;
using XPUOutY = typename XPUTypeTrait<TY>::Type;

auto* y_data = ctx.template Alloc<TY>(y);
const auto* x_data = x.data<TX>();
int64_t len = x.numel();
const float* max_data =
max.get_ptr() == nullptr ? nullptr : max->data<float>();
int r = xpu::dequantization<XPUInX, XPUOutY>(
int max_ptr_size = ctx.x_context()->max_ptr_size();
xpu::ctx_guard RAII_GUARD(ctx.x_context());
auto max_data = RAII_GUARD.alloc_l3_or_gm<float>(max_ptr_size);
int r = xpu::constant<float>(ctx.x_context(), max_data, max_ptr_size, scale);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
r = xpu::dequantization<XPUInX, XPUOutY>(
ctx.x_context(),
reinterpret_cast<const XPUInX*>(x_data),
reinterpret_cast<XPUOutY*>(y_data),
@@ -41,16 +44,15 @@ void DeQuantizeKernelImpl(const Context& ctx,
template <typename T, typename Context>
void DeQuantizeKernel(const Context& ctx,
const DenseTensor& x,
const paddle::optional<DenseTensor>& max,
DataType out_dtype,
float scale,
DenseTensor* y) {
switch (out_dtype) {
case DataType::FLOAT32:
DeQuantizeKernelImpl<T, float, Context>(ctx, x, max, y);
DeQuantizeKernelImpl<T, float, Context>(ctx, x, scale, y);
break;
case DataType::FLOAT16:
DeQuantizeKernelImpl<T, dtype::float16, Context>(ctx, x, max, y);
DeQuantizeKernelImpl<T, dtype::float16, Context>(ctx, x, scale, y);
break;
default:
PADDLE_THROW(phi::errors::Unavailable(
16 changes: 9 additions & 7 deletions paddle/phi/kernels/xpu/quantization_kernel.cc
@@ -19,17 +19,20 @@ namespace phi {
template <typename TX, typename TY, typename Context>
void QuantizeKernelImpl(const Context& ctx,
const DenseTensor& x,
const paddle::optional<DenseTensor>& max,
float scale,
DenseTensor* y) {
using XPUInX = typename XPUTypeTrait<TX>::Type;
using XPUOutY = typename XPUTypeTrait<TY>::Type;

auto* y_data = ctx.template Alloc<TY>(y);
const auto* x_data = x.data<TX>();
int64_t len = x.numel();
const float* max_data =
max.get_ptr() == nullptr ? nullptr : max->data<float>();
int r = xpu::quantization<XPUInX, XPUOutY>(
int max_ptr_size = ctx.x_context()->max_ptr_size();
xpu::ctx_guard RAII_GUARD(ctx.x_context());
auto max_data = RAII_GUARD.alloc_l3_or_gm<float>(max_ptr_size);
int r = xpu::constant<float>(ctx.x_context(), max_data, max_ptr_size, scale);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
r = xpu::quantization<XPUInX, XPUOutY>(
ctx.x_context(),
reinterpret_cast<const XPUInX*>(x_data),
reinterpret_cast<XPUOutY*>(y_data),
@@ -41,16 +44,15 @@ void QuantizeKernelImpl(const Context& ctx,
template <typename T, typename Context>
void QuantizeKernel(const Context& ctx,
const DenseTensor& x,
const paddle::optional<DenseTensor>& max,
DataType out_dtype,
float scale,
DenseTensor* y) {
switch (out_dtype) {
case DataType::INT16:
QuantizeKernelImpl<T, int16_t, Context>(ctx, x, max, y);
QuantizeKernelImpl<T, int16_t, Context>(ctx, x, scale, y);
break;
case DataType::INT8:
QuantizeKernelImpl<T, int8_t, Context>(ctx, x, max, y);
QuantizeKernelImpl<T, int8_t, Context>(ctx, x, scale, y);
break;
default:
PADDLE_THROW(phi::errors::Unavailable(
