
Quantized Elemwise Mul Operator #17147

Merged (12 commits) on Dec 26, 2019
64 changes: 64 additions & 0 deletions src/operator/quantization/quantized_elemwise_mul-inl.h
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2019 by Contributors
* \file quantized_elemwise_mul-inl.h
* \brief CPU implementation of the quantized elementwise binary multiply operator
*/
#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
#define MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_

#include "../tensor/elemwise_binary_op-inl.h"

namespace mxnet {
namespace op {
/* This structure is used for requantization only in the fused case. */
struct QuantizeElemwiseMulParam : public dmlc::Parameter<QuantizeElemwiseMulParam> {
dmlc::optional<float> min_calib_range;
dmlc::optional<float> max_calib_range;
bool enable_float_output;
DMLC_DECLARE_PARAMETER(QuantizeElemwiseMulParam) {
DMLC_DECLARE_FIELD(min_calib_range)
.set_default(dmlc::optional<float>())
.describe("The minimum scalar value in the form of float32 obtained "
"through calibration. If present, it will be used to requantize the "
"int8 output data.");
DMLC_DECLARE_FIELD(max_calib_range)
.set_default(dmlc::optional<float>())
.describe("The maximum scalar value in the form of float32 obtained "
"through calibration. If present, it will be used to requantize the "
"int8 output data.");
DMLC_DECLARE_FIELD(enable_float_output).set_default(false)
.describe("Whether to enable float32 output");
}
};

namespace quantized_elemwise_mul {
enum QuantizedElemwiseMulOpInputs {kLhs, kRhs, kLhsMin, kLhsMax, kRhsMin, kRhsMax};
enum QuantizedElemwiseMulOpOutputs {kOut, kOutMin, kOutMax};
enum QuantizedElemwiseMulOpResource {kTempSpace};
}  // namespace quantized_elemwise_mul



} // namespace op
} // namespace mxnet

#endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_MUL_INL_H_
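
Before the implementation file, a minimal sketch (not part of this diff) of how QuantizeElemwiseMulParam would be filled from string attributes via the standard dmlc::Parameter::Init(kwargs) flow; the calibration values and the helper name are illustrative only:

#include <map>
#include <string>
#include "quantized_elemwise_mul-inl.h"

// Fill the parameter struct the same way the operator's attr parser would.
void InitParamExample() {
  mxnet::op::QuantizeElemwiseMulParam param;
  param.Init(std::map<std::string, std::string>{
      {"min_calib_range", "-0.5"},
      {"max_calib_range", "0.5"}});
  // Both calib ranges are present, so the operator requantizes to int8;
  // enable_float_output keeps its default of false.
}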
261 changes: 261 additions & 0 deletions src/operator/quantization/quantized_elemwise_mul.cc
@@ -0,0 +1,261 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2019 by Contributors
* \file quantized_elemwise_mul.cc
* \brief CPU implementation of the quantized elementwise binary multiply operator
*/
#include <mxnet/op_attr_types.h>
#include "../tensor/elemwise_binary_op-inl.h"
#include "./quantized_elemwise_mul-inl.h"
#include "./quantization_utils.h"

namespace mxnet {
namespace op {

DMLC_REGISTER_PARAMETER(QuantizeElemwiseMulParam);

static std::vector<std::string> QuantizedElemwiseMulOutputNames(const NodeAttrs &attrs) {
const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
if (params.enable_float_output)
return std::vector<std::string>{"output"};
else
return std::vector<std::string>{"output", "min_output", "max_output"};
}

inline bool QuantizedElemwiseMulOpShape(const nnvm::NodeAttrs& attrs,
mxnet::ShapeVector *in_attrs,
mxnet::ShapeVector *out_attrs) {
using namespace mshadow;
const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
const mxnet::TShape &lshape = (*in_attrs)[quantized_elemwise_mul::kLhs];
const mxnet::TShape &rshape = (*in_attrs)[quantized_elemwise_mul::kRhs];
if (!ndim_is_known(lshape) || !ndim_is_known(rshape)) return false;
CHECK_EQ(lshape.ndim(), rshape.ndim())
    << "Currently, quantized elemwise multiply doesn't support broadcast.";
for (int i = 0; i < lshape.ndim(); ++i) {
CHECK_EQ(lshape[i], rshape[i]);
}
SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kLhsMin, mxnet::TShape(1, 1));
SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kLhsMax, mxnet::TShape(1, 1));
SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kRhsMin, mxnet::TShape(1, 1));
SHAPE_ASSIGN_CHECK(*in_attrs, quantized_elemwise_mul::kRhsMax, mxnet::TShape(1, 1));

out_attrs->clear();
mxnet::TShape oshape(lshape);
SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOut, oshape);
if (!params.enable_float_output) {
SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOutMin, mxnet::TShape(1, 1));
SHAPE_ASSIGN_CHECK(*out_attrs, quantized_elemwise_mul::kOutMax, mxnet::TShape(1, 1));
}
return true;
}
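
// Shape contract: both inputs must have identical, fully known shapes (no
// broadcasting); e.g. two (32, 64) int8 tensors produce a (32, 64) output,
// plus shape-(1,) min/max outputs unless float output is enabled.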

inline bool QuantizedElemwiseMulOpType(const nnvm::NodeAttrs& attrs,
std::vector<int> *in_type,
std::vector<int> *out_type) {
const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
for (int i = 0; i < 2; ++i) {
if (in_type->at(i) == mshadow::kInt8) {
TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kInt8);
} else {
LOG(ERROR) << "Currently, quantized elemwise mul only supports int8 inputs.";
}
}
TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kLhsMin, mshadow::kFloat32);
TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kLhsMax, mshadow::kFloat32);
TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kRhsMin, mshadow::kFloat32);
TYPE_ASSIGN_CHECK(*in_type, quantized_elemwise_mul::kRhsMax, mshadow::kFloat32);

int dtype = mshadow::kInt32;
if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) {
dtype = mshadow::kInt8;
}
if (!params.enable_float_output) {
TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOut, dtype);
TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOutMin, mshadow::kFloat32);
TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOutMax, mshadow::kFloat32);
} else {
TYPE_ASSIGN_CHECK(*out_type, quantized_elemwise_mul::kOut, mshadow::kFloat32);
}
return true;
}
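
// Dtype contract: with enable_float_output the single output is float32;
// otherwise the output is int8 when both calib ranges are set (requantization
// fused in) or int32 by default, followed by float32 min/max outputs.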

inline bool QuantizedElemwiseMulOpStorageType(const nnvm::NodeAttrs& attrs,
int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
using namespace common;
*dispatch_mode = DispatchMode::kFCompute;

// Storage is forced to dense (default) for all inputs and outputs.
for (auto &v : *out_attrs) {
  v = kDefaultStorage;
}
for (auto &v : *in_attrs) {
  v = kDefaultStorage;
}
return true;
}

void QuantizedElemwiseMulOpForward(const nnvm::NodeAttrs &attrs,
const OpContext &ctx,
const std::vector<TBlob> &inputs,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &outputs) {
const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
using namespace mxnet_op;

float lhs_min = inputs[quantized_elemwise_mul::kLhsMin].dptr<float>()[0];
float lhs_max = inputs[quantized_elemwise_mul::kLhsMax].dptr<float>()[0];
float rhs_min = inputs[quantized_elemwise_mul::kRhsMin].dptr<float>()[0];
float rhs_max = inputs[quantized_elemwise_mul::kRhsMax].dptr<float>()[0];

float cached_output_min_ = 0.f;
float cached_output_max_ = 0.f;
float out_data_scale = 1.f;
float out_scale = 1.f;
if (!params.enable_float_output) {
float output_data_range = kInt32Range;
// lhs and rhs are both int8
if (outputs[quantized_elemwise_mul::kOut].type_flag_ == mshadow::kInt8) {
output_data_range = kInt8Range;
} else {
output_data_range = kInt32Range;
}
if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) {
cached_output_min_ = params.min_calib_range.value();
cached_output_max_ = params.max_calib_range.value();
out_data_scale = output_data_range / MaxAbs(cached_output_min_, cached_output_max_);
auto lhs_scale = kInt8Range / MaxAbs(lhs_min, lhs_max);
auto rhs_scale = kInt8Range / MaxAbs(rhs_min, rhs_max);
out_scale = out_data_scale / lhs_scale / rhs_scale;
} else {
Stream<cpu> *s = ctx.get_stream<cpu>();
if (inputs[quantized_elemwise_mul::kLhs].type_flag_ == mshadow::kInt8 &&
inputs[quantized_elemwise_mul::kRhs].type_flag_ == mshadow::kInt8) {
mxnet_op::Kernel<QuantizationRangeForS8S8MultiplicationStruct, cpu>::Launch(
s, 1, &cached_output_min_, &cached_output_max_, &lhs_min, &lhs_max, &rhs_min, &rhs_max);
} else {
LOG(ERROR) << "lhs and rhs only support the int8 dtype.";
}
}
} else {
auto lhs_scale = kInt8Range / MaxAbs(lhs_min, lhs_max);
auto rhs_scale = kInt8Range / MaxAbs(rhs_min, rhs_max);
out_scale = 1.0 / lhs_scale / rhs_scale;
}
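
// Worked example: with lhs and rhs both calibrated to [-1, 1],
// lhs_scale = rhs_scale = 127 / 1 = 127. If min/max_calib_range is
// [-0.5, 0.5], the output is int8 and out_data_scale = 127 / 0.5 = 254,
// so out_scale = 254 / 127 / 127 ~= 0.0157; each raw product a * b below
// is multiplied by this factor before the cast to the output type.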

size_t out_size = outputs[quantized_elemwise_mul::kOut].Size();
auto *input_l = inputs[quantized_elemwise_mul::kLhs].dptr<int8_t>();
auto *input_r = inputs[quantized_elemwise_mul::kRhs].dptr<int8_t>();
// TODO(Xinyu): a temp solution to enable Elemwise INT8 computation,
// will be refactored after the DNNL primitive is done.
if (!params.enable_float_output) {
if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) {
typedef int8_t out_type;
auto *out_data = outputs[quantized_elemwise_mul::kOut].dptr<out_type>();
#pragma omp simd
for (size_t i = 0; i < out_size; ++i) {
const int8_t a = input_l[i];
const int8_t b = input_r[i];
out_data[i] = static_cast<out_type>(a * b * out_scale);
}
} else {
typedef int32_t out_type;
auto *out_data = outputs[quantized_elemwise_mul::kOut].dptr<out_type>();
#pragma omp simd
for (size_t i = 0; i < out_size; ++i) {
const int8_t a = input_l[i];
const int8_t b = input_r[i];
out_data[i] = static_cast<out_type>(a * b * out_scale);
}
}
} else {
typedef float out_type;
auto *out_data = outputs[quantized_elemwise_mul::kOut].dptr<out_type>();
#pragma omp simd
for (size_t i = 0; i < out_size; ++i) {
const int8_t a = input_l[i];
const int8_t b = input_r[i];
out_data[i] = static_cast<out_type>(a * b * out_scale);
}
}

if (!params.enable_float_output) {
outputs[quantized_elemwise_mul::kOutMin].dptr<float>()[0] = cached_output_min_;
outputs[quantized_elemwise_mul::kOutMax].dptr<float>()[0] = cached_output_max_;
}
}

NNVM_REGISTER_OP(_contrib_quantized_elemwise_mul)
.describe(R"code(Multiplies int8 arguments element-wise.
)code" ADD_FILELINE)
.set_num_inputs(6)
.set_num_outputs([](const NodeAttrs& attrs) {
const QuantizeElemwiseMulParam& params = nnvm::get<QuantizeElemwiseMulParam>(attrs.parsed);
return (!params.enable_float_output) ? 3 : 1;
})
.set_attr<nnvm::FListInputNames>("FListInputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"lhs", "rhs", "lhs_min", "lhs_max", "rhs_min", "rhs_max"};
})
.set_attr<nnvm::FListOutputNames>("FListOutputNames", QuantizedElemwiseMulOutputNames)
.set_attr<mxnet::FInferShape>("FInferShape", QuantizedElemwiseMulOpShape)
.set_attr<nnvm::FInferType>("FInferType", QuantizedElemwiseMulOpType)
.set_attr<FInferStorageType>("FInferStorageType", QuantizedElemwiseMulOpStorageType)
.set_attr<FResourceRequest>("FResourceRequest",
[](const NodeAttrs& attrs) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr<FCompute>("FCompute<cpu>", QuantizedElemwiseMulOpForward)
// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
// will be reverted after the improvement of CachedOP is done.
.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
.set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
.add_argument("lhs", "NDArray-or-Symbol", "first input")
.add_argument("rhs", "NDArray-or-Symbol", "second input")
.add_argument("lhs_min", "NDArray-or-Symbol", "Minimum value of first input.")
.add_argument("lhs_max", "NDArray-or-Symbol", "Maximum value of first input.")
.add_argument("rhs_min", "NDArray-or-Symbol", "Minimum value of second input.")
.add_argument("rhs_max", "NDArray-or-Symbol", "Maximum value of second input.")
.set_attr_parser(ParamParser<QuantizeElemwiseMulParam>)
.add_arguments(QuantizeElemwiseMulParam::__FIELDS__());

NNVM_REGISTER_OP(elemwise_mul)
.set_attr<FQuantizedOp>("FQuantizedOp", [](const NodeAttrs& attrs) {
nnvm::NodePtr node = nnvm::Node::Create();
node->attrs.op = Op::Get("_contrib_quantized_elemwise_mul");
node->attrs.name = "quantized_" + attrs.name;
node->attrs.dict = attrs.dict;
if (node->op()->attr_parser != nullptr) {
node->op()->attr_parser(&(node->attrs));
}
return node;
});
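
// The FQuantizedOp hook above lets the quantization graph pass replace a
// float elemwise_mul node with _contrib_quantized_elemwise_mul, copying the
// original attribute dict onto the new node and re-running its attr parser.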

} // namespace op
} // namespace mxnet
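
For reference, a self-contained sketch of the calibrated int8 path above, assuming plain C++ with no MXNet headers; the constants, input values, and the rounding-free cast mirror the kernel and are illustrative only:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Standalone model of QuantizedElemwiseMulOpForward's calibrated int8 branch.
int main() {
  const float kInt8Range = 127.0f;
  // Calibrated ranges of the two int8 inputs and of the requested output.
  const float lhs_min = -1.0f, lhs_max = 1.0f;
  const float rhs_min = -1.0f, rhs_max = 1.0f;
  const float min_calib_range = -1.0f, max_calib_range = 1.0f;

  const float lhs_scale = kInt8Range / std::max(std::fabs(lhs_min), std::fabs(lhs_max));
  const float rhs_scale = kInt8Range / std::max(std::fabs(rhs_min), std::fabs(rhs_max));
  const float out_data_scale =
      kInt8Range / std::max(std::fabs(min_calib_range), std::fabs(max_calib_range));
  const float out_scale = out_data_scale / lhs_scale / rhs_scale;

  const int8_t lhs[4] = {100, -64, 32, 1};
  const int8_t rhs[4] = {50, 64, -32, 1};
  int8_t out[4];
  for (int i = 0; i < 4; ++i) {
    // Same arithmetic as the PR's inner loop: truncating cast, no rounding.
    out[i] = static_cast<int8_t>(lhs[i] * rhs[i] * out_scale);
  }
  for (int i = 0; i < 4; ++i) std::printf("%d ", out[i]);  // 39 -32 -8 0
  std::printf("\n");
  return 0;
}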