[QNN] Implement 'qnn.softmax' #14536

Merged · 7 commits · May 15, 2023
4 changes: 4 additions & 0 deletions python/tvm/relay/qnn/op/qnn.py
@@ -1304,3 +1304,7 @@ def leaky_relu(x, alpha, input_scale, input_zero_point, output_scale, output_zer
        output_scale,
        output_zero_point,
    )


def softmax(x, scale, zero_point, output_scale, output_zero_point, axis=-1):
    """Quantized softmax operator, taking the input and output quantization parameters explicitly."""
    return _make.softmax(x, axis, scale, zero_point, output_scale, output_zero_point)
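For reference, a minimal sketch of calling the new wrapper from Python; the scale and zero-point values below are illustrative, not taken from this PR:

import tvm
from tvm import relay

# Build a qnn.softmax call over an int8 tensor and infer its type.
x = relay.var("x", shape=(1, 10), dtype="int8")
y = relay.qnn.op.softmax(
    x,
    scale=relay.const(0.08, "float32"),
    zero_point=relay.const(0, "int32"),
    output_scale=relay.const(1.0 / 256.0, "float32"),
    output_zero_point=relay.const(-128, "int32"),
    axis=-1,
)
mod = relay.transform.InferType()(tvm.IRModule.from_expr(y))
print(mod)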
13 changes: 13 additions & 0 deletions python/tvm/relay/transform/fake_quantization_to_integer.py
@@ -633,3 +633,16 @@ def take(expr, type_map):

    out = relay.op.take(arg, indices, **expr.attrs)
    return [out, t]


@register_fake_quantization_to_integer("nn.softmax")
def softmax(expr, type_map):
    """Rewrite a softmax op"""
    arg = expr.args[0]
    arg_t = type_map[arg]
    out_t = type_map[expr]

    out = relay.qnn.op.softmax(
        arg, arg_t.scale, arg_t.zero_point, out_t.scale, out_t.zero_point, **expr.attrs
    )
    return [out, out_t]
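Schematically, the handler above swaps a fake-quantized softmax region for the QNN op, pulling the affine parameters from type_map (a rough before/after sketch, not actual printed IR):

# before FakeQuantizationToInteger:
#   %0 = qnn.dequantize(%x, %in_scale, %in_zp)
#   %1 = nn.softmax(%0, axis=1)
#   %2 = qnn.quantize(%1, %out_scale, %out_zp, out_dtype="int8")
#
# after the rewrite:
#   %0 = qnn.softmax(%x, %in_scale, %in_zp, %out_scale, %out_zp, axis=1)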
144 changes: 144 additions & 0 deletions src/relay/qnn/op/softmax.cc
@@ -0,0 +1,144 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* \file src/relay/qnn/op/softmax.cc
* \brief QNN softmax operator.
*/
#include <tvm/relay/analysis.h>
#include <tvm/relay/op_attr_types.h>

#include "op_common.h"
#include "tvm/ir/expr.h"
#include "tvm/relay/attrs/nn.h"
#include "tvm/relay/type.h"
#include "tvm/runtime/data_type.h"
#include "tvm/runtime/logging.h"
#include "tvm/topi/reduction.h"

namespace tvm {
namespace relay {
namespace qnn {

bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                   const TypeReporter& reporter) {
  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
  ICHECK_EQ(types.size(), 6);
  const auto* x = types[0].as<TensorTypeNode>();
  if (x == nullptr) return false;
  ICHECK(x->dtype == DataType::Int(8))
      << "Expected quantized softmax type(int8) for input but was " << x->dtype;

  // Check the types of scale and zero points.
  for (size_t i = 1; i < 5; ++i) {
    if (types[i].as<IncompleteTypeNode>()) {
      return false;
    }
  }

  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // output_scale
  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // output_zero_point

  // Assign types for scale and zero points.
  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // output_scale
  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // output_zero_point

  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
  // IdentityRel infer type function.
  Array<Type> tensor_types = {types[0], types[5]};
  return IdentityRel(tensor_types, 2, attrs, reporter);
}

// Positional relay function to create quantized softmax operator used by frontend FFI.
Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
                          Expr output_zero_point) {
  auto attrs = make_object<SoftmaxAttrs>();
  attrs->axis = axis;
  static const Op& op = Op::Get("qnn.softmax");
  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
}

/*
 * \brief Canonicalizes the QNN softmax op.
 */
Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
                            const Array<tvm::relay::Type>& arg_types) {
  // Expected: input, scale, zero_point, output_scale, output_zero_point
  ICHECK_EQ(new_args.size(), 5);

  const Expr input_scale = new_args[1];
  const Expr input_zero_point = new_args[2];
  const Expr output_scale = new_args[3];
  const Expr output_zero_point = new_args[4];
  const int axis = attrs.as<SoftmaxAttrs>()->axis;

  // Refer to Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf

  const Expr quantized_data =
      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
Contributor:
Is it necessary to cast input_zero_point to int32? It is assumed that input_zero_point is of type int32.

Contributor Author:
This is apparently a redundant cast. I should remove it.

  const Expr x_0 = ConvertDtype(
Contributor:
[image: screenshot of the ShiftExp step of Algorithm 1 from the paper]

This is in ShiftExp under Algorithm 1.

First off, if S > 1 then we already have issues, as x_0 (I_0 in the paper) may be 0...

For attention I would expect the output activation range to potentially be very large (see the LLM int8 paper), so having a high scale factor is not unreasonable for some schemes.

Contributor:
Maybe I am not understanding something, but this seems like an obvious flaw...

I think you can get around this by not rounding I_0 in Algorithm 1 but keeping it a float and rounding when needed. However, this would introduce runtime FLOPS.

Contributor (@AndrewZhaoLuo, Apr 10, 2023):
Another option: when decomposing 2^(S * I_p) into integer and decimal components, you instead decompose 2^[(S * 2^-n * I_p) * 2^n]. At compile time we can choose n so that S * 2^-n << 1 to get around this problem. You can then apply the decomposition routine to the inner term in parentheses, and the outer 2^n merely becomes another shift.

Contributor (@AndrewZhaoLuo, Apr 11, 2023):
I believe S here is defined as:

[image: screenshot of the definition of the scale S from the paper]

which can be arbitrary depending on the range m.

Member:
We can fall back to the fp32-based impl when S > 1.

Contributor:
Yeah, I think that is another reasonable thing to do.
      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
  const Expr max = Max(quantized_data, {axis}, true, false);
  const Expr x = Subtract(quantized_data, max);

  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
  const int n = 8;
  const int m = 30;
  const int bits = 8;
  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
  const Expr q = Divide(x_p, Negative(x_0));
  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
  const Expr exps = LeftShift(x_b, Subtract(const_i32(n), q));
  const Expr sums = Sum(exps, {axis}, true, false);
  const Expr output =
      RightShift(Multiply(Divide(const_i32(1 << m), sums), exps), const_i32(m - (bits - 1)));
  const Expr requantized =
      Requantize(output, arg_types[0].as<TensorTypeNode>()->shape,
                 MakeConstantScalar(DataType::Float(32), 1.f / (1 << (bits - 1))), const_i32(0),
                 output_scale, output_zero_point, DataType::Int(bits), 0);

  return requantized;
}

RELAY_REGISTER_OP("qnn.softmax")
    .describe("Softmax for quantized tensors.")
    .set_attrs_type<SoftmaxAttrs>()
    .set_num_inputs(5)
    .add_argument("data", "Quantized Tensor", "The input data.")
    .add_argument("scale", "Tensor", "The quantization scale of the input tensor.")
    .add_argument("zero_point", "Tensor", "The quantization zero_point of the input tensor.")
    .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.")
    .add_argument("output_zero_point", "Tensor",
                  "The quantization zero_point of the output tensor.")
    .set_support_level(11)
    .add_type_rel("QSoftmax", QnnSoftmaxRel)
    .set_attr<TNonComputational>("TNonComputational", true)
    .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnSoftmaxCanonicalize);

TVM_REGISTER_GLOBAL("relay.qnn.op._make.softmax").set_body_typed(MakeQuantizedSoftmax);

} // namespace qnn
} // namespace relay
} // namespace tvm
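For intuition, below is a NumPy sketch of the canonicalization above (Algorithm 1 of arXiv:2207.01405). It is only a reference approximation, not the TVM lowering: it assumes the input scale is well below 1 (so round(1/scale) is nonzero, the S > 1 concern raised in the review), it handles q > n with an explicit right shift, and it uses a plain round-and-clip in place of qnn.requantize.

import numpy as np

def qnn_softmax_reference(x_q, in_scale, in_zp, out_scale, out_zp, axis=-1):
    """Integer-only softmax approximation; a sketch for intuition, not the TVM kernel."""
    n, m, bits = 8, 30, 8
    q_data = x_q.astype(np.int64) - int(in_zp)
    x0 = int(np.round(1.0 / in_scale))                   # I_0 = round(1/S); assumes in_scale << 1
    x = q_data - q_data.max(axis=axis, keepdims=True)    # non-positive values
    x_p = x + (x >> 1) - (x >> 4)                        # ~ x * log2(e), arithmetic shifts
    q = x_p // (-x0)                                     # integer quotient (non-negative)
    r = x_p - q * (-x0)                                  # remainder in (-I_0, 0]
    x_b = (r >> 1) + x0                                  # linear approximation of 2^(S*r), scale S
    shift = n - q                                        # exps carries scale S * 2^-n
    exps = np.where(shift >= 0, x_b << np.maximum(shift, 0), x_b >> np.maximum(-shift, 0))
    sums = exps.sum(axis=axis, keepdims=True)
    out = ((1 << m) // sums * exps) >> (m - (bits - 1))  # probabilities with scale 1/2^(bits-1)
    # Requantize from (scale = 1/2^(bits-1), zero point 0) to the requested output quantization.
    deq = out / float(1 << (bits - 1))
    return np.clip(np.round(deq / out_scale) + out_zp, -128, 127).astype(np.int8)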
4 changes: 4 additions & 0 deletions src/relay/transforms/pattern_utils.h
@@ -770,6 +770,10 @@ inline Expr Copy(Expr data) {
  return Call(op, {data}, Attrs(), {});
}

inline Expr Max(Expr data, Array<Integer> axis, bool keepdims, bool exclude) {
  return MakeReduce(data, axis, keepdims, exclude, "max");
}

inline Expr Mean(Expr data, Array<Integer> axis, bool keepdims, bool exclude) {
  return MakeReduce(data, axis, keepdims, exclude, "mean");
}
31 changes: 31 additions & 0 deletions tests/python/relay/test_pass_fake_quantization_to_integer.py
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
    compare_fq_to_int(op, [x_np])


def test_fake_quantize_softmax():
Contributor (@ibsidorenko, Apr 10, 2023):
It looks like this test does not fully check the accuracy.

I printed the output and found that ~70% of the output values are equal to 0.0 in this test. This is because the output of the qnn.quantize operation is equal to "-128", which is not a very interesting/representative case for the "int8" data type.

Can you modify this test slightly, in the following way:

  1. Remove the second qnn.dequantize. Let's check the output of qnn.dequantize + softmax + qnn.quantize only.
  2. Play with the QNN parameters (zero point, scale) in such a way that the output of quantize falls in the range [-100, +100], for example, not only "-128" like now.

P.S.
I checked the output after qnn.quantize and see that some of the values differ by 7. I think that is too much and the accuracy is unsatisfactory... any thoughts?

Contributor Author (@maekawatoshiki, Apr 11, 2023):
> Play with the QNN parameters (zero point, scale) in such a way that the output of quantize falls in the range [-100, +100], for example, not only "-128" like now.

I'm not sure why we need to modify the QNN parameters (of qnn.quantize). I think it's enough to change the value range specified in x_np = np.random.randint(-128, 127, ...) so that the qnn.quantize output falls in the range [-100, +100]. (Sorry if my understanding is wrong.)

Contributor Author:
> P.S.
> I checked the output after qnn.quantize and see that some of the values differ by 7. I think that is too much and the accuracy is unsatisfactory... any thoughts?

When all computation is performed in integer-only arithmetic, how big a diff is generally acceptable for a softmax operation? I'm not sure about this. I'm also not sure whether any other algorithm outperforms the current implementation.

Contributor:
Yes, sure, it's up to you. My main concern here is to avoid the case where all (or almost all) output values are equal to "-128" (it is not a representative case for the "int8" data type).

    shape = [50, 10]
    x = relay.var("x", shape=shape, dtype="int8")

    x = relay.qnn.op.dequantize(x, relay.const(0.08), relay.const(-48))
Contributor:
Things we probably want to test:

  1. Different dtypes
  2. Different scale factors
  3. Different distributions along the axis of reduction (e.g. a flat distribution should give flat probabilities, multiple spikes, etc.)

    op = relay.op.nn.softmax(x, axis=1)
    op = relay.qnn.op.quantize(op, relay.const(0.0039), relay.const(-128), out_dtype="int8")
    op = relay.qnn.op.dequantize(op, relay.const(0.0039), relay.const(-128))

    x_np = np.random.randint(-128, 127, size=shape, dtype="int8")
    args = [x_np]

    mod = tvm.IRModule.from_expr(op)
    mod = tvm.relay.transform.InferType()(mod)
    mod_int = tvm.relay.transform.FakeQuantizationToInteger(hard_fail=True)(mod)
    assert not tvm.ir.structural_equal(mod, mod_int)

    result = (
        relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm")
        .evaluate()(*args)
        .numpy()
    )
    result_int = (
        relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
        .evaluate()(*args)
        .numpy()
    )

    assert np.allclose(result_int, result, atol=0.05)
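Following the review suggestions above, a hedged sketch of how the test could additionally sweep input scales and value ranges; the parameter values and the test name are hypothetical, not part of this PR:

import numpy as np
import pytest
import tvm
from tvm import relay


@pytest.mark.parametrize("in_scale", [0.02, 0.08, 0.1])
@pytest.mark.parametrize("low,high", [(-128, 127), (-20, 20), (0, 5)])
def test_fake_quantize_softmax_sweep(in_scale, low, high):
    # Same structure as test_fake_quantize_softmax, but sweeping the input scale
    # and the input value distribution.
    shape = [50, 10]
    x = relay.var("x", shape=shape, dtype="int8")

    op = relay.qnn.op.dequantize(x, relay.const(in_scale), relay.const(0))
    op = relay.op.nn.softmax(op, axis=1)
    op = relay.qnn.op.quantize(op, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8")
    op = relay.qnn.op.dequantize(op, relay.const(1.0 / 256.0), relay.const(-128))

    x_np = np.random.randint(low, high, size=shape, dtype="int8")

    mod = tvm.IRModule.from_expr(op)
    mod = tvm.relay.transform.InferType()(mod)
    mod_int = tvm.relay.transform.FakeQuantizationToInteger(hard_fail=True)(mod)
    assert not tvm.ir.structural_equal(mod, mod_int)

    result = (
        relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm").evaluate()(x_np).numpy()
    )
    result_int = (
        relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm").evaluate()(x_np).numpy()
    )

    assert np.allclose(result_int, result, atol=0.05)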


if __name__ == "__main__":
    tvm.testing.main()