From dc6ed1a5967c644b03874fd1f8a503f0b80be6bd Mon Sep 17 00:00:00 2001
From: XiaobingZhang
Date: Sun, 6 Mar 2022 22:55:11 +0800
Subject: [PATCH] enable Conv+LeakyRelu fusion (#589)

---
 .../csrc/jit/cpu/kernels/ConvPacked.cpp       | 10 +++
 .../csrc/jit/cpu/kernels/ConvPacked.h         |  5 ++
 .../jit/cpu/passes/graph_rewrite_conv.cpp     | 25 +++++-
 .../jit/cpu/passes/register_dnnl_jit_ops.cpp  | 83 ++++++++++++++-----
 tests/cpu/test_jit.py                         | 33 +++++++-
 5 files changed, 133 insertions(+), 23 deletions(-)

diff --git a/intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.cpp b/intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.cpp
index c7a11c157..c97ae9c37 100644
--- a/intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.cpp
+++ b/intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.cpp
@@ -59,6 +59,16 @@ at::Tensor convolution_relu_run(
   return op_context->run(input, ideep::attr_t::fuse_relu());
 }
 
+at::Tensor convolution_leaky_relu_run(
+    const at::Tensor& input,
+    at::Scalar alpha,
+    const c10::intrusive_ptr<ConvolutionOpContext>& op_context) {
+  IPEX_RECORD_FUNCTION(
+      "ipex_prepack::convolution_leaky_relu_run", std::vector<c10::IValue>({}));
+  auto alpha_value = alpha.to<float>();
+  return op_context->run(input, ideep::attr_t::fuse_relu(1.0, alpha_value));
+}
+
 at::Tensor convolution_sigmoid_run(
     const at::Tensor& input,
     const c10::intrusive_ptr<ConvolutionOpContext>& op_context) {
diff --git a/intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.h b/intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.h
index 22ec24872..367440963 100644
--- a/intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.h
+++ b/intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.h
@@ -32,6 +32,11 @@ at::Tensor convolution_relu_run(
     const at::Tensor& input,
     const c10::intrusive_ptr<ConvolutionOpContext>& op_context);
 
+at::Tensor convolution_leaky_relu_run(
+    const at::Tensor& input,
+    at::Scalar alpha,
+    const c10::intrusive_ptr<ConvolutionOpContext>& op_context);
+
 at::Tensor convolution_sigmoid_run(
     const at::Tensor& input,
     const c10::intrusive_ptr<ConvolutionOpContext>& op_context);
diff --git a/intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite_conv.cpp b/intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite_conv.cpp
index 2329f6af2..b304104cb 100644
--- a/intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite_conv.cpp
+++ b/intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite_conv.cpp
@@ -106,13 +106,15 @@ void insertPrePackedConvOp(std::shared_ptr<Graph>& graph) {
 
 void fuseConvWithEltwise(std::shared_ptr<Graph>& graph) {
   SubgraphRewriter rewriter_relu, rewriter_sigmoid, rewriter_hardtanh,
-      rewriter_elu, rewriter_swish, rewriter_silu;
+      rewriter_elu, rewriter_swish, rewriter_silu, rewriter_leaky_relu;
   std::array<std::string, 2> relu_operators = {"relu", "relu_"};
   std::array<std::string, 2> sigmoid_operators = {"sigmoid", "sigmoid_"};
   std::array<std::string, 2> hardtanh_operators = {"hardtanh", "hardtanh_"};
   std::array<std::string, 2> elu_operators = {"elu", "elu_"};
   std::array<std::string, 2> mul_operators = {"mul", "mul_"};
   std::array<std::string, 2> silu_operators = {"silu", "silu_"};
+  std::array<std::string, 2> leaky_relu_operators = {
+      "leaky_relu", "leaky_relu_"};
 
   auto conv_relu_rstring = CodeTemplate(R"(
     graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %kernel_size:int[], %groups:int, %output_channel:int, %weight_is_channels_last:bool, %weight_is_prepacked:bool, %input_size:int[]):
@@ -187,6 +189,19 @@ void fuseConvWithEltwise(std::shared_ptr<Graph>& graph) {
       %res = ipex_prepack::convolution_swish_run(%input, %packed_weight)
       return (%res))";
 
+  auto conv_leaky_relu_rstring = CodeTemplate(R"(
+    graph(%input, %weight, %bias, %stride:int[], %padding:int[], 
%dilation:int[], %kernel_size:int[], %groups:int, %output_channel:int, %weight_is_channels_last:bool, %weight_is_prepacked:bool, %input_size:int[], %alpha):
+      %packed_weight : __torch__.torch.classes.ipex_prepack.ConvolutionOpContext = ipex_prepack::convolution_prepack(%weight, %bias, %stride, %padding, %dilation, %kernel_size, %groups, %output_channel, %weight_is_channels_last, %weight_is_prepacked, %input_size)
+      %x = ipex_prepack::convolution_run(%input, %packed_weight)
+      %res = aten::${leaky_relu}(%x, %alpha)
+      return (%res))");
+
+  std::string conv_leaky_relu_fused = R"(
+    graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %kernel_size:int[], %groups:int, %output_channel:int, %weight_is_channels_last:bool, %weight_is_prepacked:bool, %input_size:int[], %alpha):
+      %packed_weight : __torch__.torch.classes.ipex_prepack.ConvolutionOpContext = ipex_prepack::convolution_leaky_relu_prepack(%weight, %bias, %stride, %padding, %dilation, %kernel_size, %groups, %output_channel, %weight_is_channels_last, %weight_is_prepacked, %input_size, %alpha)
+      %res = ipex_prepack::convolution_leaky_relu_run(%input, %alpha, %packed_weight)
+      return (%res))";
+
   for (const auto& relu : relu_operators) {
     TemplateEnv env;
     env.s("relu", relu);
@@ -238,12 +253,20 @@ void fuseConvWithEltwise(std::shared_ptr<Graph>& graph) {
     return no_input_scale;
   };
 
+  for (const auto& leaky_relu : leaky_relu_operators) {
+    TemplateEnv env;
+    env.s("leaky_relu", leaky_relu);
+    rewriter_leaky_relu.RegisterRewritePattern(
+        conv_leaky_relu_rstring.format(env), conv_leaky_relu_fused);
+  }
+
   rewriter_relu.runOnGraph(graph);
   rewriter_sigmoid.runOnGraph(graph);
   rewriter_hardtanh.runOnGraph(graph);
   rewriter_elu.runOnGraph(graph, filter_conv2d_elu);
   rewriter_swish.runOnGraph(graph);
   rewriter_silu.runOnGraph(graph);
+  rewriter_leaky_relu.runOnGraph(graph);
 }
 
 void fuseConvAddRelu(std::shared_ptr<Graph>& graph) {
diff --git a/intel_extension_for_pytorch/csrc/jit/cpu/passes/register_dnnl_jit_ops.cpp b/intel_extension_for_pytorch/csrc/jit/cpu/passes/register_dnnl_jit_ops.cpp
index 19cad82c2..04c06eb28 100644
--- a/intel_extension_for_pytorch/csrc/jit/cpu/passes/register_dnnl_jit_ops.cpp
+++ b/intel_extension_for_pytorch/csrc/jit/cpu/passes/register_dnnl_jit_ops.cpp
@@ -182,25 +182,6 @@ RegisterOperators op({
           };
         },
         aliasAnalysisFromSchema()),
-    Operator(
-        "ipex_prepack::convolution_hardtanh_run(Tensor input, Scalar "
-        "lower_bound, Scalar upper_bound, "
-        "__torch__.torch.classes.ipex_prepack.ConvolutionOpContext "
-        "W_prepack) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [](Stack* stack) {
-            auto result = convolution_hardtanh_run(
-                (std::move(peek(stack, 0, 4))).toTensor(),
-                (std::move(peek(stack, 1, 4))).toScalar(),
-                (std::move(peek(stack, 2, 4))).toScalar(),
-                (std::move(peek(stack, 3, 4)))
-                    .toCustomClass<ConvolutionOpContext>());
-            drop(stack, 4);
-            pack(stack, std::move(result));
-            return 0;
-          };
-        },
-        aliasAnalysisFromSchema()),
     Operator(
         "ipex_prepack::convolution_elu_prepack(" CONV_PREPACK_ARGS
         ", Scalar alpha, Scalar scale, Scalar input_scale) "
@@ -233,6 +214,52 @@ RegisterOperators op({
           };
         },
         aliasAnalysisFromSchema()),
+    Operator(
+        "ipex_prepack::convolution_leaky_relu_prepack(" CONV_PREPACK_ARGS
+        ", Scalar alpha) "
+        "-> __torch__.torch.classes.ipex_prepack.ConvolutionOpContext",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            auto alpha_value =
+                (std::move(peek(stack, 11, 12))).toScalar().to<float>();
+            auto result = IpexConvolutionOpContext::create_context(
+                std::move((std::move(peek(stack, 0, 
12))).toTensor()),
+                std::move(toOptionalTensor(std::move(peek(stack, 1, 12)))),
+                std::move((std::move(peek(stack, 2, 12))).toIntVector()),
+                std::move((std::move(peek(stack, 3, 12))).toIntVector()),
+                std::move((std::move(peek(stack, 4, 12))).toIntVector()),
+                std::move((std::move(peek(stack, 5, 12))).toIntVector()),
+                (std::move(peek(stack, 6, 12))).toInt(),
+                (std::move(peek(stack, 7, 12))).toInt(),
+                (std::move(peek(stack, 8, 12))).toBool(),
+                (std::move(peek(stack, 9, 12))).toBool(),
+                std::move((std::move(peek(stack, 10, 12))).toIntVector()),
+                ideep::attr_t::fuse_relu(1.0, alpha_value));
+            drop(stack, 12);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
+    Operator(
+        "ipex_prepack::convolution_hardtanh_run(Tensor input, Scalar "
+        "lower_bound, Scalar upper_bound, "
+        "__torch__.torch.classes.ipex_prepack.ConvolutionOpContext "
+        "W_prepack) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            auto result = convolution_hardtanh_run(
+                (std::move(peek(stack, 0, 4))).toTensor(),
+                (std::move(peek(stack, 1, 4))).toScalar(),
+                (std::move(peek(stack, 2, 4))).toScalar(),
+                (std::move(peek(stack, 3, 4)))
+                    .toCustomClass<ConvolutionOpContext>());
+            drop(stack, 4);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
     Operator(
         "ipex_prepack::convolution_elu_run(Tensor input, Scalar alpha, "
         "Scalar scale, Scalar input_scale, "
@@ -253,6 +280,24 @@ RegisterOperators op({
           };
         },
         aliasAnalysisFromSchema()),
+    Operator(
+        "ipex_prepack::convolution_leaky_relu_run(Tensor input, Scalar alpha, "
+        "__torch__.torch.classes.ipex_prepack.ConvolutionOpContext "
+        "W_prepack) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            auto result = convolution_leaky_relu_run(
+                (std::move(peek(stack, 0, 3))).toTensor(),
+                (std::move(peek(stack, 1, 3))).toScalar(),
+                (std::move(peek(stack, 2, 3)))
+                    .toCustomClass<ConvolutionOpContext>());
+            drop(stack, 3);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
+
     Operator(
         "ipex_prepack::convolution_bottleneck_run(Tensor(a!) 
input, " "__torch__.torch.classes.ipex_prepack.ConvolutionOpContext W_prepack1, " diff --git a/tests/cpu/test_jit.py b/tests/cpu/test_jit.py index e8525fc91..28da16b49 100644 --- a/tests/cpu/test_jit.py +++ b/tests/cpu/test_jit.py @@ -154,6 +154,20 @@ def __init__(self, dim, in_channels, out_channels, **kwargs): def forward(self, x): return F.relu(self.conv(x), inplace=True) +class ConvLeakyRelu_Fixed(nn.Module): + def __init__(self, dim, in_channels, out_channels, **kwargs): + super(ConvLeakyRelu_Fixed, self).__init__() + seed = 2018 + torch.manual_seed(seed) + self.conv = conv_module[dim](in_channels, out_channels, bias=False, **kwargs) + self.leaky_relu = nn.LeakyReLU(0.1) + + def forward(self, x): + x = self.conv(x) + x = self.leaky_relu(x) + return x + + class Conv_Relu_Add(nn.Module): def __init__(self, dim, in_channels, out_channels, **kwargs): super(Conv_Relu_Add, self).__init__() @@ -717,7 +731,7 @@ def forward(self, x): class LinearSwishNaive(nn.Module): def __init__(self, in_feature, out_feature): - super(LinearSwishNaive, self).__init__() + super(LinearSwishNaive, self).__init__() self.linear = nn.Linear(in_feature, out_feature) self.sigmoid = nn.Sigmoid() @@ -1824,11 +1838,24 @@ def test_output_conv_relu(self): self._test_output( ConvRelu_Fixed(dim, in_channels, out_channels, kernel_size=kernel_size, stride=1), x, - kind_in_graph="ipex_prepack::convolution_relu_run") + kind_in_graph="ipex_prepack::convolution_relu_run", + kind_not_in_graph="ipex_prepack::convolution_relu_prepack") self._test_output_bf16( ConvRelu_Fixed(dim, in_channels, out_channels, kernel_size=kernel_size, stride=1), x, kind_in_graph="ipex_prepack::convolution_relu_run", + kind_not_in_graph="ipex_prepack::convolution_relu_prepack", + prec=0.02) + self._test_output( + ConvLeakyRelu_Fixed(dim, in_channels, out_channels, kernel_size=kernel_size, stride=1), + x, + kind_in_graph="ipex_prepack::convolution_leaky_relu_run", + kind_not_in_graph="ipex_prepack::convolution_leaky_relu_prepack") + self._test_output_bf16( + ConvLeakyRelu_Fixed(dim, in_channels, out_channels, kernel_size=kernel_size, stride=1), + x, + kind_in_graph="ipex_prepack::convolution_leaky_relu_run", + kind_not_in_graph="ipex_prepack::convolution_leaky_relu_prepack", prec=0.02) def test_output_conv_sum(self): @@ -2323,7 +2350,7 @@ def _test_onednn_fp32(model, input, kind_in_graph, prec=5e-3): res_jit = tr_model(input) self.assertEqual(res_ref, res_jit) self.assertTrue(any(n.kind() == kind_in_graph for n in trace_graph.nodes())) - + _test_onednn_fp32( LinearSwish_v1(3, 32, bias=True), torch.rand(32, 3),