enable conv+leaky_relu fusion (#648)
* enable conv+leaky_relu fusion

* fix test issue

Co-authored-by: Wang Weihan <eikan.wang@intel.com>
XiaobingSuper and EikanWang authored Mar 31, 2022
1 parent 7831bbc commit d760313
Showing 6 changed files with 137 additions and 24 deletions.
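
For context, a minimal sketch of how this fusion would be exercised from Python; the module and shapes are illustrative, and the flow (ipex.optimize, then torch.jit.trace/freeze) is the usual IPEX inference recipe rather than anything added by this commit:

import torch
import torch.nn as nn
import intel_extension_for_pytorch as ipex

class ConvLeakyRelu(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 16, kernel_size=3, bias=False)
        self.leaky_relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        # With this commit, conv followed by leaky_relu is rewritten by the
        # IPEX JIT pass into a single ipex_prepack::convolution_leaky_relu_run
        # node (the tests below check for exactly that node kind).
        return self.leaky_relu(self.conv(x))

model = ConvLeakyRelu().eval()
x = torch.randn(1, 3, 56, 56)
model = ipex.optimize(model)
with torch.no_grad():
    traced = torch.jit.trace(model, x)
    traced = torch.jit.freeze(traced)
    y = traced(x)  # runs the fused convolution + leaky_relu kernel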
10 changes: 10 additions & 0 deletions intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.cpp
@@ -59,6 +59,16 @@ at::Tensor convolution_relu_run(
return op_context->run(input, ideep::attr_t::fuse_relu());
}

at::Tensor convolution_leaky_relu_run(
const at::Tensor& input,
at::Scalar alpha,
const c10::intrusive_ptr<ConvolutionOpContext>& op_context) {
IPEX_RECORD_FUNCTION(
"ipex_prepack::convolution_leaky_relu_run", std::vector<c10::IValue>({}));
auto alpha_value = alpha.to<float>();
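  // fuse_relu(scale, alpha) with a non-zero alpha maps to the oneDNN eltwise
  // ReLU post-op with negative slope `alpha`, i.e. leaky ReLU.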
return op_context->run(input, ideep::attr_t::fuse_relu(1.0, alpha_value));
}

at::Tensor convolution_sigmoid_run(
const at::Tensor& input,
const c10::intrusive_ptr<ConvolutionOpContext>& op_context) {
5 changes: 5 additions & 0 deletions intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConvPacked.h
@@ -32,6 +32,11 @@ at::Tensor convolution_relu_run(
const at::Tensor& input,
const c10::intrusive_ptr<ConvolutionOpContext>& op_context);

at::Tensor convolution_leaky_relu_run(
const at::Tensor& input,
at::Scalar alpha,
const c10::intrusive_ptr<ConvolutionOpContext>& op_context);

at::Tensor convolution_sigmoid_run(
const at::Tensor& input,
const c10::intrusive_ptr<ConvolutionOpContext>& op_context);
@@ -108,13 +108,15 @@ void insertPrePackedConvOp(std::shared_ptr<Graph>& graph) {

void fuseConvWithEltwise(std::shared_ptr<Graph>& graph) {
SubgraphRewriter rewriter_relu, rewriter_sigmoid, rewriter_hardtanh,
rewriter_elu, rewriter_swish, rewriter_silu;
rewriter_elu, rewriter_swish, rewriter_silu, rewriter_leaky_relu;
std::array<std::string, 2> relu_operators = {"relu", "relu_"};
std::array<std::string, 2> sigmoid_operators = {"sigmoid", "sigmoid_"};
std::array<std::string, 2> hardtanh_operators = {"hardtanh", "hardtanh_"};
std::array<std::string, 2> elu_operators = {"elu", "elu_"};
std::array<std::string, 2> mul_operators = {"mul", "mul_"};
std::array<std::string, 2> silu_operators = {"silu", "silu_"};
std::array<std::string, 2> leaky_relu_operators = {
"leaky_relu", "leaky_relu_"};

auto conv_relu_rstring = CodeTemplate(R"(
graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %kernel_size:int[], %groups:int, %output_channel:int, %weight_is_channels_last:bool, %weight_is_prepacked:bool, %input_size:int[]):
@@ -189,6 +191,19 @@ void fuseConvWithEltwise(std::shared_ptr<Graph>& graph) {
%res = ipex_prepack::convolution_swish_run(%input, %packed_weight)
return (%res))";

auto conv_leaky_relu_rstring = CodeTemplate(R"(
graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %kernel_size:int[], %groups:int, %output_channel:int, %weight_is_channels_last:bool, %weight_is_prepacked:bool, %input_size:int[], %alpha):
%packed_weight : __torch__.torch.classes.ipex_prepack.ConvolutionOpContext = ipex_prepack::convolution_prepack(%weight, %bias, %stride, %padding, %dilation, %kernel_size, %groups, %output_channel, %weight_is_channels_last, %weight_is_prepacked, %input_size)
%x = ipex_prepack::convolution_run(%input, %packed_weight)
%res = aten::${leaky_relu}(%x, %alpha)
return (%res))");

std::string conv_leaky_relu_fused = R"(
graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %kernel_size:int[], %groups:int, %output_channel:int, %weight_is_channels_last:bool, %weight_is_prepacked:bool, %input_size:int[], %alpha):
%packed_weight : __torch__.torch.classes.ipex_prepack.ConvolutionOpContext = ipex_prepack::convolution_leaky_relu_prepack(%weight, %bias, %stride, %padding, %dilation, %kernel_size, %groups, %output_channel, %weight_is_channels_last, %weight_is_prepacked, %input_size, %alpha)
%res = ipex_prepack::convolution_leaky_relu_run(%input, %alpha, %packed_weight)
return (%res))";
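  // The first IR string matches ipex_prepack::convolution_run followed by
  // aten::leaky_relu / aten::leaky_relu_; the second is the replacement that
  // fuses the pair into convolution_leaky_relu_prepack / _run, forwarding
  // %alpha to both ops.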

for (const auto& relu : relu_operators) {
TemplateEnv env;
env.s("relu", relu);
@@ -240,12 +255,20 @@ void fuseConvWithEltwise(std::shared_ptr<Graph>& graph) {
return no_input_scale;
};

for (const auto& leaky_relu : leaky_relu_operators) {
TemplateEnv env;
env.s("leaky_relu", leaky_relu);
rewriter_leaky_relu.RegisterRewritePattern(
conv_leaky_relu_rstring.format(env), conv_leaky_relu_fused);
}

rewriter_relu.runOnGraph(graph);
rewriter_sigmoid.runOnGraph(graph);
rewriter_hardtanh.runOnGraph(graph);
rewriter_elu.runOnGraph(graph, filter_conv2d_elu);
rewriter_swish.runOnGraph(graph);
rewriter_silu.runOnGraph(graph);
rewriter_leaky_relu.runOnGraph(graph);
}

void fuseConvAddRelu(std::shared_ptr<Graph>& graph) {
@@ -28,10 +28,12 @@ void PrePackingOpsFolder(Block* b) {
n->kind() ==
Symbol::fromQualString(
"ipex_prepack::convolution_add_relu_prepack") ||

n->kind() == Symbol::fromQualString("ipex_prepack::linear_prepack") ||
n->kind() ==
Symbol::fromQualString("ipex_prepack::conv_transpose2d_prepack"));
Symbol::fromQualString("ipex_prepack::conv_transpose2d_prepack") ||
n->kind() ==
Symbol::fromQualString(
"ipex_prepack::convolution_leaky_relu_prepack"));
};
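  // Including convolution_leaky_relu_prepack in this predicate lets the folder
  // treat it like the other prepack ops, so its ConvolutionOpContext can be
  // constant-folded once (assuming the usual freeze-time folding flow) instead
  // of being re-created on every forward.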

std::unordered_set<Node*> nodes_to_delete;
@@ -182,25 +182,6 @@ RegisterOperators op({
};
},
aliasAnalysisFromSchema()),
Operator(
"ipex_prepack::convolution_hardtanh_run(Tensor input, Scalar "
"lower_bound, Scalar upper_bound, "
"__torch__.torch.classes.ipex_prepack.ConvolutionOpContext "
"W_prepack) -> Tensor",
[](const Node* node) -> Operation {
return [](Stack* stack) {
auto result = convolution_hardtanh_run(
(std::move(peek(stack, 0, 4))).toTensor(),
(std::move(peek(stack, 1, 4))).toScalar(),
(std::move(peek(stack, 2, 4))).toScalar(),
(std::move(peek(stack, 3, 4)))
.toCustomClass<ConvolutionOpContext>());
drop(stack, 4);
pack(stack, std::move(result));
return 0;
};
},
aliasAnalysisFromSchema()),
Operator(
"ipex_prepack::convolution_elu_prepack(" CONV_PREPACK_ARGS
", Scalar alpha, Scalar scale, Scalar input_scale) "
Expand Down Expand Up @@ -233,6 +214,52 @@ RegisterOperators op({
};
},
aliasAnalysisFromSchema()),
Operator(
"ipex_prepack::convolution_leaky_relu_prepack(" CONV_PREPACK_ARGS
", Scalar alpha) "
"-> __torch__.torch.classes.ipex_prepack.ConvolutionOpContext",
[](const Node* node) -> Operation {
return [](Stack* stack) {
auto alpha_value =
(std::move(peek(stack, 11, 12))).toScalar().to<float>();
auto result = IpexConvolutionOpContext::create_context(
std::move((std::move(peek(stack, 0, 12))).toTensor()),
std::move(toOptionalTensor(std::move(peek(stack, 1, 12)))),
std::move((std::move(peek(stack, 2, 12))).toIntVector()),
std::move((std::move(peek(stack, 3, 12))).toIntVector()),
std::move((std::move(peek(stack, 4, 12))).toIntVector()),
std::move((std::move(peek(stack, 5, 12))).toIntVector()),
(std::move(peek(stack, 6, 12))).toInt(),
(std::move(peek(stack, 7, 12))).toInt(),
(std::move(peek(stack, 8, 12))).toBool(),
(std::move(peek(stack, 9, 12))).toBool(),
std::move((std::move(peek(stack, 10, 12))).toIntVector()),
ideep::attr_t::fuse_relu(1.0, alpha_value));
drop(stack, 12);
pack(stack, std::move(result));
return 0;
};
},
aliasAnalysisFromSchema()),
Operator(
"ipex_prepack::convolution_hardtanh_run(Tensor input, Scalar "
"lower_bound, Scalar upper_bound, "
"__torch__.torch.classes.ipex_prepack.ConvolutionOpContext "
"W_prepack) -> Tensor",
[](const Node* node) -> Operation {
return [](Stack* stack) {
auto result = convolution_hardtanh_run(
(std::move(peek(stack, 0, 4))).toTensor(),
(std::move(peek(stack, 1, 4))).toScalar(),
(std::move(peek(stack, 2, 4))).toScalar(),
(std::move(peek(stack, 3, 4)))
.toCustomClass<ConvolutionOpContext>());
drop(stack, 4);
pack(stack, std::move(result));
return 0;
};
},
aliasAnalysisFromSchema()),
Operator(
"ipex_prepack::convolution_elu_run(Tensor input, Scalar alpha, "
"Scalar scale, Scalar input_scale, "
@@ -253,6 +280,24 @@ RegisterOperators op({
};
},
aliasAnalysisFromSchema()),
Operator(
"ipex_prepack::convolution_leaky_relu_run(Tensor input, Scalar alpha, "
"__torch__.torch.classes.ipex_prepack.ConvolutionOpContext "
"W_prepack) -> Tensor",
[](const Node* node) -> Operation {
return [](Stack* stack) {
auto result = convolution_leaky_relu_run(
(std::move(peek(stack, 0, 3))).toTensor(),
(std::move(peek(stack, 1, 3))).toScalar(),
(std::move(peek(stack, 2, 3)))
.toCustomClass<ConvolutionOpContext>());
drop(stack, 3);
pack(stack, std::move(result));
return 0;
};
},
aliasAnalysisFromSchema()),

Operator(
"ipex_prepack::convolution_bottleneck_run(Tensor(a!) input, "
"__torch__.torch.classes.ipex_prepack.ConvolutionOpContext W_prepack1, "
32 changes: 30 additions & 2 deletions tests/cpu/test_jit.py
@@ -156,6 +156,19 @@ def __init__(self, dim, in_channels, out_channels, **kwargs):
def forward(self, x):
return F.relu(self.conv(x), inplace=True)

class ConvLeakyRelu_Fixed(nn.Module):
def __init__(self, dim, in_channels, out_channels, **kwargs):
super(ConvLeakyRelu_Fixed, self).__init__()
seed = 2018
torch.manual_seed(seed)
self.conv = conv_module[dim](in_channels, out_channels, bias=False, **kwargs)
self.leaky_relu = nn.LeakyReLU(0.1)

def forward(self, x):
x = self.conv(x)
x = self.leaky_relu(x)
return x

class Conv_Relu_Add(nn.Module):
def __init__(self, dim, in_channels, out_channels, **kwargs):
super(Conv_Relu_Add, self).__init__()
@@ -1787,12 +1800,27 @@ def test_output_conv_relu(self):
self._test_output(
ConvRelu_Fixed(dim, in_channels, out_channels, kernel_size=kernel_size, stride=1),
x,
kind_in_graph="ipex_prepack::convolution_relu_run")
kind_in_graph="ipex_prepack::convolution_relu_run",
kind_not_in_graph="ipex_prepack::convolution_relu_prepack")
self._test_output_bf16(
ConvRelu_Fixed(dim, in_channels, out_channels, kernel_size=kernel_size, stride=1),
x,
kind_in_graph="ipex_prepack::convolution_relu_run",
prec=0.08)
kind_not_in_graph="ipex_prepack::convolution_relu_prepack",
prec=0.08,
levels=['O1'])
self._test_output(
ConvLeakyRelu_Fixed(dim, in_channels, out_channels, kernel_size=kernel_size, stride=1),
x,
kind_in_graph="ipex_prepack::convolution_leaky_relu_run",
kind_not_in_graph="ipex_prepack::convolution_leaky_relu_prepack")
self._test_output_bf16(
ConvLeakyRelu_Fixed(dim, in_channels, out_channels, kernel_size=kernel_size, stride=1),
x,
kind_in_graph="ipex_prepack::convolution_leaky_relu_run",
kind_not_in_graph="ipex_prepack::convolution_leaky_relu_prepack",
prec=0.02,
levels=['O1'])

def test_output_conv_sum(self):
batch_size = 8
