Add support for conv1D (fixes w2l) (#5)

* Fix mps_executor_runner build when using cmake
* Add CI scripts to run supported executorch networks through MPS (#1)
* Add CI scripts to run supported executorch networks through MPS
* Fix CI
* Fix CI #2
* Don't specialize the executable for the current device (#3)
  Co-authored-by: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
* Update CI script to run test_mps (#4)
* Update CI script to run test_mps
* Update cmdline
* Add lint for mps
* Update lint script
* Update lint script
* Fix lint
* Fix lint
* Fix lint
* Fix lint
* Fix lint
* Fix lint
* Add support for conv1D (fixes w2l)
* Perf imprv - Map conv2D to depthwiseConv3D
* Add support for PyTorch style printing of output tensors
* Fix lint
* Remove unused headers
* Remove unused headers #2

---------

Co-authored-by: Grzegorz George Pawelczak <grzpawelczak@gmail.com>
DenisVieriu97 · Jan 19, 2024 · e1c740d · e1c740d
1 parent bee46d9
commit e1c740d
Show file tree

Hide file tree

Showing 4 changed files with 78 additions and 47 deletions.
diff --git a/backends/apple/mps/mps_preprocess.py b/backends/apple/mps/mps_preprocess.py
@@ -263,17 +263,13 @@ def preprocess(  # noqa: C901
                     from typing import cast
 
                     input_node = cast(torch.fx.Node, node.args[0]).meta["val"]
-                    sizes = input_node.size()
-                    dim0 = sizes[0]
-                    dim1 = sizes[1]
+                    weight_node = cast(torch.fx.Node, node.args[1]).meta["val"]
                     groups = int(node.args[8])
-                    group_in_channels = dim1
-                    group_out_channels = int(dim0 / groups)
 
                     # Convolution is depthwise if groups = input channels and output channel
                     # is a positive multiple of input channels
-                    is_depthwise_conv = (group_in_channels == 1) and (
-                        group_out_channels % group_in_channels == 0
+                    is_depthwise_conv = (groups > 1 and weight_node.size(1) == 1) and (
+                        input_node.dim() >= 4 and weight_node.dim() >= 4
                     )
 
                     if node.args[2] is None:

diff --git a/backends/apple/mps/operations/ConvolutionOps.mm b/backends/apple/mps/operations/ConvolutionOps.mm
@@ -9,47 +9,81 @@
 using namespace torch;
 
 PyMPSGraphTensor*
-MPSGraphModule::conv2D(MPSGraphTensor* primaryTensor, MPSGraphTensor* secondaryTensor,
-                       MPSGraphTensor* biasTensor, IntArrayRef stride,
-                       IntArrayRef padding, IntArrayRef dilation, bool transpose,
-                       IntArrayRef outputPadding, int64_t groups, bool is_depthwise) {
+MPSGraphModule::conv2D(
+  MPSGraphTensor* primaryTensor,
+  MPSGraphTensor* secondaryTensor,
+  MPSGraphTensor* biasTensor,
+  IntArrayRef stride,
+  IntArrayRef padding,
+  IntArrayRef dilation,
+  bool transpose,
+  IntArrayRef outputPadding,
+  int64_t groups,
+  bool is_depthwise) {
+  TORCH_CHECK([primaryTensor.shape count] < 5, "ConvTranspose 3D is not supported on MPS delegate");
+  TORCH_CHECK([primaryTensor dataType] == MPSDataTypeFloat32 || [primaryTensor dataType] == MPSDataTypeFloat16, "ConvTranspose 3D is not supported on MPS delegate");
+
+  // Handle 1D convolution.
+  bool isConv1D = ([secondaryTensor.shape count] == 3);
+  if (isConv1D) {
+    primaryTensor = [mpsGraph expandDimsOfTensor:primaryTensor
+                                            axis:2
+                                            name:@"unsqueezeInput"];
+    secondaryTensor = [mpsGraph expandDimsOfTensor:secondaryTensor
+                                              axis:2
+                                              name:@"unsqueezeWeight"];
+    if (stride.size() == 1) {
+      stride = IntArrayRef{1, stride[0]};
+      padding = IntArrayRef{0, padding[0]};
+      dilation = IntArrayRef{1, dilation[0]};
+      outputPadding = IntArrayRef{0, outputPadding[0]};
+    }
+  }
 
   if(is_depthwise){
-    MPSGraphDepthwiseConvolution2DOpDescriptor* desc = [MPSGraphDepthwiseConvolution2DOpDescriptor
-                                    descriptorWithStrideInX:stride[0]
-                                                  strideInY:stride[1]
-                                            dilationRateInX:dilation[0]
-                                            dilationRateInY:dilation[1]
-                                                paddingLeft:padding[1]
-                                               paddingRight:padding[1]
-                                                 paddingTop:padding[0]
-                                              paddingBottom:padding[0]
-                                               paddingStyle:MPSGraphPaddingStyleExplicit
-                                                 dataLayout:MPSGraphTensorNamedDataLayoutNCHW
-                                              weightsLayout:MPSGraphTensorNamedDataLayoutOIHW];
+    MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor =
+          [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
+    depthWiseConv3dDescriptor.strides =
+        @[ @1, [[NSNumber alloc] initWithInteger:stride[0]], [[NSNumber alloc] initWithInteger:stride[1]] ];
+    depthWiseConv3dDescriptor.dilationRates =
+        @[ @1, [[NSNumber alloc] initWithInteger:dilation[0]], [[NSNumber alloc] initWithInteger:dilation[1]] ];
 
-    MPSGraphTensor* depthwiseConv2DTensor = [mpsGraph depthwiseConvolution2DWithSourceTensor:primaryTensor
-                                                                               weightsTensor:secondaryTensor
-                                                                                  descriptor:desc
-                                                                                        name:@"depthwiseConv2D"];
+    depthWiseConv3dDescriptor.paddingStyle = MPSGraphPaddingStyleExplicit;
+    depthWiseConv3dDescriptor.paddingValues = @[
+      @0,
+      @0,
+      [[NSNumber alloc] initWithInteger:padding[0]],
+      [[NSNumber alloc] initWithInteger:padding[0]],
+      [[NSNumber alloc] initWithInteger:padding[1]],
+      [[NSNumber alloc] initWithInteger:padding[1]]
+    ];
+    depthWiseConv3dDescriptor.channelDimensionIndex = -3LL;
+    MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:secondaryTensor
+                                                            dimension:-3
+                                                        withDimension:-4
+                                                                     name:nil];
+    MPSGraphTensor* depthwiseConvTensor = [mpsGraph depthwiseConvolution3DWithSourceTensor:primaryTensor
+                                                                             weightsTensor:weightTransposeTensor
+                                                                                descriptor:depthWiseConv3dDescriptor
+                                                                                      name:nil];
     //Can be a nullptr
     if(biasTensor){
         //Need to add correct dimension to bias to avoid broadcasting issues
         biasTensor = [mpsGraph expandDimsOfTensor:biasTensor
                                           axes:@[@0, @2, @3]
                                           name:nil];
-        depthwiseConv2DTensor = [mpsGraph additionWithPrimaryTensor:depthwiseConv2DTensor
+        depthwiseConvTensor = [mpsGraph additionWithPrimaryTensor:depthwiseConvTensor
                                                     secondaryTensor:biasTensor
                                                                name:@"depthwiseConv2DWithBiasAdd"];
     }
 
-    return depthwiseConv2DTensor;
+    return depthwiseConvTensor;
   } else {
     MPSGraphConvolution2DOpDescriptor* desc = [MPSGraphConvolution2DOpDescriptor
-                                    descriptorWithStrideInX:stride[0]
-                                                  strideInY:stride[1]
-                                            dilationRateInX:dilation[0]
-                                            dilationRateInY:dilation[1]
+                                    descriptorWithStrideInX:stride[1]
+                                                  strideInY:stride[0]
+                                            dilationRateInX:dilation[1]
+                                            dilationRateInY:dilation[0]
                                                      groups:groups
                                                 paddingLeft:padding[1]
                                                paddingRight:padding[1]
@@ -64,7 +98,7 @@
                                                                 descriptor:desc
                                                                       name:@"conv2D"];
 
-    //Can be a nullptr
+    // Can be a nullptr
     if(biasTensor){
         //Need to add correct dimension to bias to avoid broadcasting issues
         biasTensor = [mpsGraph expandDimsOfTensor:biasTensor
@@ -74,6 +108,13 @@
                                            secondaryTensor:biasTensor
                                                       name:@"conv2DWithBiasAdd"];
     }
+
+    if (isConv1D) {
+      conv2DTensor = [mpsGraph squeezeTensor:conv2DTensor
+                                        axis:2
+                                        name:@"squeeze"];
+    }
+
     return conv2DTensor;
   }
 }

diff --git a/examples/apple/mps/executor_runner/mps_executor_runner.mm b/examples/apple/mps/executor_runner/mps_executor_runner.mm
@@ -12,11 +12,8 @@
  * It uses the original bundled input data from the flatbuffer file.
  */
 
-#import <Foundation/Foundation.h>
-#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
-#import <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
-
 #include <memory>
+#include <iostream>
 
 #include <gflags/gflags.h>
 
@@ -31,6 +28,7 @@
 #include <executorch/extension/data_loader/buffer_data_loader.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/platform/runtime.h>
+#include <executorch/extension/evalue_util/print_evalue.h>
 
 #include <chrono>
 using namespace std::chrono;
@@ -440,15 +438,10 @@ MemoryManager memory_manager(
   std::vector<EValue> outputs(method->outputs_size());
   status = method->get_outputs(outputs.data(), outputs.size());
   ET_CHECK(status == Error::Ok);
-  for (EValue& output : outputs) {
-    // TODO(T159700776): This assumes that all outputs are fp32 tensors. Add
-    // support for other EValues and Tensor dtypes, and print tensors in a more
-    // readable way.
-    auto output_tensor = output.toTensor();
-    auto data_output = output_tensor.const_data_ptr<float>();
-    for (size_t j = 0; j < output_tensor.numel(); ++j) {
-      ET_LOG(Info, "%f", data_output[j]);
-    }
+  // Print the first and last 100 elements of long lists of scalars.
+  std::cout << torch::executor::util::evalue_edge_items(100);
+  for (int i = 0; i < outputs.size(); ++i) {
+    std::cout << "Output " << i << ": " << outputs[i] << std::endl;
   }
 
   // Dump the profiling data to the specified file.

diff --git a/examples/apple/mps/executor_runner/targets.bzl b/examples/apple/mps/executor_runner/targets.bzl
@@ -22,6 +22,7 @@ def define_common_targets():
             deps = [
                 "//executorch/backends/apple/mps:mps",
                 "//executorch/runtime/executor:program",
+                "//executorch/extension/evalue_util:print_evalue",
                 "//executorch/extension/data_loader:file_data_loader",
                 "//executorch/kernels/portable:generated_lib_all_ops",
                 "//executorch/extension/data_loader:file_data_loader",