microsoft · yufenglee · Mar 17, 2022 · Mar 17, 2022
diff --git a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
@@ -967,35 +967,41 @@ static void PermuteInput(api::GraphRef& graph, api::NodeRef& node, size_t i, con
   node.SetInput(i, gather_output);
 }
 
-static bool HandleResize(HandlerArgs& args) {
-  auto inputs = args.node.Inputs();
-  int64_t rank_int = gsl::narrow_cast<int64_t>(args.perm.size());
-
-  if (args.ctx.opset < 11) {
-    PermuteInput(args.ctx.graph, args.node, 1, args.perm_inv);
-  } else {
-    if (inputs[1] != "") {
-      std::vector<int64_t> double_perm_inv = args.perm_inv;
-      double_perm_inv.reserve(2 * args.perm_inv.size());
-      for (int64_t p : args.perm_inv) {
-        double_perm_inv.push_back(p + rank_int);
-      }
-      PermuteInput(args.ctx.graph, args.node, 1, double_perm_inv);
-    }
-    for (size_t i = 2; i < inputs.size(); ++i) {
-      if (inputs[i] != "") {
-        PermuteInput(args.ctx.graph, args.node, i, args.perm_inv);
-      }
-    }
-  }
-
-  TransposeFirstInput(args.ctx, args.node, args.perm_inv);
-  TransposeOutputs(args.ctx, args.node, args.perm);
-
-  return true;
-}
+// static bool HandleResize(HandlerArgs& args) {
+//  auto inputs = args.node.Inputs();
+//  int64_t rank_int = gsl::narrow_cast<int64_t>(args.perm.size());
+//
+//  auto p = ChannelFirstToLastPerm(rank_int);
+//  auto& perm = p == args.perm ? args.perm : args.perm_inv;
+//  auto& perm_inv = p == args.perm ? args.perm_inv : args.perm;
+//
+//  if (args.ctx.opset < 11) {
+//     PermuteInput(args.ctx.graph, args.node, 1, perm);
+//   } else {
+//     if (inputs[1] != "") {
+//       std::vector<int64_t> double_perm_inv = perm;
+//       double_perm_inv.reserve(2 * args.perm.size());
+//       for (int64_t p1 : perm) {
+//         double_perm_inv.push_back(p1 + rank_int);
+//       }
+//       PermuteInput(args.ctx.graph, args.node, 1, double_perm_inv);
+//     }
+//     for (size_t i = 2; i < inputs.size(); ++i) {
+//       if (inputs[i] != "") {
+//         PermuteInput(args.ctx.graph, args.node, i, perm);
+//       }
+//     }
+//   }
+//
+//   TransposeFirstInput(args.ctx, args.node, perm);
+//   TransposeOutputs(args.ctx, args.node, perm_inv);
+//
+//   SwapNodeOpTypeAndDomain(args.ctx.graph, args.node, args.node.OpType(), "com.microsoft.nhwc");
+//
+//   return true;
+// }
 
-constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};
+// constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};
 
 static bool HandlePad(HandlerArgs& args) {
   size_t rank = args.perm.size();
@@ -1691,7 +1697,9 @@ static const std::unordered_map<std::string_view, const HandlerInfo&> handler_ma
     {"Split", split_handler},
     {"Shape", shape_handler},
     {"Pad", pad_handler},
-    {"Resize", resize_handler},
+    // TODO: re-enable the Resize handler after adding NHWC support to the Upsample op on CPU
+    // https://github.com/microsoft/onnxruntime/issues/9857
+    //  {"Resize", resize_handler},
     {"ReduceSum", reduce_sum_handler},
 
     {"ReduceLogSum", reduce_op_handler},

diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc
@@ -420,15 +420,13 @@ struct BilinearParams {
 // that amounts to 'Bilinear' Upsampling/Resizing in the sense that it assumes
 // the scale values for the outermost 2 dimensions are 1.
 // This is the common use-case where the 4-D input (batched multi-channel images)
-// is usually of shapes:
-// - [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale]
-// - [N, H, W, C] and the scales are [1.0, height_scale, width_scale, 1.0]
-static BilinearParams SetupUpsampleBilinear(const int64_t input_height,
-                                            const int64_t input_width,
-                                            const int64_t output_height,
-                                            const int64_t output_width,
-                                            const float height_scale,
-                                            const float width_scale,
+// is usually of shape [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale]
+static BilinearParams SetupUpsampleBilinear(int64_t input_height,
+                                            int64_t input_width,
+                                            int64_t output_height,
+                                            int64_t output_width,
+                                            float height_scale,
+                                            float width_scale,
                                             const std::vector<float>& roi,
                                             AllocatorPtr& alloc,
                                             const GetOriginalCoordinateFunc& get_original_coordinate) {
@@ -525,25 +523,26 @@ static BilinearParams SetupUpsampleBilinear(const int64_t input_height,
 }
 
 template <typename T>
-void UpsampleBilinear(const int64_t batch_size,
-                      const int64_t num_channels,
-                      const int64_t input_height,
-                      const int64_t input_width,
-                      const int64_t output_height,
-                      const int64_t output_width,
-                      const float height_scale,
-                      const float width_scale,
+void UpsampleBilinear(int64_t batch_size,
+                      int64_t num_channels,
+                      int64_t input_height,
+                      int64_t input_width,
+                      int64_t output_height,
+                      int64_t output_width,
+                      float height_scale,
+                      float width_scale,
                       const std::vector<float>& roi,
-                      const bool use_extrapolation,
-                      const float extrapolation_value,
-                      const T* const XdataBase,
-                      T* const YdataBase,
+                      bool use_extrapolation,
+                      float extrapolation_value,
+                      const T* XdataBase,
+                      T* YdataBase,
                       AllocatorPtr& alloc,
                       const GetOriginalCoordinateFunc& get_original_coordinate,
                       concurrency::ThreadPool* tp) {
   BilinearParams p = SetupUpsampleBilinear(input_height, input_width, output_height, output_width,
                                            height_scale, width_scale, roi,
                                            alloc, get_original_coordinate);
+
   for (int64_t n = 0; n < batch_size; ++n) {
     concurrency::ThreadPool::TrySimpleParallelFor(
         tp, num_channels,
@@ -1066,65 +1065,22 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
     case UpsampleMode::LINEAR: {
       // Supports 'bilinear' and 'trilinear' sampling only
 
-      //'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1 or
-      // 4-D input with outermost and innermost scales as 1
+      // 'bilinear' == 2-D input, or 4-D input with the outermost 2 scales equal to 1
       if (dims.size() == 2 || dims.size() == 4) {
         bool is_2D = dims.size() == 2;
 
-        int64_t batch_size;
-        int64_t num_channels;
-        int64_t input_height;
-        int64_t input_width;
-
-        int64_t output_height;
-        int64_t output_width;
-
-        float height_scale;
-        float width_scale;
-
-        if (is_2D) {
-          batch_size = 1;
-          num_channels = 1;
-          input_height = dims[0];
-          input_width = dims[1];
-
-          output_height = output_dims[0];
-          output_width = output_dims[1];
-
-          height_scale = scales[0];
-          width_scale = scales[1];
-        } else {
-          if (scales[1] == 1.0f) {
-            batch_size = dims[0];
-            num_channels = dims[1];
-            input_height = dims[2];
-            input_width = dims[3];
-
-            output_height = output_dims[2];
-            output_width = output_dims[3];
-
-            height_scale = scales[2];
-            width_scale = scales[3];
-          } else {
-            ORT_ENFORCE(scales[3] == 1.0f, "4-D input with innermost scale (usually channel of NHWC) as 1.");
-
-            batch_size = dims[0];
-            num_channels = dims[3];
-            input_height = dims[1];
-            input_width = dims[2];
-
-            output_height = output_dims[1];
-            output_width = output_dims[2];
-
-            height_scale = scales[1];
-            width_scale = scales[2];
-          }
-        }
+        const int64_t batch_size = is_2D ? 1 : dims[0];
+        const int64_t num_channels = is_2D ? 1 : dims[1];
+        const int64_t input_height = is_2D ? dims[0] : dims[2];
+        const int64_t input_width = is_2D ? dims[1] : dims[3];
+
+        const int64_t output_height = is_2D ? output_dims[0] : output_dims[2];
+        const int64_t output_width = is_2D ? output_dims[1] : output_dims[3];
 
         AllocatorPtr alloc;
         ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
         UpsampleBilinear(batch_size, num_channels, input_height, input_width, output_height, output_width,
-                         height_scale, width_scale, roi,
+                         is_2D ? scales[0] : scales[2], is_2D ? scales[1] : scales[3], roi,
                          use_extrapolation_, extrapolation_value_, X->Data<T>(),
                          Y->MutableData<T>(), alloc, get_original_coordinate_,
                          output_height * output_width > 64 ? context->GetOperatorThreadPool() : nullptr);