new Dynamic Runtime Support (pytorch#2224)
Summary:
Pull Request resolved: pytorch#2224

Updating XNNPACK's runtime to use the new dynamic shape APIs from XNNPACK. This allows us to run models with variable-size inputs. The runtime remains backwards compatible.
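
For orientation, here is a minimal sketch of the per-inference flow these dynamic shape APIs enable. All XNNPACK calls below are the same ones used in this diff; `rt`, the ids, and the buffers are placeholder names, and a single external input and output are assumed:

```cpp
// Minimal sketch, assuming one external input and one external output.
#include <xnnpack.h>

xnn_status run_once(
    xnn_runtime_t rt,
    uint32_t input_id,
    uint32_t output_id,
    float* in_data,
    float* out_data,
    size_t num_dims,
    const size_t* dims) {
  // 1. Tell the runtime the current shape of the external input.
  xnn_status status =
      xnn_reshape_external_value(rt, input_id, num_dims, dims);
  if (status != xnn_status_success) {
    return status;
  }
  // 2. Propagate the new shapes through the whole graph.
  status = xnn_reshape_runtime(rt);
  if (status != xnn_status_success) {
    return status;
  }
  // 3. Bind data pointers and run.
  const struct xnn_external_value externals[2] = {
      {input_id, in_data}, {output_id, out_data}};
  status = xnn_setup_runtime_v2(rt, 2, externals);
  if (status != xnn_status_success) {
    return status;
  }
  return xnn_invoke_runtime(rt);
}
```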

Reviewed By: GregoryComer

Differential Revision: D54471748
mcr229 authored and facebook-github-bot committed Mar 4, 2024
1 parent 9637930 commit d85e004
Showing 5 changed files with 251 additions and 306 deletions.
68 changes: 18 additions & 50 deletions backends/xnnpack/runtime/XNNCompiler.cpp
@@ -149,8 +149,8 @@ Error defineTensor(
ValuePtr value,
GraphPtr flatbuffer_graph,
const uint8_t* constant_data_ptr,
XNNExecutor* executor,
MemoryAllocator* runtime_allocator) {
std::vector<uint32_t>& input_ids,
std::vector<uint32_t>& output_ids) {
const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;

@@ -272,7 +272,6 @@ Error defineTensor(
/*external_id=*/tensor_value->external_id(),
/*flags=*/tensor_value->flags(),
/*id_out=*/&float_id);
executor->addDynamicQinput(float_id);

// Define dynamic conversion from float to qdint8
status = xnn_define_convert(
@@ -391,10 +390,13 @@

// map serialized id to newly generated id
remapped_ids.emplace(std::make_pair(tensor_value->id_out(), id));
// Append this external id to the arg list for execute(*args) to extract from
// as args[external_id]
if (tensor_value->external_id() != XNN_INVALID_VALUE_ID) {
executor->append_arg(tensor_value->external_id());

// Add external ids to either list of input or output ids
if (tensor_value->flags() & XNN_VALUE_FLAG_EXTERNAL_INPUT) {
input_ids.push_back(tensor_value->external_id());
}
if (tensor_value->flags() & XNN_VALUE_FLAG_EXTERNAL_OUTPUT) {
output_ids.push_back(tensor_value->external_id());
}

return Error::Ok;
@@ -1594,6 +1596,9 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
// Invalid ids do not need to be remapped
remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID);

// External Ids for inputs and outputs
std::vector<uint32_t> input_ids;
std::vector<uint32_t> output_ids;
Error err = Error::Ok;
for (auto value : *flatbuffer_graph->xvalues()) {
err = defineTensor(
@@ -1602,8 +1607,8 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
value,
flatbuffer_graph,
constant_data,
executor,
runtime_allocator);
input_ids,
output_ids);

if (err != Error::Ok) {
return err;
@@ -1635,47 +1640,10 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
"XNN Runtime creation failed with code: %s",
xnn_status_to_string(status));

executor->initialize(runtime_ptr); // NOLINT: runtime_ptr is non-null as
// error is checked above.

// HACK FOR FC/BC this is only to support old dq_datatype
if (executor->qinputs_.size() > 0) {
// qinputs_ is only set when using the old dq linear path, at which point
// we need to override input_ids_. This works based on the assumption that
// the old dqlinear path will be a single-node, single-input delegate
for (uint32_t id : executor->qinputs_) {
executor->input_ids_.emplace_back(id);
}
} else {
for (auto old_id : *flatbuffer_graph->input_ids()) {
executor->input_ids_.emplace_back(remapped_ids.at(old_id));
}
}
// External ids need to be in order for wiring with args
std::sort(executor->input_ids_.begin(), executor->input_ids_.end());

for (auto old_id : *flatbuffer_graph->output_ids()) {
executor->output_ids_.emplace_back(remapped_ids.at(old_id));
}
// External ids need to be in order for wiring with args
std::sort(executor->output_ids_.begin(), executor->output_ids_.end());

if (!executor->qinputs_.empty() && flatbuffer_graph->xnodes()->size() > 0 &&
flatbuffer_graph->xnodes()->Get(0)->xnode_union_type() ==
fb_xnnpack::XNodeUnion::XNNFullyConnected) {
#ifdef ENABLE_DYNAMIC_QUANTIZATION
// This delegate is for DQLinear which supports dynamic input shapes
if (executor->getNumInputs() < 1 || executor->getNumOutputs() != 1) {
ET_LOG(
Error,
"DQLinear should have at least one input and exactly one output");
return Error::NotSupported;
}
#else
ET_LOG(Error, "DQ Linear is not supported");
return Error::NotSupported;
#endif
}
err = executor->initialize( // NOLINT: runtime_ptr is non-null
runtime_ptr,
std::move(input_ids),
std::move(output_ids));

return err;
};
213 changes: 197 additions & 16 deletions backends/xnnpack/runtime/XNNExecutor.cpp
@@ -13,26 +13,207 @@ namespace executor {
namespace xnnpack {
namespace delegate {

Error XNNExecutor::set_external_input(
uint32_t id,
Tensor* input,
struct XNNShape* shape) {
// TODO(T165403530): Test ensure accuracy for int64 --> float32 conversion
if (input->scalar_type() == ScalarType::Long) {
// Input data type is int64. However, XNNPACK doesn't support
// int64. This means that the data needs to be cast to float
// in order for XNNPACK to properly use it.
const int64_t* data_64 = input->const_data_ptr<int64_t>();
float* data_f32 = input->mutable_data_ptr<float>();
for (int j = 0; j < input->numel(); j++) {
data_f32[j] = data_64[j];
using Tensor = exec_aten::Tensor;
using ScalarType = exec_aten::ScalarType;
using SizesType = exec_aten::SizesType;

/**
* Initializes the XNNExecutor with the runtime and the external ids of the
* graph inputs and outputs. The ids are sorted, and externals_ is resized
* to the total number of inputs and outputs.
*/
__ET_NODISCARD Error XNNExecutor::initialize(
xnn_runtime_t runtime,
std::vector<uint32_t>&& input_ids,
std::vector<uint32_t>&& output_ids) {
runtime_ = std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>(
runtime, xnn_delete_runtime);

auto error = profiler_.initialize(runtime);
if (error != Error::Ok) {
ET_LOG(
Error,
"Failed to start profiling: %u.",
static_cast<unsigned int>(error));
}

// Initialize the external values for inputs and outputs
// mapping the executorch arg idx to external IDs
input_ids_ = std::move(input_ids);
std::sort(input_ids_.begin(), input_ids_.end());

output_ids_ = std::move(output_ids);
std::sort(output_ids_.begin(), output_ids_.end());

externals_.resize(input_ids_.size() + output_ids_.size());

return Error::Ok;
}

/**
* Prepares the args for XNNPACK Runtime.
*
* Creates an array of xnn_external_value entries from the EValues passed in.
* Reshapes all the external input tensors, in case any input shapes have
* changed, then reshapes the entire runtime, propagating shape information
* through the runtime.
*
* Note: the external ids given to the external tensors in the XNNPACK
* runtime correspond to their index in the list of args passed into
* delegate->execute()
*/
__ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) {
// Create xnn_externals_value from evalue args
xnn_status status;
for (uint32_t i = 0; i < externals_.size(); ++i) {
ET_CHECK_OR_RETURN_ERROR(
args[i]->isTensor(),
InvalidArgument,
"Expected argument to delegate at index %u to be a Tensor, but got %" PRIu32,
i,
static_cast<uint32_t>(args[i]->tag));
if (i < input_ids_.size()) {
externals_[i].id = input_ids_[i];
} else {
externals_[i].id = output_ids_[i - input_ids_.size()];
}
uint32_t ext_id = externals_[i].id;

Tensor* tensor = &args[ext_id]->toTensor();
externals_[i].data = tensor->mutable_data_ptr<float>();

// Reshape runtime inputs
if (i < input_ids_.size()) {
size_t num_dims = tensor->dim();
size_t dims[XNN_MAX_TENSOR_DIMS];
for (size_t d = 0; d < num_dims; ++d) {
dims[d] = tensor->size(d);
}
status =
xnn_reshape_external_value(runtime_.get(), ext_id, num_dims, dims);
ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"Internal Error: Reshape Input Tensor Failed with code: %s",
xnn_status_to_string(status));
}
}
if (input->dim() != shape->num_dims) {
ET_LOG(Error, "Input dim mismatch between tensor and shape struct");
// Propagate Input Shape and Memory Plan for increased allocation
status = xnn_reshape_runtime(runtime_.get());

ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"Internal Error: Propagating input shapes failed with code: %s",
xnn_status_to_string(status));

return Error::Ok;
}
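
The wiring above relies on the invariant stated in the doc comment: the external IDs assigned at export time equal the caller's argument indices, so the sorted id lists give positional bindings. A small standalone sketch of just that index mapping, with hypothetical id values:

```cpp
// Standalone sketch of prepare_args' arg-wiring scheme. The id values are
// hypothetical; the invariant is that sorted external IDs equal arg indices.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint32_t> input_ids = {0, 1}; // sorted external IDs of inputs
  std::vector<uint32_t> output_ids = {2};   // sorted external IDs of outputs
  size_t total = input_ids.size() + output_ids.size();
  for (size_t i = 0; i < total; ++i) {
    // The first input_ids.size() externals bind inputs; the rest, outputs.
    uint32_t ext_id = (i < input_ids.size())
        ? input_ids[i]
        : output_ids[i - input_ids.size()];
    std::printf("externals_[%zu] binds to args[%u]\n", i, ext_id);
  }
  return 0;
}
```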

/**
* Runs the XNNPACK Runtime.
*
* We first set up the runtime by feeding the externals_ to runtime setup,
* after which we execute the runtime through xnn_invoke_runtime.
*/
__ET_NODISCARD Error XNNExecutor::forward(BackendExecutionContext& context) {
ET_CHECK_OR_RETURN_ERROR(
runtime_ != nullptr,
Internal,
"XNNPACK Delegate did not compile correctly");

xnn_status status = xnn_setup_runtime_v2(
runtime_.get(), externals_.size(), externals_.data());

ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"Internal Error: Setting up the runtime failed with code: %s",
xnn_status_to_string(status));

auto error = profiler_.start(context.event_tracer());
if (error != Error::Ok) {
ET_LOG(
Error,
"Failed to start profiling: %u.",
static_cast<unsigned int>(error));
}

status = xnn_invoke_runtime(runtime_.get());

error = profiler_.end();
if (error != Error::Ok) {
ET_LOG(
Error,
"Failed to end profiling: %u.",
static_cast<unsigned int>(error));
}

ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"XNN Runtime invoke failed with code: %s",
xnn_status_to_string(status));

return Error::Ok;
}

/**
* Prepares the outputs for ExecuTorch
*
* Resizes the output tensors based on the output shapes returned by
* the xnnpack runtime.
*
* Note: For arg_max pooling, we recast the output index tensor. Since
* XNNPACK gives the index tensor to us as int32, we need to convert it
* back to int64 for ExecuTorch.
*/
__ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const {
size_t output_idx_start = input_ids_.size();
for (size_t i = output_idx_start; i < externals_.size(); ++i) {
uint32_t ext_id = output_ids_[i - output_idx_start];
Tensor* out_tensor = &args[ext_id]->toTensor();

size_t num_dim;
size_t dims[XNN_MAX_TENSOR_DIMS];

// Fetch the updated output shapes from xnnpack runtime
xnn_status status =
xnn_get_external_value_shape(runtime_.get(), ext_id, &num_dim, dims);

ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"Internal Error: Failed to retrieve graph output shapes");

// Convert new output shape into SizesType
SizesType expected_output_size[kTensorDimensionLimit];
for (size_t d = 0; d < num_dim; ++d) {
expected_output_size[d] = static_cast<SizesType>(dims[d]);
}

exec_aten::ArrayRef<SizesType> output_size{
expected_output_size, static_cast<size_t>(num_dim)};

ET_LOG(Debug, "Resizing output tensor to a new shape");
Error err = resize_tensor(*out_tensor, output_size);
if (err != Error::Ok) {
ET_LOG(Error, "Failed to resize output tensor for XNNExecutor");
return err;
}

// Output datatype is int64. However, XNNPACK doesn't support
// int64, so XNNPACK wrote the data into this tensor as int32 and it
// must be widened to int64 in place. Copy backwards so that no int32
// element is overwritten before it has been read.
if (out_tensor->scalar_type() == ScalarType::Long) {
int64_t* data_64 = out_tensor->mutable_data_ptr<int64_t>();
const int32_t* data_32 = out_tensor->const_data_ptr<int32_t>();
for (size_t j = out_tensor->numel(); j-- > 0;) {
data_64[j] = data_32[j];
}
}
}

externals_.emplace_back(xnn_external_value{id, input->mutable_data_ptr()});
return Error::Ok;
}
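
Taken together, the three new methods split one delegate invocation into prepare/run/resize phases. A hedged sketch of how a backend might drive them, assuming the ExecuTorch delegate headers are in scope; the actual call site (the XNNPACK backend's execute()) is not part of this diff:

```cpp
// Hedged lifecycle sketch; run_delegate is a hypothetical wrapper, not the
// backend's real entry point.
Error run_delegate(
    XNNExecutor& executor,
    BackendExecutionContext& context,
    EValue** args) {
  // Bind args to externals and reshape the runtime for current input shapes.
  Error err = executor.prepare_args(args);
  if (err != Error::Ok) {
    return err;
  }
  // Set up the externals and invoke the XNNPACK runtime.
  err = executor.forward(context);
  if (err != Error::Ok) {
    return err;
  }
  // Resize the ExecuTorch output tensors to the shapes XNNPACK produced.
  return executor.resize_outputs(args);
}
```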

