new Dynamic Runtime Support (pytorch#2224)
Summary:
Pull Request resolved: pytorch#2224

Updating XNNPACK's runtime to use the new dynamic shape APIs from XNNPACK. This allows us to run models with variable-size inputs. The runtime remains backwards compatible.
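
For orientation, here is a minimal sketch of the per-inference flow these dynamic shape APIs enable. All XNNPACK calls below are the same ones used in this diff; `rt`, the ids, and the buffers are placeholder names, and a single external input and output are assumed:

```cpp
// Minimal sketch, assuming one external input and one external output.
#include <xnnpack.h>

xnn_status run_once(
    xnn_runtime_t rt,
    uint32_t input_id,
    uint32_t output_id,
    float* in_data,
    float* out_data,
    size_t num_dims,
    const size_t* dims) {
  // 1. Tell the runtime the current shape of the external input.
  xnn_status status =
      xnn_reshape_external_value(rt, input_id, num_dims, dims);
  if (status != xnn_status_success) {
    return status;
  }
  // 2. Propagate the new shapes through the whole graph.
  status = xnn_reshape_runtime(rt);
  if (status != xnn_status_success) {
    return status;
  }
  // 3. Bind data pointers and run.
  const struct xnn_external_value externals[2] = {
      {input_id, in_data}, {output_id, out_data}};
  status = xnn_setup_runtime_v2(rt, 2, externals);
  if (status != xnn_status_success) {
    return status;
  }
  return xnn_invoke_runtime(rt);
}
```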

Reviewed By: GregoryComer

Differential Revision: D54471748
mcr229 authored and facebook-github-bot committed Mar 4, 2024
1 parent 9637930 commit d85e004
Showing 5 changed files with 251 additions and 306 deletions.
68 changes: 18 additions & 50 deletions backends/xnnpack/runtime/XNNCompiler.cpp
@@ -149,8 +149,8 @@ Error defineTensor(
ValuePtr value,
GraphPtr flatbuffer_graph,
const uint8_t* constant_data_ptr,
XNNExecutor* executor,
MemoryAllocator* runtime_allocator) {
std::vector<uint32_t>& input_ids,
std::vector<uint32_t>& output_ids) {
const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;

@@ -272,7 +272,6 @@ Error defineTensor(
/*external_id=*/tensor_value->external_id(),
/*flags=*/tensor_value->flags(),
/*id_out=*/&float_id);
executor->addDynamicQinput(float_id);

// Define dynamic conversion from float to qdint8
status = xnn_define_convert(
@@ -391,10 +390,13 @@

// map serialized id to newly generated id
remapped_ids.emplace(std::make_pair(tensor_value->id_out(), id));
// Append this external id to the arg list for execute(*args) to extract from
// as args[external_id]
if (tensor_value->external_id() != XNN_INVALID_VALUE_ID) {
executor->append_arg(tensor_value->external_id());

// Add external ids to either list of input or output ids
if (tensor_value->flags() & XNN_VALUE_FLAG_EXTERNAL_INPUT) {
input_ids.push_back(tensor_value->external_id());
}
if (tensor_value->flags() & XNN_VALUE_FLAG_EXTERNAL_OUTPUT) {
output_ids.push_back(tensor_value->external_id());
}

return Error::Ok;
@@ -1594,6 +1596,9 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
// Invalid ids do not need to be remapped
remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID);

// External Ids for inputs and outputs
std::vector<uint32_t> input_ids;
std::vector<uint32_t> output_ids;
Error err = Error::Ok;
for (auto value : *flatbuffer_graph->xvalues()) {
err = defineTensor(
@@ -1602,8 +1607,8 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
value,
flatbuffer_graph,
constant_data,
executor,
runtime_allocator);
input_ids,
output_ids);

if (err != Error::Ok) {
return err;
@@ -1635,47 +1640,10 @@ __ET_NODISCARD Error XNNCompiler::compileModel(
"XNN Runtime creation failed with code: %s",
xnn_status_to_string(status));

executor->initialize(runtime_ptr); // NOLINT: runtime_ptr is non-null as
// error is checked above.

// HACK FOR FC/BC this is only to support old dq_datatype
if (executor->qinputs_.size() > 0) {
// qinputs_ is only set when using the old dq linear path, at which point
// we need to override input_ids_. This works based on the assumption that
// the old dqlinear path will be a single-node, single-input delegate
for (uint32_t id : executor->qinputs_) {
executor->input_ids_.emplace_back(id);
}
} else {
for (auto old_id : *flatbuffer_graph->input_ids()) {
executor->input_ids_.emplace_back(remapped_ids.at(old_id));
}
}
// External ids need to be in order for wiring with args
std::sort(executor->input_ids_.begin(), executor->input_ids_.end());

for (auto old_id : *flatbuffer_graph->output_ids()) {
executor->output_ids_.emplace_back(remapped_ids.at(old_id));
}
// External ids need to be in order for wiring with args
std::sort(executor->output_ids_.begin(), executor->output_ids_.end());

if (!executor->qinputs_.empty() && flatbuffer_graph->xnodes()->size() > 0 &&
flatbuffer_graph->xnodes()->Get(0)->xnode_union_type() ==
fb_xnnpack::XNodeUnion::XNNFullyConnected) {
#ifdef ENABLE_DYNAMIC_QUANTIZATION
// This delegate is for DQLinear which supports dynamic input shapes
if (executor->getNumInputs() < 1 || executor->getNumOutputs() != 1) {
ET_LOG(
Error,
"DQLinear should have at least one input and exactly one output");
return Error::NotSupported;
}
#else
ET_LOG(Error, "DQ Linear is not supported");
return Error::NotSupported;
#endif
}
err = executor->initialize( // NOLINT: runtime_ptr is non-null
runtime_ptr,
std::move(input_ids),
std::move(output_ids));

return err;
};
213 changes: 197 additions & 16 deletions backends/xnnpack/runtime/XNNExecutor.cpp
@@ -13,26 +13,207 @@ namespace executor {
namespace xnnpack {
namespace delegate {

Error XNNExecutor::set_external_input(
uint32_t id,
Tensor* input,
struct XNNShape* shape) {
// TODO(T165403530): Test ensure accuracy for int64 --> float32 conversion
if (input->scalar_type() == ScalarType::Long) {
// Input data type is int64. However, XNNPACK doesn't support
// int64. This means that the data needs to be cast to float
// in order for XNNPACK to properly use it.
const int64_t* data_64 = input->const_data_ptr<int64_t>();
float* data_f32 = input->mutable_data_ptr<float>();
for (int j = 0; j < input->numel(); j++) {
data_f32[j] = data_64[j];
using Tensor = exec_aten::Tensor;
using ScalarType = exec_aten::ScalarType;
using SizesType = exec_aten::SizesType;

/**
* Initializes the XNNExecutor with the runtime and the external ids of the
* graph inputs and outputs. The ids are sorted, and externals_ is resized
* to the total number of inputs and outputs.
*/
__ET_NODISCARD Error XNNExecutor::initialize(
xnn_runtime_t runtime,
std::vector<uint32_t>&& input_ids,
std::vector<uint32_t>&& output_ids) {
runtime_ = std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>(
runtime, xnn_delete_runtime);

auto error = profiler_.initialize(runtime);
if (error != Error::Ok) {
ET_LOG(
Error,
"Failed to start profiling: %u.",
static_cast<unsigned int>(error));
}

// Initialize the external values for inputs and outputs
// mapping the executorch arg idx to external IDs
input_ids_ = std::move(input_ids);
std::sort(input_ids_.begin(), input_ids_.end());

output_ids_ = std::move(output_ids);
std::sort(output_ids_.begin(), output_ids_.end());

externals_.resize(input_ids_.size() + output_ids_.size());

return Error::Ok;
}

/**
* Prepares the args for XNNPACK Runtime.
*
* Creates an array of xnn_external_value entries from the EValues passed in.
* Reshapes all the external input tensors, in case any input shapes have
* changed, then reshapes the entire runtime, propagating shape information
* through the runtime.
*
* Note: the external ids given to the external tensors in the XNNPACK
* runtime correspond to their index in the list of args passed into
* delegate->execute()
*/
__ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) {
// Create xnn_externals_value from evalue args
xnn_status status;
for (uint32_t i = 0; i < externals_.size(); ++i) {
ET_CHECK_OR_RETURN_ERROR(
args[i]->isTensor(),
InvalidArgument,
"Expected argument to delegate at index %u to be a Tensor, but got %" PRIu32,
i,
static_cast<uint32_t>(args[i]->tag));
if (i < input_ids_.size()) {
externals_[i].id = input_ids_[i];
} else {
externals_[i].id = output_ids_[i - input_ids_.size()];
}
uint32_t ext_id = externals_[i].id;

Tensor* tensor = &args[ext_id]->toTensor();
externals_[i].data = tensor->mutable_data_ptr<float>();

// Reshape runtime inputs
if (i < input_ids_.size()) {
size_t num_dims = tensor->dim();
size_t dims[XNN_MAX_TENSOR_DIMS];
for (size_t d = 0; d < num_dims; ++d) {
dims[d] = tensor->size(d);
}
status =
xnn_reshape_external_value(runtime_.get(), ext_id, num_dims, dims);
ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"Internal Error: Reshape Input Tensor Failed with code: %s",
xnn_status_to_string(status));
}
}
if (input->dim() != shape->num_dims) {
ET_LOG(Error, "Input dim mismatch between tensor and shape struct");
// Propagate Input Shape and Memory Plan for increased allocation
status = xnn_reshape_runtime(runtime_.get());

ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"Internal Error: Propagating input shapes failed with code: %s",
xnn_status_to_string(status));

return Error::Ok;
}
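
The wiring above relies on the invariant stated in the doc comment: the external IDs assigned at export time equal the caller's argument indices, so the sorted id lists give positional bindings. A small standalone sketch of just that index mapping, with hypothetical id values:

```cpp
// Standalone sketch of prepare_args' arg-wiring scheme. The id values are
// hypothetical; the invariant is that sorted external IDs equal arg indices.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint32_t> input_ids = {0, 1}; // sorted external IDs of inputs
  std::vector<uint32_t> output_ids = {2};   // sorted external IDs of outputs
  size_t total = input_ids.size() + output_ids.size();
  for (size_t i = 0; i < total; ++i) {
    // The first input_ids.size() externals bind inputs; the rest, outputs.
    uint32_t ext_id = (i < input_ids.size())
        ? input_ids[i]
        : output_ids[i - input_ids.size()];
    std::printf("externals_[%zu] binds to args[%u]\n", i, ext_id);
  }
  return 0;
}
```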

/**
* Runs the XNNPACK Runtime.
*
* We first set up the runtime by feeding the externals_ to runtime setup,
* after which we execute the runtime through xnn_invoke_runtime.
*/
__ET_NODISCARD Error XNNExecutor::forward(BackendExecutionContext& context) {
ET_CHECK_OR_RETURN_ERROR(
runtime_ != nullptr,
Internal,
"XNNPACK Delegate did not compile correctly");

xnn_status status = xnn_setup_runtime_v2(
runtime_.get(), externals_.size(), externals_.data());

ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"Internal Error: Setting up the runtime failed with code: %s",
xnn_status_to_string(status));

auto error = profiler_.start(context.event_tracer());
if (error != Error::Ok) {
ET_LOG(
Error,
"Failed to start profiling: %u.",
static_cast<unsigned int>(error));
}

status = xnn_invoke_runtime(runtime_.get());

error = profiler_.end();
if (error != Error::Ok) {
ET_LOG(
Error,
"Failed to end profiling: %u.",
static_cast<unsigned int>(error));
}

ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"XNN Runtime invoke failed with code: %s",
xnn_status_to_string(status));

return Error::Ok;
}

/**
* Prepares the outputs for ExecuTorch
*
* Resizes the output tensors based on the output shapes returned by
* the xnnpack runtime.
*
* Note: For arg_max pooling, we recast the output index tensor. Since
* XNNPACK gives the index tensor to us as int32, we need to convert it
* back to int64 for ExecuTorch.
*/
__ET_NODISCARD Error XNNExecutor::resize_outputs(EValue** args) const {
size_t output_idx_start = input_ids_.size();
for (size_t i = output_idx_start; i < externals_.size(); ++i) {
uint32_t ext_id = output_ids_[i - output_idx_start];
Tensor* out_tensor = &args[ext_id]->toTensor();

size_t num_dim;
size_t dims[XNN_MAX_TENSOR_DIMS];

// Fetch the updated output shapes from xnnpack runtime
xnn_status status =
xnn_get_external_value_shape(runtime_.get(), ext_id, &num_dim, dims);

ET_CHECK_OR_RETURN_ERROR(
status == xnn_status_success,
Internal,
"Internal Error: Failed to retrieve graph output shapes");

// Convert new output shape into SizesType
SizesType expected_output_size[kTensorDimensionLimit];
for (size_t d = 0; d < num_dim; ++d) {
expected_output_size[d] = static_cast<SizesType>(dims[d]);
}

exec_aten::ArrayRef<SizesType> output_size{
expected_output_size, static_cast<size_t>(num_dim)};

ET_LOG(Debug, "Resizing output tensor to a new shape");
Error err = resize_tensor(*out_tensor, output_size);
if (err != Error::Ok) {
ET_LOG(Error, "Failed to resize output tensor for XNNExecutor");
return err;
}

// Output datatype is int64. However, XNNPACK doesn't support
// int64, so XNNPACK wrote the data into this tensor as int32 and it
// must be widened to int64 in place. Copy backwards so that no int32
// element is overwritten before it has been read.
if (out_tensor->scalar_type() == ScalarType::Long) {
int64_t* data_64 = out_tensor->mutable_data_ptr<int64_t>();
const int32_t* data_32 = out_tensor->const_data_ptr<int32_t>();
for (size_t j = out_tensor->numel(); j-- > 0;) {
data_64[j] = data_32[j];
}
}
}

externals_.emplace_back(xnn_external_value{id, input->mutable_data_ptr()});
return Error::Ok;
}
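
Taken together, the three new methods split one delegate invocation into prepare/run/resize phases. A hedged sketch of how a backend might drive them, assuming the ExecuTorch delegate headers are in scope; the actual call site (the XNNPACK backend's execute()) is not part of this diff:

```cpp
// Hedged lifecycle sketch; run_delegate is a hypothetical wrapper, not the
// backend's real entry point.
Error run_delegate(
    XNNExecutor& executor,
    BackendExecutionContext& context,
    EValue** args) {
  // Bind args to externals and reshape the runtime for current input shapes.
  Error err = executor.prepare_args(args);
  if (err != Error::Ok) {
    return err;
  }
  // Set up the externals and invoke the XNNPACK runtime.
  err = executor.forward(context);
  if (err != Error::Ok) {
    return err;
  }
  // Resize the ExecuTorch output tensors to the shapes XNNPACK produced.
  return executor.resize_outputs(args);
}
```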

