From 16d6dd4fdcdcf4c6a51b77764238c2eb97dad5f8 Mon Sep 17 00:00:00 2001
From: Hamza Butt
Date: Tue, 27 Aug 2024 14:08:58 +0000
Subject: [PATCH] cpu: aarch64: Enable stateless ACL depthwise convolution

- All common conv code is now stateless, so we can delete all legacy
  code.
- Coincidentally fixes #2033 by making the weights constant
---
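Notes (placed below the "---" so git-am drops them; not part of the commit
message): the #2033 fix comes down to how execute_forward_conv_acl() now
builds its ITensorPack. A minimal sketch of the idea, using only ACL calls
that already appear in this diff; the helper name make_conv_pack is
hypothetical and for illustration only:

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/Tensor.h"

    // Registering the weights (ACL_SRC_1) and bias (ACL_SRC_2) via
    // add_const_tensor() tells ACL they will not change between run()
    // calls, so their packed form can be reused instead of being
    // regenerated on every execution.
    arm_compute::ITensorPack make_conv_pack(arm_compute::Tensor &src,
            const arm_compute::Tensor &wei, const arm_compute::Tensor &bia,
            arm_compute::Tensor &dst) {
        arm_compute::ITensorPack pack;
        pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &src);
        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &wei);
        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, &bia);
        pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst);
        return pack;
    }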
 src/cpu/aarch64/acl_convolution_utils.cpp     | 19 ----
 src/cpu/aarch64/acl_convolution_utils.hpp     | 90 ++++-------------
 src/cpu/aarch64/acl_depthwise_convolution.cpp | 75 ++++++++++++--
 src/cpu/aarch64/acl_depthwise_convolution.hpp | 99 +++----------------
 4 files changed, 93 insertions(+), 190 deletions(-)

diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index b3ca0f497e7..15437746069 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -283,25 +283,6 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
     return status::success;
 }
-
-status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr) {
-    if (weights_md.ndims != 5) return status::unimplemented;
-
-    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
-
-    ACL_CHECK_VALID(arm_compute::NEDepthwiseConvolutionLayer::validate(
-            &acp.src_tensor_info, &acp.wei_tensor_info,
-            acp.with_bias ? &acp.bia_tensor_info : nullptr,
-            &acp.dst_tensor_info, acp.padstride_info,
-            1, // depth multiplier default value
-            acp.act_info, acp.dilation_info));
-
-    return status::success;
-}
-
 } // namespace acl_convolution_utils
 
 } // namespace aarch64
diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
index 60293e0a9e1..37a3d6c3d98 100644
--- a/src/cpu/aarch64/acl_convolution_utils.hpp
+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
@@ -17,23 +17,20 @@
 #ifndef CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP
 #define CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP
 
+#include <map>
+#include "acl_post_ops.hpp"
+#include "acl_utils.hpp"
+#include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h"
 #include "cpu/cpu_convolution_pd.hpp"
-
-#include "cpu/aarch64/acl_post_ops.hpp"
-#include "cpu/aarch64/acl_utils.hpp"
-
+#include <memory>
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
-template <typename NEConv>
+template <typename ConvOp>
 struct acl_obj_t {
-    NEConv conv;
-    arm_compute::Tensor src_tensor;
-    arm_compute::Tensor wei_tensor;
-    arm_compute::Tensor bia_tensor;
-    arm_compute::Tensor dst_tensor;
+    ConvOp conv;
     arm_compute::experimental::MemoryRequirements aux_mem_req;
 };
 
@@ -51,6 +48,7 @@ struct acl_conv_conf_t {
     arm_compute::TensorInfo wei_tensor_info;
     arm_compute::TensorInfo bia_tensor_info;
     arm_compute::TensorInfo dst_tensor_info;
+
     arm_compute::PadStrideInfo padstride_info;
     arm_compute::Size2D dilation_info;
     // Additional information about the weights not included in wei_tensor_info
@@ -66,15 +64,6 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,
         const primitive_attr_t &attr);
 
-status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr);
-
-status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr);
 } // namespace acl_convolution_utils
 
 // Keys are anonymous with local linkage. So deduce the type automagically.
@@ -127,7 +116,6 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
     arm_compute::Tensor dst_tensor;
 
     auto const acp = pd->acp_;
-
     src_tensor.allocator()->init(acp.src_tensor_info);
     wei_tensor.allocator()->init(acp.wei_tensor_info);
     dst_tensor.allocator()->init(acp.dst_tensor_info);
@@ -151,11 +139,15 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
                 const_cast<bia_data_t *>(bia_base));
     }
 
-    arm_compute::ITensorPack pack
-            = {{arm_compute::TensorType::ACL_SRC_0, &src_tensor},
-                    {arm_compute::TensorType::ACL_SRC_1, &wei_tensor},
-                    {arm_compute::TensorType::ACL_SRC_2, &bia_tensor},
-                    {arm_compute::TensorType::ACL_DST, &dst_tensor}};
+    // Constness of the weight tensor matters for depthwise conv in ACL:
+    // otherwise ACL re-packs the weights more often than needed, since
+    // it assumes the weights may change between calls to the run
+    // function.
+    arm_compute::ITensorPack pack;
+    pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &src_tensor);
+    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &wei_tensor);
+    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, &bia_tensor);
+    pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst_tensor);
 
     // Get temp workspaces.
     const auto aux_mem = acl_conv_obj->aux_mem_req;
@@ -175,7 +167,6 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
         }
     }
 
-    acl_conv_obj->conv.prepare(pack);
     acl_conv_obj->conv.run(pack);
 
     void *dst = dst_tensor.buffer();
@@ -184,53 +175,6 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
     return status::success;
 }
 
-template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
-        typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
-        typename bia_data_t = src_data_t>
-status_t execute_forward_conv_acl(
-        const exec_ctx_t &ctx, conv_obj_t &acl_conv_obj, const conv_pd_t *pd) {
-    bool with_bias = pd->acp_.with_bias;
-    bool use_dst_acc_for_sum = pd->acp_.use_dst_acc_for_sum;
-
-    auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
-    auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
-
-    // import_memory() and free() methods do not allocate/free any additional
-    // memory, only acquire/release pointers.
-    acl_conv_obj.src_tensor.allocator()->import_memory(
-            const_cast<src_data_t *>(src_base));
-    acl_conv_obj.wei_tensor.allocator()->import_memory(
-            const_cast<wei_data_t *>(wei_base));
-
-    const auto scratchpad = ctx.get_scratchpad_grantor();
-
-    // If we have an unfused sum post op, put the result in a scratchpad tensor.
-    // Result will be summed to the dst during acl_post_ops.execute
-    auto dst_base = use_dst_acc_for_sum
-            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
-            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
-    acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);
-
-    if (with_bias) {
-        auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
-        acl_conv_obj.bia_tensor.allocator()->import_memory(
-                const_cast<bia_data_t *>(bia_base));
-    }
-
-    acl_conv_obj.conv.run();
-
-    acl_conv_obj.src_tensor.allocator()->free();
-    acl_conv_obj.wei_tensor.allocator()->free();
-    if (with_bias) { acl_conv_obj.bia_tensor.allocator()->free(); }
-
-    void *dst = acl_conv_obj.dst_tensor.buffer();
-    pd->post_ops.execute(ctx, dst);
-
-    acl_conv_obj.dst_tensor.allocator()->free();
-
-    return status::success;
-}
-
 } // namespace aarch64
 } // namespace cpu
 } // namespace impl
diff --git a/src/cpu/aarch64/acl_depthwise_convolution.cpp b/src/cpu/aarch64/acl_depthwise_convolution.cpp
index 70ae6bceeab..0bf397edc49 100644
--- a/src/cpu/aarch64/acl_depthwise_convolution.cpp
+++ b/src/cpu/aarch64/acl_depthwise_convolution.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Arm Ltd. and affiliates
+* Copyright 2023-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,21 +21,76 @@
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
+namespace {
+// Keys are anonymous. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+// Map: [slot, key]
+const std::map<int, conv_key_t> depthwise_conv_keys
+        = {{0, conv_key_t::key_gemm_tmp_buffer},
+                {1, conv_key_t::key_conv_permuted_weights}};
+} // namespace
+
 status_t acl_depthwise_convolution_fwd_t::execute_forward(
         const exec_ctx_t &ctx) const {
-    std::lock_guard<std::mutex> _lock {this->mtx};
+    return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, data_t>(
+            ctx, acl_obj_.get(), pd(), depthwise_conv_keys);
+}
+
+status_t acl_depthwise_convolution_fwd_t::pd_t::init(engine_t *engine) {
+    using namespace data_type;
+
+    const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
+            && attr()->has_default_values(
+                    primitive_attr_t::skip_mask_t::post_ops, f16);
+    const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
+            && attr()->has_default_values(
+                    primitive_attr_t::skip_mask_t::post_ops, f32);
+    bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct)
+            && utils::one_of(true, is_fp16_ok, is_fp32_ok)
+            && !has_zero_dim_memory();
+    if (!ok) return status::unimplemented;
+
+    if (weights_md_.ndims != 5) return status::unimplemented;
 
-    auto *acl_resource
-            = ctx.get_resource_mapper()
-                      ->get<acl_depthwise_convolution_resource_t>(this);
-    acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer> &acl_depthwise_obj
-            = acl_resource->get_acl_obj();
+    CHECK(acl_convolution_utils::acl_init_conf(
+            acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));
 
-    return execute_forward_conv_acl<
-            acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>, pd_t, data_t>(
-            ctx, acl_depthwise_obj, pd());
+    ACL_CHECK_VALID(Op::validate(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info,
+            1, // depth multiplier default value
+            acp_.act_info, acp_.dilation_info));
+
+    Op conv;
+    conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info,
+            1, // depth multiplier default value
+            acp_.act_info, acp_.dilation_info);
+
+    auto scratchpad = scratchpad_registry().registrar();
+    return init_scratchpad(conv, scratchpad, depthwise_conv_keys, engine,
+            post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
+            dst_md_);
 }
 
+status_t acl_depthwise_convolution_fwd_t::create_resource(
+        engine_t *engine, resource_mapper_t &mapper) const {
+    CHECK(pd()->post_ops.create_resource(engine, mapper));
+    return status::success;
+}
+
+status_t acl_depthwise_convolution_fwd_t::init(engine_t *engine) {
+    auto acp_ = pd()->acp_;
+    acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info,
+            1, // depth multiplier default value
+            acp_.act_info, acp_.dilation_info);
+    acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
+    return status::success;
+}
 } // namespace aarch64
 } // namespace cpu
 } // namespace impl
diff --git a/src/cpu/aarch64/acl_depthwise_convolution.hpp b/src/cpu/aarch64/acl_depthwise_convolution.hpp
index 4df41a4ef1a..c8503af15ff 100644
--- a/src/cpu/aarch64/acl_depthwise_convolution.hpp
+++ b/src/cpu/aarch64/acl_depthwise_convolution.hpp
@@ -17,7 +17,8 @@
 #ifndef CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
 #define CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
 
-#include "cpu/aarch64/acl_convolution_utils.hpp"
+#include "acl_convolution_utils.hpp"
+#include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h"
 #include "cpu/cpu_convolution_pd.hpp"
 
 namespace dnnl {
@@ -25,47 +26,10 @@
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
-struct acl_depthwise_convolution_resource_t : public resource_t {
-    acl_depthwise_convolution_resource_t()
-        : acl_obj_(utils::make_unique<
-                acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>>()) {}
-
-    status_t configure(const acl_conv_conf_t &acp) {
-        if (!acl_obj_) return status::out_of_memory;
-
-        acl_obj_->src_tensor.allocator()->init(acp.src_tensor_info);
-        acl_obj_->wei_tensor.allocator()->init(acp.wei_tensor_info);
-        acl_obj_->dst_tensor.allocator()->init(acp.dst_tensor_info);
-        acl_obj_->bia_tensor.allocator()->init(acp.bia_tensor_info);
-
-        // clang-format off
-        acl_obj_->conv.configure(
-            &acl_obj_->src_tensor,
-            &acl_obj_->wei_tensor,
-            acp.with_bias ? &acl_obj_->bia_tensor : nullptr,
-            &acl_obj_->dst_tensor,
-            acp.padstride_info,
-            1, // depth multiplier default value
-            acp.act_info,
-            acp.dilation_info);
-
-        // clang-format on
-        return status::success;
-    }
-
-    acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer> &get_acl_obj() const {
-        return *acl_obj_;
-    }
-
-    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_depthwise_convolution_resource_t);
-
-private:
-    std::unique_ptr<acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>>
-            acl_obj_;
-};
-
 struct acl_depthwise_convolution_fwd_t : public primitive_t {
+    using Op = arm_compute::experimental::op::CpuDepthwiseConv2d;
+
     struct pd_t : public cpu_convolution_fwd_pd_t {
         pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr,
                 const typename pd_t::base_class *hint_fwd_pd)
@@ -74,59 +38,17 @@ struct acl_depthwise_convolution_fwd_t : public primitive_t {
         DECLARE_COMMON_PD_T("depthwise_convolution:acl",
                 acl_depthwise_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD);
 
-        status_t init(engine_t *engine) {
-            using namespace data_type;
-
-            const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
-                    && attr()->has_default_values(
-                            primitive_attr_t::skip_mask_t::post_ops, f16);
-            const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
-                    && attr()->has_default_values(
-                            primitive_attr_t::skip_mask_t::post_ops, f32);
-            bool ok = is_fwd()
-                    && set_default_alg_kind(alg_kind::convolution_direct)
-                    && utils::one_of(true, is_fp16_ok, is_fp32_ok)
-                    && !has_zero_dim_memory();
-            if (!ok) return status::unimplemented;
-
-            CHECK(acl_convolution_utils::init_conf_depthwise(acp_, src_md_,
-                    weights_md_, dst_md_, bias_md_, *desc(), *attr()));
-
-            CHECK(post_ops.init(
-                    engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc_for_sum = post_ops.has_sum();
-
-            if (acp_.use_dst_acc_for_sum) {
-                const memory_desc_wrapper dst_d(&dst_md_);
-                auto scratchpad = scratchpad_registry().registrar();
-                scratchpad.book(memory_tracking::names::key_generic_acc,
-                        dst_d.nelems(), dst_d.data_type_size());
-            }
-
-            return status::success;
-        }
+        status_t init(engine_t *engine);
 
         acl_conv_conf_t acp_;
-
         acl_post_ops_t post_ops;
     };
 
-    acl_depthwise_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+    acl_depthwise_convolution_fwd_t(const pd_t *apd)
+        : primitive_t(apd), acl_obj_(std::make_unique<acl_obj_t<Op>>()) {}
 
     status_t create_resource(
-            engine_t *engine, resource_mapper_t &mapper) const override {
-        if (mapper.has_resource(this)) return status::success;
-
-        auto r = utils::make_unique<acl_depthwise_convolution_resource_t>();
-        if (!r) return status::out_of_memory;
-
-        CHECK(r->configure(pd()->acp_));
-        mapper.add(this, std::move(r));
-
-        CHECK(pd()->post_ops.create_resource(engine, mapper));
-
-        return status::success;
-    }
+            engine_t *engine, resource_mapper_t &mapper) const override;
 
     typedef typename prec_traits<data_type::f32>::type data_t;
 
@@ -134,11 +56,12 @@ struct acl_depthwise_convolution_fwd_t : public primitive_t {
         return execute_forward(ctx);
     }
 
+    status_t init(engine_t *engine) override;
+
 private:
-    mutable std::mutex mtx;
     status_t execute_forward(const exec_ctx_t &ctx) const;
-
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    std::unique_ptr<acl_obj_t<Op>> acl_obj_;
 };
 
 } // namespace aarch64
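Reviewer note (illustrative sketch, not part of the patch): the stateless
lifecycle this change completes, condensed from the hunks above. Here `acp`
stands in for the acl_conv_conf_t filled by acl_init_conf() and is assumed
to be in scope:

    using Op = arm_compute::experimental::op::CpuDepthwiseConv2d;

    Op conv;
    // configure() consumes only TensorInfo descriptors; no tensor storage
    // is bound, which is what makes the operator reusable across runs.
    conv.configure(&acp.src_tensor_info, &acp.wei_tensor_info,
            acp.with_bias ? &acp.bia_tensor_info : nullptr,
            &acp.dst_tensor_info, acp.padstride_info,
            1, // depth multiplier default value
            acp.act_info, acp.dilation_info);

    // Auxiliary buffers are reported up front and booked in the oneDNN
    // scratchpad, so the primitive holds no mutable state between runs.
    arm_compute::experimental::MemoryRequirements aux_mem = conv.workspace();

    // Per execute: wrap the caller's buffers in an ITensorPack (weights and
    // bias registered as const, see execute_forward_conv_acl) and call
    //     conv.run(pack);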