cpu: aarch64: Call stateless ACL API from winograd convolution
- Requires ACL v24.08.
- Update doc as necessary.
theComputeKid authored and vpirogov committed Aug 22, 2024
1 parent 2480cb3 commit 03db3e4
Showing 6 changed files with 135 additions and 161 deletions.
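For context: the previous implementation wrapped the stateful `arm_compute::NEWinogradConvolutionLayer`, which binds its tensors and owns internal scratch memory, so each primitive carried a per-instance ACL resource guarded by a mutex. The stateless `arm_compute::experimental::op::CpuWinogradConv2d` operator available from ACL v24.08 is configured from tensor metadata alone and is handed every tensor, including auxiliary buffers, at run time. Below is a minimal sketch of that configure-and-query lifecycle, assuming ACL's experimental operator interface; it is not the literal oneDNN code.

```cpp
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h"

namespace acl = arm_compute;
using Op = acl::experimental::op::CpuWinogradConv2d;

// Configure from metadata only; the operator binds no tensors and
// allocates no memory of its own.
acl::experimental::MemoryRequirements setup_wino(Op &conv,
        const acl::ITensorInfo &src, const acl::ITensorInfo &wei,
        const acl::ITensorInfo *bia, const acl::ITensorInfo &dst,
        const acl::PadStrideInfo &ps, const acl::ActivationLayerInfo &act) {
    conv.configure(&src, &wei, bia, &dst, ps, act,
            /*enable_fast_math=*/true);
    // Each entry describes one auxiliary buffer (slot, size, alignment)
    // that the caller must provide at execution time.
    return conv.workspace();
}
```

Because the operator captures no per-call state, a single instance can serve concurrent executions; that is why the diff below drops the mutex and the resource-mapper plumbing from the winograd primitive.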
README.md: 2 changes (1 addition & 1 deletion)

@@ -173,7 +173,7 @@ On a CPU based on Arm AArch64 architecture, oneDNN CPU engine can be built with
 machine learning applications and provides AArch64 optimized implementations
 of core functions. This functionality currently requires that ACL is downloaded
 and built separately. See [Build from Source] section of the Developer Guide for
-details. oneDNN only supports Compute Library versions 24.07 or later.
+details. oneDNN only supports Compute Library versions 24.08 or later.

 [Arm Compute Library (ACL)]: https://github.com/arm-software/ComputeLibrary

cmake/ACL.cmake: 2 changes (1 addition & 1 deletion)

@@ -31,7 +31,7 @@ endif()

 find_package(ACL REQUIRED)

-set(ACL_MINIMUM_VERSION "24.07")
+set(ACL_MINIMUM_VERSION "24.08")

 if(ACL_FOUND)
     file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)
src/common/memory_tracking.hpp: 4 changes (4 additions & 0 deletions)

@@ -199,6 +199,8 @@ enum {
     key_conv_gemm_zp_src_comp,
     key_conv_int_dat_in_acc_dt,
     key_conv_padded_bias,
+    key_conv_permuted_inputs,
+    key_conv_permuted_outputs,
     key_conv_permuted_weights,
     key_conv_rtus_space,
     key_conv_store_wsp,
@@ -300,9 +302,11 @@
     key_softmax_interim_store,
     key_sum_reduction,
     key_sum_srcs_cvt,
+    key_wino_transformed_weights,
     key_wino_U,
     key_wino_V,
     key_wino_M,
+    key_wino_workspace,
     // These two keys should always be the last ones,
     // even though they are not in alphabetical order
     key_nested,
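These four new keys give the AArch64 winograd primitive named scratchpad slots for the buffers that the stateless ACL operator no longer allocates itself: permuted inputs and outputs, the transformed (Winograd-domain) weights, and a general workspace. A sketch of the usual book-then-grant pattern follows, with byte counts as placeholders; the real sizes come from the operator's `workspace()` query shown in the changes below.

```cpp
#include "common/memory_tracking.hpp" // oneDNN internal header

namespace mem = dnnl::impl::memory_tracking;

// At primitive-descriptor creation: reserve space under named keys.
// book(key, element_count, element_size) -- raw bytes here.
void book_wino_buffers(mem::registrar_t &scratchpad, size_t workspace_bytes,
        size_t transformed_wei_bytes) {
    scratchpad.book(mem::names::key_wino_workspace, workspace_bytes, 1);
    scratchpad.book(mem::names::key_wino_transformed_weights,
            transformed_wei_bytes, 1);
}

// At execution: the grantor returns the same regions by key.
char *wino_workspace(const mem::grantor_t &grantor) {
    return grantor.get<char>(mem::names::key_wino_workspace);
}
```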
src/cpu/aarch64/acl_convolution_utils.cpp: 51 changes (1 addition & 50 deletions)

@@ -14,7 +14,7 @@
 * limitations under the License.
 *******************************************************************************/

-#include "cpu/aarch64/acl_convolution_utils.hpp"
+#include "acl_convolution_utils.hpp"
 #include "common/convolution_pd.hpp"
 #include "common/utils.hpp"
 #include "oneapi/dnnl/dnnl.h"
@@ -284,55 +284,6 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
     return status::success;
 }

-status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr) {
-
-    // Under these conditions, fallback to faster GEMM-based convolution
-    // unless the user explicitly specifies Winograd algorithm
-    // clang-format off
-    if (one_of(true, src_md.dims[2] > 112, // ih
-            src_md.dims[3] > 112, // iw
-            src_md.dims[1] < 64, // ic
-            dst_md.dims[1] < 64, // oc
-            dnnl_get_max_threads() > 28)
-        && cd.alg_kind == alg_kind::convolution_auto) {
-        return status::unimplemented;
-    }
-    // clang-format on
-
-    // General Compute Library checks, memory tags are also set there
-    acp.alg_winograd = true;
-    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
-
-    const bool shape_ok
-            // only unit strides allowed
-            = (acp.padstride_info.stride() == std::pair<uint, uint> {1, 1})
-            // Note: Compute Library supports arbitrary padding for wino kernels
-            // but we only allow small padding to be consistent with oneDNN
-            && (acp.padstride_info.pad().first <= 1) // padding left/right
-            && (acp.padstride_info.pad().second <= 1) // padding top/bottom
-            // only non-dilated convolutions allowed
-            && (acp.dilation_info == arm_compute::Size2D(1, 1));
-
-    ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");
-
-    // clang-format off
-    // Validate convolution manually to check for return status
-    ACL_CHECK_VALID(arm_compute::NEWinogradConvolutionLayer::validate(
-            &acp.src_tensor_info,
-            &acp.wei_tensor_info,
-            acp.with_bias ? &acp.bia_tensor_info : nullptr,
-            &acp.dst_tensor_info,
-            acp.padstride_info,
-            acp.act_info,
-            true)); // enable_fast_math flag in ACL Winograd
-    // clang-format on
-
-    return status::success;
-}
-
 status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &weights_md, memory_desc_t &dst_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,
src/cpu/aarch64/acl_winograd_convolution.cpp: 130 changes (115 additions & 15 deletions)

@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,29 +14,129 @@
 * limitations under the License.
 *******************************************************************************/

-#include "cpu/aarch64/acl_winograd_convolution.hpp"
+#include "acl_winograd_convolution.hpp"
+#include "common/memory_tracking.hpp"
+#include "common/utils.hpp"

 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace aarch64 {

+namespace {
+// Keys are anonymous. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+// Map: [slot, key]
+const std::map<int, conv_key_t> wino_conv_keys
+        = {{0, conv_key_t::key_gemm_asm_tmp_buffer},
+                {1, conv_key_t::key_gemm_pretranspose_b},
+                {2, conv_key_t::key_gemm_pretranspose},
+                {3, conv_key_t::key_gemm_interleaved_lhs},
+                {4, conv_key_t::key_gemm_pretransposed_rhs},
+                {5, conv_key_t::key_gemm_transposed_1xwrhs},
+                {6, conv_key_t::key_gemm_tmp_buffer},
+                {7, conv_key_t::key_conv_permuted_outputs},
+                {8, conv_key_t::key_conv_permuted_inputs},
+                {9, conv_key_t::key_wino_workspace},
+                {10, conv_key_t::key_wino_transformed_weights},
+                {11, conv_key_t::key_conv_permuted_weights}};
+} // namespace
+
+status_t acl_wino_convolution_fwd_t::pd_t::init(engine_t *engine) {
+    using namespace data_type;
+    const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
+            && attr()->has_default_values(
+                    primitive_attr_t::skip_mask_t::post_ops, f16);
+    const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
+            && attr()->has_default_values(
+                    primitive_attr_t::skip_mask_t::post_ops, f32);
+    bool ok = is_fwd()
+            && utils::one_of(desc()->alg_kind, alg_kind::convolution_auto,
+                    alg_kind::convolution_winograd)
+            && utils::one_of(true, is_fp16_ok, is_fp32_ok)
+            && !has_zero_dim_memory();
+
+    ok = ok && DNNL_CPU_THREADING_RUNTIME != DNNL_RUNTIME_THREADPOOL;
+    if (!ok) return status::unimplemented;
+
+    CHECK(init_conf());
+
+    set_default_alg_kind(alg_kind::convolution_winograd);
+
+    Op conv;
+    conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info, acp_.act_info,
+            true); // to support 5x5, 7x7 filter shapes in addition to 3x3
+
+    auto scratchpad = scratchpad_registry().registrar();
+    const auto aux_mem = conv.workspace();
+    return init_scratchpad(conv, scratchpad, wino_conv_keys, engine, post_ops,
+            attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_);
+}
+
+status_t acl_wino_convolution_fwd_t::create_resource(
+        engine_t *engine, resource_mapper_t &mapper) const {
+    CHECK(pd()->post_ops.create_resource(engine, mapper));
+    return status::success;
+}
+
+status_t acl_wino_convolution_fwd_t::init(engine_t *engine) {
+    auto acp = pd()->acp_;
+    acl_obj_->conv.configure(&acp.src_tensor_info, &acp.wei_tensor_info,
+            acp.with_bias ? &acp.bia_tensor_info : nullptr,
+            &acp.dst_tensor_info, acp.padstride_info, acp.act_info,
+            true); // to support 5x5, 7x7 filter shapes in addition to 3x3
+
+    acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
+    return status::success;
+}
+
+status_t acl_wino_convolution_fwd_t::pd_t::init_conf() {
+
+    // Under these conditions, fallback to faster GEMM-based convolution
+    // unless the user explicitly specifies Winograd algorithm
+    if (utils::one_of(true, src_md_.dims[2] > 112, // ih
+                src_md_.dims[3] > 112, // iw
+                src_md_.dims[1] < 64, // ic
+                dst_md_.dims[1] < 64, // oc
+                dnnl_get_max_threads() > 28)
+            && desc()->alg_kind == alg_kind::convolution_auto) {
+        return status::unimplemented;
+    }
+
+    // General Compute Library checks, memory tags are also set there
+    acp_.alg_winograd = true;
+    CHECK(acl_convolution_utils::acl_init_conf(
+            acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));
+
+    const bool shape_ok
+            // only unit strides allowed
+            = (acp_.padstride_info.stride() == std::pair<uint, uint> {1, 1})
+            // Note: Compute Library supports arbitrary padding for wino kernels
+            // but we only allow small padding to be consistent with oneDNN
+            && (acp_.padstride_info.pad().first <= 1) // padding left/right
+            && (acp_.padstride_info.pad().second <= 1) // padding top/bottom
+            // only non-dilated convolutions allowed
+            && (acp_.dilation_info == arm_compute::Size2D(1, 1));
+
+    ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");
+
+    // Validate convolution manually to check for return status
+    ACL_CHECK_VALID(Op::validate(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info, acp_.act_info,
+            true)); // enable_fast_math flag in ACL Winograd
+
+    return status::success;
+}
+
 status_t acl_wino_convolution_fwd_t::execute_forward(
         const exec_ctx_t &ctx) const {
-    // Lock here is needed because resource_mapper does not support
-    // concurrent multithreaded access.
-    std::lock_guard<std::mutex> _lock {this->mtx};
-    // Retrieve primitive resource and configured Compute Library objects
-    auto *acl_resource
-            = ctx.get_resource_mapper()->get<acl_wino_resource_t>(this);
-    acl_obj_t<arm_compute::NEWinogradConvolutionLayer> &acl_wino_obj
-            = acl_resource->get_acl_obj();
-
-    return execute_forward_conv_acl<
-            acl_obj_t<arm_compute::NEWinogradConvolutionLayer>, pd_t, data_t>(
-            ctx, acl_wino_obj, pd());
+    return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, data_t>(
+            ctx, acl_obj_.get(), pd(), wino_conv_keys);
 }

 } // namespace aarch64
 } // namespace cpu
 } // namespace impl
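The `wino_conv_keys` table at the top of this file pairs each auxiliary-memory slot reported by the operator with a oneDNN scratchpad key, and the shared `init_scratchpad()` helper in `acl_convolution_utils` books one scratchpad entry per slot. A simplified sketch of that booking loop, assuming slots are matched by index into ACL's `MemoryRequirements` (the real helper also wires up post-ops and the sum accumulator, as its argument list above shows):

```cpp
// Simplified sketch; Op and conv_key_t as defined in the diff above.
status_t book_acl_aux_memory(const Op &conv,
        memory_tracking::registrar_t &scratchpad,
        const std::map<int, conv_key_t> &conv_keys) {
    const auto aux_mem = conv.workspace(); // ACL MemoryRequirements
    for (const auto &kv : conv_keys) {
        if (static_cast<size_t>(kv.first) >= aux_mem.size()) break;
        const auto &mem = aux_mem[kv.first];
        if (mem.size == 0) continue; // slot unused for this problem shape
        scratchpad.book(kv.second, mem.size, 1); // byte-sized booking
    }
    return status::success;
}
```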
src/cpu/aarch64/acl_winograd_convolution.hpp: 107 changes (13 additions & 94 deletions)

@@ -19,53 +19,17 @@

#include "cpu/cpu_convolution_pd.hpp"

#include "cpu/aarch64/acl_convolution_utils.hpp"
#include "acl_convolution_utils.hpp"
#include "arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h"

namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {

struct acl_wino_resource_t : public resource_t {
acl_wino_resource_t()
: acl_wino_obj_(utils::make_unique<
acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>()) {}

status_t configure(const acl_conv_conf_t &acp) {
if (!acl_wino_obj_) return status::out_of_memory;

// Init Compute Library tensors based on info from descriptor
acl_wino_obj_->src_tensor.allocator()->init(acp.src_tensor_info);
acl_wino_obj_->wei_tensor.allocator()->init(acp.wei_tensor_info);
acl_wino_obj_->dst_tensor.allocator()->init(acp.dst_tensor_info);
acl_wino_obj_->bia_tensor.allocator()->init(acp.bia_tensor_info);

// clang-format off
acl_wino_obj_->conv.configure(
&acl_wino_obj_->src_tensor,
&acl_wino_obj_->wei_tensor,
acp.with_bias ? &acl_wino_obj_->bia_tensor : nullptr,
&acl_wino_obj_->dst_tensor,
acp.padstride_info,
acp.act_info,
true); // to support 5x5, 7x7 filter shapes in addition to 3x3
// clang-format on

return status::success;
}

acl_obj_t<arm_compute::NEWinogradConvolutionLayer> &get_acl_obj() const {
return *acl_wino_obj_;
}

DNNL_DISALLOW_COPY_AND_ASSIGN(acl_wino_resource_t);

private:
std::unique_ptr<acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>
acl_wino_obj_;
}; // acl_wino_resource_t

struct acl_wino_convolution_fwd_t : public primitive_t {
using Op = arm_compute::experimental::op::CpuWinogradConv2d;

struct pd_t : public cpu_convolution_fwd_pd_t {
pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr,
const typename pd_t::base_class *hint_fwd_pd)
@@ -76,66 +40,22 @@ struct acl_wino_convolution_fwd_t : public primitive_t {
         DECLARE_COMMON_PD_T(
                 "wino:acl", acl_wino_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD);

-        status_t init(engine_t *engine) {
-            using namespace data_type;
-            const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
-                    && attr()->has_default_values(
-                            primitive_attr_t::skip_mask_t::post_ops, f16);
-            const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
-                    && attr()->has_default_values(
-                            primitive_attr_t::skip_mask_t::post_ops, f32);
-            bool ok = is_fwd()
-                    && utils::one_of(desc()->alg_kind,
-                            alg_kind::convolution_auto,
-                            alg_kind::convolution_winograd)
-                    && utils::one_of(true, is_fp16_ok, is_fp32_ok)
-                    && !has_zero_dim_memory();
-
-            ok = ok && DNNL_CPU_THREADING_RUNTIME != DNNL_RUNTIME_THREADPOOL;
-            if (!ok) return status::unimplemented;
-
-            CHECK(acl_convolution_utils::init_conf_wino(acp_, src_md_,
-                    weights_md_, dst_md_, bias_md_, *desc(), *attr()));
-
-            set_default_alg_kind(alg_kind::convolution_winograd);
-
-            CHECK(post_ops.init(
-                    engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc_for_sum = post_ops.has_sum();
-
-            if (acp_.use_dst_acc_for_sum) {
-                const memory_desc_wrapper dst_d(&dst_md_);
-                auto scratchpad = scratchpad_registry().registrar();
-                scratchpad.book(memory_tracking::names::key_generic_acc,
-                        dst_d.nelems(), dst_d.data_type_size());
-            }
-
-            return status::success;
-        }
+        status_t init(engine_t *engine);

         acl_conv_conf_t acp_;
         acl_post_ops_t post_ops;
+
+    private:
+        status_t init_conf();
     };

-    acl_wino_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+    acl_wino_convolution_fwd_t(const pd_t *apd)
+        : primitive_t(apd), acl_obj_(std::make_unique<acl_obj_t<Op>>()) {}

     status_t create_resource(
-            engine_t *engine, resource_mapper_t &mapper) const override {
-        if (mapper.has_resource(this)) return status::success;
-
-        auto r = utils::make_unique<acl_wino_resource_t>();
-        if (!r) return status::out_of_memory;
-
-        // Configure the resource based on information from primitive descriptor
-        CHECK(r->configure(pd()->acp_));
-        mapper.add(this, std::move(r));
-
-        CHECK(pd()->post_ops.create_resource(engine, mapper));
-
-        return status::success;
-    }
+            engine_t *engine, resource_mapper_t &mapper) const override;

-    ~acl_wino_convolution_fwd_t() {}
+    status_t init(engine_t *engine) override;

     typedef typename prec_traits<data_type::f32>::type data_t;

Expand All @@ -144,10 +64,9 @@ struct acl_wino_convolution_fwd_t : public primitive_t {
     }

 private:
-    // To guard the const execute_forward(), the mutex must be 'mutable'
-    mutable std::mutex mtx;
     status_t execute_forward(const exec_ctx_t &ctx) const;
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    std::unique_ptr<acl_obj_t<Op>> acl_obj_;
 }; // acl_wino_convolution_fwd_t

 } // namespace aarch64
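With the per-primitive state gone, execution reduces to rebuilding an `arm_compute::ITensorPack` on every call: user tensors are added under their ACL slot ids, and each auxiliary slot is backed by the scratchpad region booked under the matching key. The sketch below illustrates the idea behind `execute_forward_conv_acl`; the slot-id convention and the helper's shape are assumptions, not the literal shared implementation.

```cpp
#include <map>
#include <memory>
#include <vector>

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/Tensor.h"

// Back each requested aux slot with scratchpad memory, then run.
// `Op` and `conv_key_t` are as in the diff above; `grantor` is the
// scratchpad grantor taken from the execution context.
void run_stateless(Op &conv, arm_compute::ITensorPack &pack,
        const arm_compute::experimental::MemoryRequirements &aux_mem,
        const std::map<int, conv_key_t> &conv_keys,
        const dnnl::impl::memory_tracking::grantor_t &grantor) {
    std::vector<std::unique_ptr<arm_compute::Tensor>> aux; // keep alive
    for (const auto &kv : conv_keys) {
        const auto &mem = aux_mem[static_cast<size_t>(kv.first)];
        if (mem.size == 0) continue;
        auto t = std::make_unique<arm_compute::Tensor>();
        t->allocator()->init(arm_compute::TensorInfo(
                arm_compute::TensorShape(mem.size), 1,
                arm_compute::DataType::U8));
        t->allocator()->import_memory(grantor.get<char>(kv.second));
        pack.add_tensor(mem.slot, t.get());
        aux.push_back(std::move(t));
    }
    conv.run(pack); // no mutex: nothing in `conv` mutates per call
}
```

Because the pack and the imported tensors are rebuilt locally on each call, the primitive needs no lock, in contrast to the resource-mapper path removed above.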
