cpu: aarch64: Enable stateless ACL depthwise convolution
- All common conv code is now stateless, so we can delete all legacy code.
- Coincidentally fixes oneapi-src#2033 by making the weights constant.
theComputeKid authored and vpirogov committed Aug 27, 2024
1 parent c4fc012 commit 16d6dd4
Showing 4 changed files with 93 additions and 190 deletions.
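For context, the stateless pattern this commit moves the depthwise path onto works roughly as follows: the ACL operator is configured once against `TensorInfo` descriptors, its auxiliary-memory needs are queried up front via `workspace()`, and each execution imports the caller's buffers into short-lived `Tensor` objects passed through an `ITensorPack`. The sketch below is a minimal, non-authoritative illustration of that flow using only the ACL calls that appear in this diff; the concrete operator type is left as a template parameter (the real code takes it from `arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h`), workspace handling is omitted, and header paths may vary between ACL versions.

```cpp
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"

// Sketch only: ConvOp stands in for a stateless ACL operator that exposes
// run(ITensorPack &), as the experimental CpuDepthwiseConv2d does here.
template <typename ConvOp>
void run_stateless_conv(ConvOp &conv, const arm_compute::TensorInfo &src_info,
        const arm_compute::TensorInfo &wei_info,
        const arm_compute::TensorInfo &dst_info, void *src_buf,
        const void *wei_buf, void *dst_buf) {
    // Per-execution tensors: import_memory() only borrows the pointers,
    // nothing is allocated or copied here.
    arm_compute::Tensor src, wei, dst;
    src.allocator()->init(src_info);
    wei.allocator()->init(wei_info);
    dst.allocator()->init(dst_info);
    src.allocator()->import_memory(src_buf);
    wei.allocator()->import_memory(const_cast<void *>(wei_buf));
    dst.allocator()->import_memory(dst_buf);

    // Weights go in as a const tensor so ACL can keep its packed copy
    // across runs instead of re-packing every time.
    arm_compute::ITensorPack pack;
    pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &src);
    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &wei);
    pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst);

    conv.run(pack);

    // free() releases the borrowed pointers, not the caller's memory.
    src.allocator()->free();
    wei.allocator()->free();
    dst.allocator()->free();
}
```

In the actual implementation the configure step happens once in the primitive's init(), and the auxiliary-memory slots reported by workspace() are mapped onto oneDNN scratchpad keys, as the .hpp and .cpp changes below show.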
19 changes: 0 additions & 19 deletions src/cpu/aarch64/acl_convolution_utils.cpp
@@ -283,25 +283,6 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,

return status::success;
}

status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr) {
if (weights_md.ndims != 5) return status::unimplemented;

CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

ACL_CHECK_VALID(arm_compute::NEDepthwiseConvolutionLayer::validate(
&acp.src_tensor_info, &acp.wei_tensor_info,
acp.with_bias ? &acp.bia_tensor_info : nullptr,
&acp.dst_tensor_info, acp.padstride_info,
1, // depth multiplier default value
acp.act_info, acp.dilation_info));

return status::success;
}

} // namespace acl_convolution_utils

} // namespace aarch64
90 changes: 17 additions & 73 deletions src/cpu/aarch64/acl_convolution_utils.hpp
@@ -17,23 +17,20 @@
#ifndef CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP
#define CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP

#include <map>
#include "acl_post_ops.hpp"
#include "acl_utils.hpp"
#include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h"
#include "cpu/cpu_convolution_pd.hpp"

#include "cpu/aarch64/acl_post_ops.hpp"
#include "cpu/aarch64/acl_utils.hpp"

#include <type_traits>
namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {

template <typename NEConv>
template <typename ConvOp>
struct acl_obj_t {
NEConv conv;
arm_compute::Tensor src_tensor;
arm_compute::Tensor wei_tensor;
arm_compute::Tensor bia_tensor;
arm_compute::Tensor dst_tensor;
ConvOp conv;
arm_compute::experimental::MemoryRequirements aux_mem_req;
};

@@ -51,6 +48,7 @@ struct acl_conv_conf_t {
arm_compute::TensorInfo wei_tensor_info;
arm_compute::TensorInfo bia_tensor_info;
arm_compute::TensorInfo dst_tensor_info;

arm_compute::PadStrideInfo padstride_info;
arm_compute::Size2D dilation_info;
// Additional information about the weights not included in wei_tensor_info
@@ -66,15 +64,6 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);

status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);

status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);
} // namespace acl_convolution_utils

// Keys are anonymous with local linkage. So deduce the type automagically.
@@ -127,7 +116,6 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
arm_compute::Tensor dst_tensor;

auto const acp = pd->acp_;

src_tensor.allocator()->init(acp.src_tensor_info);
wei_tensor.allocator()->init(acp.wei_tensor_info);
dst_tensor.allocator()->init(acp.dst_tensor_info);
@@ -151,11 +139,15 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
const_cast<bia_data_t *>(bia_base));
}

arm_compute::ITensorPack pack
= {{arm_compute::TensorType::ACL_SRC_0, &src_tensor},
{arm_compute::TensorType::ACL_SRC_1, &wei_tensor},
{arm_compute::TensorType::ACL_SRC_2, &bia_tensor},
{arm_compute::TensorType::ACL_DST, &dst_tensor}};
// Constness of the weight tensor matters for depthwise conv in ACL.
// If the weights are passed as non-const, ACL assumes they may change
// between calls to run() and re-packs them more often than needed.
arm_compute::ITensorPack pack;
pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &src_tensor);
pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &wei_tensor);
pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, &bia_tensor);
pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst_tensor);

// Get temp workspaces.
const auto aux_mem = acl_conv_obj->aux_mem_req;
@@ -175,7 +167,6 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
}
}

acl_conv_obj->conv.prepare(pack);
acl_conv_obj->conv.run(pack);

void *dst = dst_tensor.buffer();
Expand All @@ -184,53 +175,6 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
return status::success;
}

template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
typename bia_data_t = src_data_t>
status_t execute_forward_conv_acl(
const exec_ctx_t &ctx, conv_obj_t &acl_conv_obj, const conv_pd_t *pd) {
bool with_bias = pd->acp_.with_bias;
bool use_dst_acc_for_sum = pd->acp_.use_dst_acc_for_sum;

auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);

// import_memory() and free() methods do not allocate/free any additional
// memory, only acquire/release pointers.
acl_conv_obj.src_tensor.allocator()->import_memory(
const_cast<src_data_t *>(src_base));
acl_conv_obj.wei_tensor.allocator()->import_memory(
const_cast<wei_data_t *>(wei_base));

const auto scratchpad = ctx.get_scratchpad_grantor();

// If we have an unfused sum post op, put the result in a scratchpad tensor.
// Result will be summed to the dst during acl_post_ops.execute
auto dst_base = use_dst_acc_for_sum
? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
: CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);

if (with_bias) {
auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
acl_conv_obj.bia_tensor.allocator()->import_memory(
const_cast<bia_data_t *>(bia_base));
}

acl_conv_obj.conv.run();

acl_conv_obj.src_tensor.allocator()->free();
acl_conv_obj.wei_tensor.allocator()->free();
if (with_bias) { acl_conv_obj.bia_tensor.allocator()->free(); }

void *dst = acl_conv_obj.dst_tensor.buffer();
pd->post_ops.execute(ctx, dst);

acl_conv_obj.dst_tensor.allocator()->free();

return status::success;
}

} // namespace aarch64
} // namespace cpu
} // namespace impl
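The "Keys are anonymous with local linkage. So deduce the type automagically." comment above, and the `conv_key_t` map that the depthwise .cpp below adds, rely on a small C++ idiom: the scratchpad key names are enumerators of an unnamed enum, so the only way to spell their type is `decltype` on one of the enumerators. The following is a self-contained illustration; the enum here is a stand-in, not oneDNN's real key list.

```cpp
#include <iostream>
#include <map>

namespace memory_tracking {
namespace names {
// Stand-in for oneDNN's anonymous scratchpad-key enum; the real enumerators
// are generated elsewhere and their enum type has no name to refer to.
enum { key_gemm_tmp_buffer, key_conv_permuted_weights };
} // namespace names
} // namespace memory_tracking

// decltype on an enumerator recovers the unnamed enum type; since C++11 the
// alias can also be used to qualify the enumerators with '::'.
using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);

// Map: ACL auxiliary-memory slot -> oneDNN scratchpad key, mirroring the
// depthwise_conv_keys map added in this commit.
const std::map<int, conv_key_t> depthwise_conv_keys
        = {{0, conv_key_t::key_gemm_tmp_buffer},
                {1, conv_key_t::key_conv_permuted_weights}};

int main() {
    for (const auto &kv : depthwise_conv_keys)
        std::cout << "slot " << kv.first << " -> key " << int(kv.second)
                  << "\n";
    return 0;
}
```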
75 changes: 65 additions & 10 deletions src/cpu/aarch64/acl_depthwise_convolution.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2023 Arm Ltd. and affiliates
* Copyright 2023-2024 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,21 +21,76 @@ namespace impl {
namespace cpu {
namespace aarch64 {

namespace {
// Keys are anonymous. So deduce the type automagically.
using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);

// Map: [slot , key]
const std::map<int, conv_key_t> depthwise_conv_keys
= {{0, conv_key_t::key_gemm_tmp_buffer},
{1, conv_key_t::key_conv_permuted_weights}};
} // namespace

status_t acl_depthwise_convolution_fwd_t::execute_forward(
const exec_ctx_t &ctx) const {
std::lock_guard<std::mutex> _lock {this->mtx};
return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, data_t>(
ctx, acl_obj_.get(), pd(), depthwise_conv_keys);
}

status_t acl_depthwise_convolution_fwd_t::pd_t::init(engine_t *engine) {
using namespace data_type;

const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
&& attr()->has_default_values(
primitive_attr_t::skip_mask_t::post_ops, f16);
const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
&& attr()->has_default_values(
primitive_attr_t::skip_mask_t::post_ops, f32);
bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct)
&& utils::one_of(true, is_fp16_ok, is_fp32_ok)
&& !has_zero_dim_memory();
if (!ok) return status::unimplemented;

if (weights_md_.ndims != 5) return status::unimplemented;

auto *acl_resource
= ctx.get_resource_mapper()
->get<acl_depthwise_convolution_resource_t>(this);
acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer> &acl_depthwise_obj
= acl_resource->get_acl_obj();
CHECK(acl_convolution_utils::acl_init_conf(
acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));

return execute_forward_conv_acl<
acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>, pd_t, data_t>(
ctx, acl_depthwise_obj, pd());
ACL_CHECK_VALID(Op::validate(&acp_.src_tensor_info, &acp_.wei_tensor_info,
acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
&acp_.dst_tensor_info, acp_.padstride_info,
1, // depth multiplier default value
acp_.act_info, acp_.dilation_info));

Op conv;
conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
&acp_.dst_tensor_info, acp_.padstride_info,
1, // depth multiplier default value
acp_.act_info, acp_.dilation_info);

auto scratchpad = scratchpad_registry().registrar();
return init_scratchpad(conv, scratchpad, depthwise_conv_keys, engine,
post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
dst_md_);
}

status_t acl_depthwise_convolution_fwd_t::create_resource(
engine_t *engine, resource_mapper_t &mapper) const {
CHECK(pd()->post_ops.create_resource(engine, mapper));
return status::success;
}

status_t acl_depthwise_convolution_fwd_t::init(engine_t *engine) {
auto acp_ = pd()->acp_;
acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
&acp_.dst_tensor_info, acp_.padstride_info,
1, // depth multiplier default value
acp_.act_info, acp_.dilation_info);
acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
return status::success;
}
} // namespace aarch64
} // namespace cpu
} // namespace impl
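For orientation, the `weights_md_.ndims != 5` check in `pd_t::init` above means this implementation only handles true depthwise cases, i.e. grouped convolutions whose weights descriptor carries an explicit groups dimension (g, oc/g, ic/g, kh, kw). Below is a hedged user-side sketch of such a call, assuming the standard oneDNN v3.x C++ API; the shapes, strides, and padding are illustrative, and whether this ACL implementation is actually dispatched still depends on the build and platform.

```cpp
#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    // Depthwise 3x3: groups == channels, so the weights are 5D:
    // (g, oc_per_group, ic_per_group, kh, kw) = (32, 1, 1, 3, 3).
    const memory::dim N = 1, C = 32, H = 56, W = 56;
    memory::desc src_md({N, C, H, W}, memory::data_type::f32,
            memory::format_tag::any);
    memory::desc wei_md({C, 1, 1, 3, 3}, memory::data_type::f32,
            memory::format_tag::any);
    memory::desc dst_md({N, C, H, W}, memory::data_type::f32,
            memory::format_tag::any);

    // Stride 1 with padding 1 keeps the output at 56x56.
    auto pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            src_md, wei_md, dst_md, /*strides=*/{1, 1},
            /*padding_l=*/{1, 1}, /*padding_r=*/{1, 1});

    // Buffers are left uninitialized; a real caller would fill src/weights.
    memory src_mem(pd.src_desc(), eng), wei_mem(pd.weights_desc(), eng),
            dst_mem(pd.dst_desc(), eng);

    convolution_forward(pd).execute(strm,
            {{DNNL_ARG_SRC, src_mem}, {DNNL_ARG_WEIGHTS, wei_mem},
                    {DNNL_ARG_DST, dst_mem}});
    strm.wait();
    return 0;
}
```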