fix: revert acl_winograd_convolution to stateful #2357

Merged · 1 commit · Jan 9, 2025
6 changes: 1 addition & 5 deletions src/common/memory_tracking.hpp
@@ -1,6 +1,6 @@
/*******************************************************************************
* Copyright 2018-2024 Intel Corporation
* Copyright 2024 Arm Ltd. and affiliates
* Copyright 2024-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -205,8 +205,6 @@ enum {
key_conv_ncsp_matmul_dst,
key_conv_ncsp_diff_sp_sum,
key_conv_padded_bias,
key_conv_permuted_inputs,
key_conv_permuted_outputs,
key_conv_permuted_weights,
key_conv_rtus_space,
key_conv_store_wsp,
@@ -317,11 +315,9 @@
key_softmax_interim_store,
key_sum_reduction,
key_sum_srcs_cvt,
key_wino_transformed_weights,
key_wino_U,
key_wino_V,
key_wino_M,
key_wino_workspace,
// These two keys should always be the last ones,
// even though they are not in alphabetical order
key_nested,
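Note on these hunks: the permuted-input/output and Winograd scratchpad keys were only booked by the stateless hotfix; with the primitive stateful again, nothing references them, so they are dropped from the enum. For context, a minimal sketch of how a primitive descriptor books scratchpad space against one of the keys that remains — the surrounding init_scratchpad() plumbing is assumed here, not part of this PR:

```cpp
#include "common/memory_tracking.hpp"

// Hedged sketch: booking scratchpad against a memory_tracking key inside a
// primitive descriptor (the element count is an illustrative assumption).
void init_scratchpad_sketch(
        dnnl::impl::memory_tracking::registrar_t &scratchpad,
        size_t weights_nelems) {
    using namespace dnnl::impl::memory_tracking::names;
    // One float per element of the permuted-weights buffer.
    scratchpad.book(key_conv_permuted_weights, weights_nelems, sizeof(float));
}
```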
54 changes: 52 additions & 2 deletions src/cpu/aarch64/acl_convolution_utils.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Arm Ltd. and affiliates
* Copyright 2020-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
* limitations under the License.
*******************************************************************************/

#include "acl_convolution_utils.hpp"
#include "cpu/aarch64/acl_convolution_utils.hpp"
#include "common/convolution_pd.hpp"
#include "common/utils.hpp"
#include "oneapi/dnnl/dnnl.h"
@@ -283,6 +283,56 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,

return status::success;
}

status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr) {

// Under these conditions, fallback to faster GEMM-based convolution
// unless the user explicitly specifies Winograd algorithm
// clang-format off
if (one_of(true, src_md.dims[2] > 112, // ih
src_md.dims[3] > 112, // iw
src_md.dims[1] < 64, // ic
dst_md.dims[1] < 64, // oc
dnnl_get_max_threads() > 28)
&& cd.alg_kind == alg_kind::convolution_auto) {
return status::unimplemented;
}
// clang-format on

// General Compute Library checks, memory tags are also set there
acp.alg_winograd = true;
CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

const bool shape_ok
// only unit strides allowed
= (acp.padstride_info.stride() == std::pair<uint, uint> {1, 1})
// Note: Compute Library supports arbitrary padding for wino kernels
// but we only allow small padding to be consistent with oneDNN
&& (acp.padstride_info.pad().first <= 1) // padding left/right
&& (acp.padstride_info.pad().second <= 1) // padding top/bottom
// only non-dilated convolutions allowed
&& (acp.dilation_info == arm_compute::Size2D(1, 1));

ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");

// clang-format off
// Validate convolution manually to check for return status
ACL_CHECK_VALID(arm_compute::NEWinogradConvolutionLayer::validate(
&acp.src_tensor_info,
&acp.wei_tensor_info,
acp.with_bias ? &acp.bia_tensor_info : nullptr,
&acp.dst_tensor_info,
acp.padstride_info,
acp.act_info,
true)); // enable_fast_math flag in ACL Winograd
// clang-format on

return status::success;
}

} // namespace acl_convolution_utils

} // namespace aarch64
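Taken together, the restored init_conf_wino() admits only unit-stride, padding ≤ 1, non-dilated shapes, and under convolution_auto it defers to GEMM for large spatial sizes (ih/iw > 112), thin channels (ic/oc < 64), or more than 28 threads. A user-level sketch — all shapes are illustrative assumptions — of a convolution that passes these checks when Winograd is requested explicitly:

```cpp
#include "oneapi/dnnl/dnnl.hpp"
using namespace dnnl;

int main() {
    engine eng(engine::kind::cpu, 0);
    // 3x3 kernel, stride 1, pad 1, no dilation; ih = iw = 56 <= 112 and
    // ic = oc = 64, so neither shape_ok nor the GEMM heuristic rejects it.
    memory::desc src_md({1, 64, 56, 56}, memory::data_type::f32,
            memory::format_tag::any);
    memory::desc wei_md({64, 64, 3, 3}, memory::data_type::f32,
            memory::format_tag::any);
    memory::desc dst_md({1, 64, 56, 56}, memory::data_type::f32,
            memory::format_tag::any);
    auto pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_winograd,
            src_md, wei_md, dst_md,
            /*strides=*/{1, 1}, /*padding_l=*/{1, 1}, /*padding_r=*/{1, 1});
    auto conv = convolution_forward(pd);
    (void)conv;
    return 0;
}
```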
58 changes: 57 additions & 1 deletion src/cpu/aarch64/acl_convolution_utils.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Arm Ltd. and affiliates
* Copyright 2020-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -30,6 +30,10 @@ namespace aarch64 {

template <typename ConvOp>
struct acl_obj_t {
arm_compute::Tensor src_tensor;
arm_compute::Tensor wei_tensor;
arm_compute::Tensor bia_tensor;
arm_compute::Tensor dst_tensor;
ConvOp conv;
arm_compute::experimental::MemoryRequirements aux_mem_req;
};
@@ -64,6 +68,11 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);

status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);

} // namespace acl_convolution_utils

// Keys are anonymous with local linkage. So deduce the type automagically.
@@ -175,6 +184,53 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
return status::success;
}

template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
typename bia_data_t = src_data_t>
status_t execute_forward_conv_acl(
const exec_ctx_t &ctx, conv_obj_t &acl_conv_obj, const conv_pd_t *pd) {
bool with_bias = pd->acp_.with_bias;
bool use_dst_acc_for_sum = pd->acp_.use_dst_acc_for_sum;

auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);

// import_memory() and free() methods do not allocate/free any additional
// memory, only acquire/release pointers.
acl_conv_obj.src_tensor.allocator()->import_memory(
const_cast<src_data_t *>(src_base));
acl_conv_obj.wei_tensor.allocator()->import_memory(
const_cast<wei_data_t *>(wei_base));

const auto scratchpad = ctx.get_scratchpad_grantor();

// If we have an unfused sum post op, put the result in a scratchpad tensor.
// Result will be summed to the dst during acl_post_ops.execute
auto dst_base = use_dst_acc_for_sum
? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
: CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);

if (with_bias) {
auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
acl_conv_obj.bia_tensor.allocator()->import_memory(
const_cast<bia_data_t *>(bia_base));
}

acl_conv_obj.conv.run();

acl_conv_obj.src_tensor.allocator()->free();
acl_conv_obj.wei_tensor.allocator()->free();
if (with_bias) { acl_conv_obj.bia_tensor.allocator()->free(); }

void *dst = acl_conv_obj.dst_tensor.buffer();
pd->post_ops.execute(ctx, dst);

acl_conv_obj.dst_tensor.allocator()->free();

return status::success;
}

} // namespace aarch64
} // namespace cpu
} // namespace impl
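The comment in the restored overload is the key invariant: import_memory() and free() only acquire and release pointers, so wrapping oneDNN's buffers in the primitive's member ACL tensors is allocation-free on the execute path. A standalone sketch of that pattern against the ACL tensor API (buffer and shape are illustrative assumptions):

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

// Wrap an externally owned buffer in an ACL tensor, run work, release it.
void wrap_external_buffer(float *external, size_t nelems) {
    arm_compute::Tensor t;
    t.allocator()->init(arm_compute::TensorInfo(
            arm_compute::TensorShape(nelems), 1, arm_compute::DataType::F32));
    // Acquires the pointer only: no copy, no allocation.
    t.allocator()->import_memory(external);
    // ... run an ACL operator reading/writing t ...
    // Releases the pointer; the external buffer itself is untouched.
    t.allocator()->free();
}
```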
32 changes: 5 additions & 27 deletions src/cpu/aarch64/acl_gemm_convolution.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Arm Ltd. and affiliates
* Copyright 2020-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -89,44 +89,22 @@ template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
data_type_t bia_t>
status_t acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t, bia_t>::init(
engine_t *engine) {
// commented due to hot fix solution for stateless API which should be replaced soon.
// auto acp_ = pd()->acp_;
// acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
// acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
// &acp_.dst_tensor_info, acp_.padstride_info, acp_.weights_info,
// acp_.dilation_info, acp_.act_info, acp_.fast_math);
// acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
return status::success;
}

template <data_type_t src_type, data_type_t wei_type, data_type_t dst_type,
data_type_t bia_type>
std::unique_ptr<acl_obj_t<typename acl_gemm_convolution_fwd_t<src_type,
wei_type, dst_type, bia_type>::Op>>
acl_gemm_convolution_fwd_t<src_type, wei_type, dst_type,
bia_type>::reinitialize_acl_obj() const {
auto acp_ = pd()->acp_;
std::unique_ptr<acl_obj_t<Op>> acl_obj = std::make_unique<acl_obj_t<Op>>();
acl_obj->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
&acp_.dst_tensor_info, acp_.padstride_info, acp_.weights_info,
acp_.dilation_info, acp_.act_info, acp_.fast_math);
acl_obj->aux_mem_req = acl_obj->conv.workspace();
return acl_obj;
acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
return status::success;
}

template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
data_type_t bia_t>
status_t
acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t, bia_t>::execute_forward(
const exec_ctx_t &ctx) const {
// Temporary hotfix: We're using a local acl_obj instance in this method
// instead of the class member acl_obj_. This hotfix is to bypass persistent aux mem requirements but is not the ideal solution.
// It should be refactored or removed in the future when a more permanent fix is implemented.
auto acl_obj = reinitialize_acl_obj();

return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, src_data_t, wei_data_t,
dst_data_t, bia_data_t>(ctx, acl_obj.get(), pd(), gemm_conv_keys);
dst_data_t, bia_data_t>(ctx, acl_obj_.get(), pd(), gemm_conv_keys);
}

using namespace data_type;
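The net effect of this file's changes: execute_forward() no longer rebuilds and re-configures a fresh acl_obj_t on every call; configure() runs once in init() against the member acl_obj_, restoring the stateful design. A generic sketch of the pattern being restored — the types here are illustrative stand-ins, not the oneDNN classes:

```cpp
#include <memory>

// Illustrative stand-in for an operator with expensive one-time setup.
struct Op {
    void configure(int /*params*/) { /* expensive kernel selection/setup */ }
    void run() const { /* cheap steady-state execution */ }
};

struct stateful_primitive_t {
    std::unique_ptr<Op> op = std::make_unique<Op>();

    // init(): the expensive configure() happens exactly once.
    void init(int params) { op->configure(params); }

    // execute(): allocation-free on the hot path, unlike the reverted
    // hotfix that re-created and re-configured an Op per call.
    void execute() const { op->run(); }
};

int main() {
    stateful_primitive_t prim;
    prim.init(/*params=*/0);
    for (int i = 0; i < 100; ++i) prim.execute(); // configure cost amortized
}
```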
17 changes: 4 additions & 13 deletions src/cpu/aarch64/acl_gemm_convolution.hpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Arm Ltd. and affiliates
* Copyright 2020-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -47,10 +47,8 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
acl_post_ops_t post_ops;
};

// hot fix solution for stateless API which should be replaced soon.
// acl_gemm_convolution_fwd_t(const pd_t *apd)
// : primitive_t(apd), acl_obj_(std::make_unique<acl_obj_t<Op>>()) {}
acl_gemm_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
acl_gemm_convolution_fwd_t(const pd_t *apd)
: primitive_t(apd), acl_obj_(std::make_unique<acl_obj_t<Op>>()) {}

status_t init(engine_t *engine) override;

@@ -65,15 +63,8 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {

private:
status_t execute_forward(const exec_ctx_t &ctx) const;

// hot fix solution for stateless API which should be replaced soon.
std::unique_ptr<acl_obj_t<Op>> reinitialize_acl_obj() const;

const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }

// commented due to hot fix solution for stateless API which should be replaced soon.
// std::unique_ptr<acl_obj_t<Op>> acl_obj_;

std::unique_ptr<acl_obj_t<Op>> acl_obj_;
}; // acl_gemm_convolution_fwd_t

} // namespace aarch64