Skip to content

Commit

Permalink
[GPU] Fix dynamic loop's not matched issue during multiple shapes are…
Browse files Browse the repository at this point in the history
… inferenced

- Disable memory reuse for updated shape in case of dep of loop
  • Loading branch information
kelvinchoi-intel committed Feb 23, 2024
1 parent c0908b3 commit e0ce33d
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ struct network {
evt = get_primitive_event(output_id);
return network_output(evt, get_output_memory(output_id), get_stream_ptr(), get_output_layout(output_id));
}
layout get_input_node_output_layout(const primitive_id& input_id) const;
layout get_node_output_layout(const primitive_id& output_id) const;
memory::ptr get_output_memory(const primitive_id& output_id);
layout get_output_layout(const primitive_id& output_id) const;
Expand Down
6 changes: 4 additions & 2 deletions src/plugins/intel_gpu/src/graph/loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,16 +375,18 @@ loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(
if (extern_mem_ptr != nullptr) {
layout sliced_layout = intern_prim->get_output_layout(internal_id.idx);
auto inter_mem_ptr = intern_prim->output_memory_ptr(internal_id.idx);
if (inter_mem_ptr == nullptr) {
if (inter_mem_ptr == nullptr || shape_changed()) {
// if inner body intern_prim has no output memory because it has dynamic shape,
// calculate inner body intern_prim layout using concat_mem's layout.
auto updated_sliced_layout = sliced_layout.get_partial_shape();
OPENVINO_ASSERT(updated_sliced_layout[io_prim_map.axis].is_static() || num_iterations > 0,
"Not allowed dynamic dimension for axis when num_iteraiont is negative");

auto origin_input_pshape = body_network->get_input_node_output_layout(internal_id.pid).get_partial_shape();
auto concat_pshape = extern_prim->get_output_layout().get_partial_shape();
const auto shape_size = concat_pshape.size();
for (size_t i = 0; i < shape_size; i++) {
if (updated_sliced_layout[i].is_dynamic()) {
if (origin_input_pshape[i].is_dynamic()) {
updated_sliced_layout[i] = concat_pshape[i];
}
}
Expand Down
9 changes: 9 additions & 0 deletions src/plugins/intel_gpu/src/graph/network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,15 @@ std::string network::get_implementation_info(const primitive_id& id) const {
return _program->get_implementation_info(id);
}

// Looks up the network input primitive whose id equals `input_id` and returns
// the result of its get_node_output_layout().
// Fails via OPENVINO_ASSERT when no entry of `_inputs` carries that id.
layout network::get_input_node_output_layout(const primitive_id& input_id) const {
    const auto has_requested_id = [&](const std::shared_ptr<primitive_inst>& inst) {
        return inst->id() == input_id;
    };

    auto input_it = std::find_if(_inputs.begin(), _inputs.end(), has_requested_id);
    OPENVINO_ASSERT(input_it != _inputs.end(), "[GPU] Couldn't get input layout for ", input_id, ". Input with such name is not found in the inputs list");

    const auto& input_inst = *input_it;
    return input_inst->get_node_output_layout();
}

layout network::get_node_output_layout(const primitive_id& output_id) const {
auto res = std::find_if(_outputs.begin(), _outputs.end(), [&](const std::shared_ptr<primitive_inst>& v) {
return v->id() == output_id;
Expand Down
14 changes: 13 additions & 1 deletion src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,8 +537,20 @@ event::ptr primitive_inst::realloc_if_needed() {
}
}

bool has_loop_user = false;
for (const auto& user : _node->get_users()) {
if (user->is_type<loop>()) {
has_loop_user = true;
}
}

// Clear out memory if it was previously reused, but now primitive can't be optimized
if (_node->is_type<gather>() || _node->is_type<permute>() || _node->is_type<reshape>() || _node->is_type<reorder>() || _node->is_type<strided_slice>()) {
if (!has_loop_user &&
(_node->is_type<gather>() ||
_node->is_type<permute>() ||
_node->is_type<reshape>() ||
_node->is_type<reorder>() ||
_node->is_type<strided_slice>())) {
if (can_be_optimized()) {
_max_output_layout_count = _deps[0].first->_max_output_layout_count;
GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized");
Expand Down
167 changes: 167 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -601,3 +601,170 @@ TEST(loop_gpu, support_dynamic_tensoriterator_outer_axis) {

test_loop_gpu_wo_trip_count({ 2, 1, 1, 2}, { 2, 5, 1, 2}, input_data_5_4, output_data_5_4, 1, 4);
}

// Builds one loop network with a dynamic-shape input (no trip count, exit driven by the body
// condition) and executes it once per entry of `whole_layouts`, so the same network instance is
// inferred with several concrete input shapes in a row.
//
// The loop body slices "input" along `axis` and, per iteration, produces
//   b_add = slice + b_index_cast   and   b_mul = slice * b_index_cast,
// where b_index_cast is the f32 cast of (b_index + 1). The outer "out_sum" adds both loop outputs.
//
// body_input_layout     partial shape used for the outer "input" layout (may contain dynamic dims)
// whole_layouts         concrete input shapes, one per inference
// input_data_list       input values, one vector per entry of `whole_layouts` (sizes must match)
// expected_output_data  reference output; when empty, the reference is computed from the input.
//                       When non-empty, the same reference is used for every shape.
// axis                  axis along which the input is sliced and the outputs are concatenated
// exit_value            value compared against b_index (b_index < exit_value) as body condition;
//                       the test expects exit_value + 1 iterations
// is_caching_test       forwarded to get_network()
static void test_loop_gpu_wo_trip_count_w_multiple_shapes(ov::PartialShape body_input_layout,
                                                          std::vector<ov::PartialShape> whole_layouts,
                                                          std::vector<std::vector<float>> input_data_list,
                                                          std::vector<float> expected_output_data,
                                                          size_t axis,
                                                          size_t exit_value,
                                                          bool is_caching_test = false) {
    // Shapes and input buffers are indexed in lockstep below; reject mismatched lists up front
    // instead of reading past the end of input_data_list.
    ASSERT_EQ(whole_layouts.size(), input_data_list.size());

    auto& engine = get_test_engine();

    auto b_input_layout = cldnn::layout{ body_input_layout, data_types::f32, format::bfyx };

    ov::PartialShape sliced_input_shape = body_input_layout;
    sliced_input_shape[axis] = 1;
    auto sliced_input_layout = cldnn::layout{ sliced_input_shape, data_types::f32, format::bfyx };

    auto const_layout = cldnn::layout{ {}, data_types::i64, format::bfyx };

    auto e_initial_condition_mem = engine.allocate_memory(const_layout);
    auto e_num_iteration_mem = engine.allocate_memory(const_layout);
    auto b_exit_value_mem = engine.allocate_memory(const_layout);
    auto b_index_inc_mem = engine.allocate_memory(const_layout);

    // initialize input buffers
    set_values(e_initial_condition_mem, {1});
    set_values(b_exit_value_mem, {exit_value});
    set_values(b_index_inc_mem, {1});
    set_values(e_num_iteration_mem, {0});

    primitive_id body_current_iteration_id = "b_index";
    primitive_id body_execution_condition_id = "b_cond_exit_value";

    cldnn::topology body(
        input_layout(body_current_iteration_id, const_layout),
        input_layout("b_add_data", sliced_input_layout),
        input_layout("b_mul_data", sliced_input_layout),
        data("b_exit_value", b_exit_value_mem),
        data("b_index_inc", b_index_inc_mem),
        eltwise("b_index_update", input_info(body_current_iteration_id), input_info("b_index_inc"), eltwise_mode::sum),
        reorder("b_index_cast", input_info("b_index_update"),
                cldnn::format::any, data_types::f32, {}, cldnn::reorder_mean_mode::subtract, cldnn::padding(), true),
        eltwise(body_execution_condition_id, input_info("b_index"), input_info("b_exit_value"), eltwise_mode::lt),
        eltwise("b_add", input_info("b_add_data"), input_info("b_index_cast"), eltwise_mode::sum),
        eltwise("b_mul", input_info("b_mul_data"), input_info("b_index_cast"), eltwise_mode::prod));

    primitive_id trip_count_id = "";
    primitive_id actual_iteration_count_id = "actual_iteration_count";
    primitive_id initial_condition_id = "initial_condition";
    int64_t num_iterations = -1;

    std::vector<loop::io_primitive_map> input_primitive_maps {
        loop::io_primitive_map("input", "b_add_data", axis),
        loop::io_primitive_map("input", "b_mul_data", axis),
        loop::io_primitive_map(actual_iteration_count_id, body_current_iteration_id) };
    std::vector<loop::io_primitive_map> output_primitive_maps {
        loop::io_primitive_map(cldnn::input_info("loop", 0), cldnn::input_info("b_add", 0), axis),
        loop::io_primitive_map(cldnn::input_info("loop", 1), cldnn::input_info("b_mul", 0), axis) };
    std::vector<loop::backedge_mapping> back_edges {
        loop::backedge_mapping("b_index_update", body_current_iteration_id) };

    auto body_program = build_program(engine, body, body_execution_condition_id, output_primitive_maps, back_edges, true);

    cldnn::topology topology(
        input_layout("input", b_input_layout),
        input_layout(initial_condition_id, e_initial_condition_mem->get_layout()),
        mutable_data(actual_iteration_count_id, e_num_iteration_mem),
        loop("loop", { input_info(actual_iteration_count_id), input_info(initial_condition_id), input_info("input") }, body_program,
             trip_count_id, initial_condition_id, actual_iteration_count_id,
             input_primitive_maps, output_primitive_maps, back_edges,
             num_iterations, body_current_iteration_id, body_execution_condition_id, 2),
        eltwise("out_sum", input_info("loop", 0), input_info("loop", 1), eltwise_mode::sum));

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);

    // Run the very same network once per shape; this is the scenario under test.
    for (size_t shape_idx = 0; shape_idx < whole_layouts.size(); shape_idx++) {
        const auto& whole_layout = whole_layouts[shape_idx];
        const auto& input_data = input_data_list[shape_idx];

        // re-initialize the control buffers before every inference
        set_values(e_initial_condition_mem, {1});
        set_values(b_exit_value_mem, {exit_value});
        set_values(b_index_inc_mem, {1});
        set_values(e_num_iteration_mem, {0});

        auto e_input_layout = cldnn::layout{ whole_layout, data_types::f32, format::bfyx };
        auto e_input_mem = engine.allocate_memory(e_input_layout);  // bfyx-format input buffer
        auto expected_output_layout = whole_layout;
        set_values(e_input_mem, input_data);
        network->set_input_data("input", e_input_mem);

        network->set_input_data(initial_condition_id, e_initial_condition_mem);

        auto outputs = network->execute();
        ASSERT_EQ(outputs.size(), size_t(1));

        // The test expects the loop to run exit_value + 1 times; the concatenated output
        // therefore has that many elements along `axis`.
        const auto expected_num_iterations = static_cast<int64_t>(exit_value + 1);
        expected_output_layout[axis] = expected_num_iterations;
        auto e_output_layout = cldnn::layout{ expected_output_layout, data_types::f32, format::bfyx };

        auto num_iter_mem = network->get_output_memory(actual_iteration_count_id);
        if (num_iter_mem != nullptr) {
            mem_lock<int64_t> num_iter_ptr{ num_iter_mem, get_test_stream() };
            ASSERT_EQ(num_iter_ptr.data()[0], expected_num_iterations);
        }

        std::vector<float> expected(input_data.size());
        if (expected_output_data.size() == 0) {
            // Number of elements per step along `axis` (product of the dimensions after it).
            size_t inner_stride = 1;
            for (size_t k = axis + 1; k < whole_layout.size(); k++) {
                inner_stride *= static_cast<size_t>(whole_layout[k].get_length());
            }
            // Number of elements spanned by the whole `axis` dimension.
            const size_t unit = inner_stride * static_cast<size_t>(whole_layout[axis].get_length());

            for (size_t j = 0; j < input_data.size(); j++) {
                // 1-based position of element j along `axis`; this is the value the body
                // adds to / multiplies with the slice in that iteration.
                const size_t val = (j % unit) / inner_stride + 1;
                expected[j] = static_cast<float>(input_data[j] + val) + static_cast<float>(input_data[j] * val);
            }
        } else {
            expected = expected_output_data;
        }

        auto output_mem = outputs.begin()->second.get_memory();
        auto output_layout = output_mem->get_layout();
        ASSERT_EQ(output_layout.batch(), e_output_layout.batch());
        ASSERT_EQ(output_layout.feature(), e_output_layout.feature());
        ASSERT_EQ(output_layout.spatial(0), e_output_layout.spatial(0));
        ASSERT_EQ(output_layout.spatial(1), e_output_layout.spatial(1));
        // value check
        {
            mem_lock<float> output_ptr{ output_mem, get_test_stream() };
            for (size_t elem = 0, elem_end = output_layout.count(); elem < elem_end; ++elem) {
                ASSERT_FLOAT_EQ(output_ptr[elem], expected.at(elem));
            }
        }
    }
}

// 16 input values (4 rows of 4). Passed as the input for shape { 1, 1, 4, 4 } in
// TEST(loop_gpu, support_loop_w_dynamic_input_w_various_shapes).
std::vector<float> input_data_4_4{
1.0f, 2.0f, -15.f, 3.0f,
4.0f, -15.f, 5.0f, 6.0f,
-15.f, 7.0f, -15.f, 0.0f,
0.0f, -15.f, 0.5f, -0.5f,
};

// 32 input values: the same 16 values as input_data_4_4, listed twice. Passed as the input for
// shape { 1, 2, 4, 4 } in TEST(loop_gpu, support_loop_w_dynamic_input_w_various_shapes).
std::vector<float> input_data_2_4_4{
1.0f, 2.0f, -15.f, 3.0f,
4.0f, -15.f, 5.0f, 6.0f,
-15.f, 7.0f, -15.f, 0.0f,
0.0f, -15.f, 0.5f, -0.5f,

1.0f, 2.0f, -15.f, 3.0f,
4.0f, -15.f, 5.0f, 6.0f,
-15.f, 7.0f, -15.f, 0.0f,
0.0f, -15.f, 0.5f, -0.5f,
};

// Runs one dynamic-input loop network with two different concrete input shapes in sequence.
TEST(loop_gpu, support_loop_w_dynamic_input_w_various_shapes) {
    // Outer input layout: feature dimension is dynamic.
    const ov::PartialShape dynamic_input_shape{ 1, -1, 4, 4 };

    // axis value should be iter_num = (exit_value + 1)
    const std::vector<ov::PartialShape> concrete_shapes{
        ov::PartialShape{ 1, 1, 4, 4 },
        ov::PartialShape{ 1, 2, 4, 4 },
    };
    const std::vector<std::vector<float>> inputs_per_shape{ input_data_4_4, input_data_2_4_4 };

    // Empty reference: the helper computes the expected values from the input itself.
    const std::vector<float> no_reference_output;

    const size_t slice_axis = 2;
    const size_t loop_exit_value = 3;

    test_loop_gpu_wo_trip_count_w_multiple_shapes(dynamic_input_shape,
                                                  concrete_shapes,
                                                  inputs_per_shape,
                                                  no_reference_output,
                                                  slice_axis,
                                                  loop_exit_value);
}

0 comments on commit e0ce33d

Please sign in to comment.