Fix unexpected runtime skip permute
yeonbok committed Oct 31, 2024
1 parent c158480 commit 9e67731
Showing 2 changed files with 117 additions and 27 deletions.
65 changes: 38 additions & 27 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -1426,36 +1426,47 @@ void primitive_inst::do_runtime_skip_permute() {
auto desc = _node->as<permute>().get_primitive();
auto input_shape = _impl_params->get_input_layout(0).get_shape();
const auto& permute_order = desc->permute_order;
// Check the runtime shape
// Optimize only when, among the dims whose permute order differs from their index, the largest
// dim value equals the product of those dims (i.e. at most one of them is larger than 1)
// size starts at zero so that an identity order such as [0, 1, 2, 3] also passes the check
int32_t size = 0;
int32_t max_value = 0;
for (int32_t i = 0; i < static_cast<int32_t>(permute_order.size()); ++i) {
int32_t order = static_cast<int32_t>(permute_order[i]);
int32_t dim = static_cast<int32_t>(input_shape[order]);
if (i != order) {
if (dim > max_value)
max_value = dim;
size = (size == 0) ? dim : (size * dim);
}
}

// If the largest value and total size are different, can_be_optimized needs to be reset
if (size != max_value) {
set_can_be_optimized(false);
GPU_DEBUG_TRACE_DETAIL << "--- Cannot optimize because size(" << size << ") and max_value(" << max_value
<< ") are different" << std::endl;

GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_permute] " << id() << " : reset can_be_optimized to false "
<< std::endl;
return;
// Skippability conditions
// 1. Split the permute order into contiguous transpose ranges, e.g.:
// [2, 1, 0] => [2, 1, 0]
// [1, 0, 2, 3] => [1, 0, 2]
// [0, 2, 1, 3] => [2, 1]
// [0, 3, 1, 2] => [3, 1, 2]
// [2, 0, 1, 3] => [2, 0, 1]
// [3, 2, 1, 0] => [3, 2, 1, 0]
// [3, 2, 1, 0, 4] => [3, 2, 1, 0]
// [0, 2, 1, 3, 5, 4] => [2, 1], [5, 4]
// [4, 5, 2, 3, 0, 1] => [4, 5, 2, 3, 0, 1]
// 2. Within each transpose range, at most one dimension may have an extent greater than 1.
size_t range_max_dim = 0;
size_t count_not_one = 0;
bool can_skip = true;
for (size_t dim = 0; dim < permute_order.size(); ++dim) {
auto target_dim = static_cast<size_t>(permute_order[dim]);
if (dim == target_dim && range_max_dim <= dim) {
// End of a transpose range: it may hold at most one non-1 dim
if (count_not_one > 1) {
can_skip = false;
break;
}
count_not_one = 0;
range_max_dim = 0;
continue;
}
if (input_shape[dim] > 1)
count_not_one++;
range_max_dim = std::max(range_max_dim, target_dim);
}
GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_permute] " << id() << " : can_be_optimized" << std::endl;
// Check the trailing transpose range in case it did not close inside the loop
can_skip = (count_not_one <= 1);
GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_permute] " << id() << " : can_be_optimized ? " << can_skip << std::endl;
GPU_DEBUG_TRACE_DETAIL << " - Input layout : " << _impl_params->get_input_layout(0).to_short_string() << std::endl;
GPU_DEBUG_TRACE_DETAIL << " - Output layout : " << _impl_params->get_output_layout().to_short_string() << std::endl;
set_can_be_optimized(true);
GPU_DEBUG_TRACE_DETAIL << " - permute order : ";
for (auto order : permute_order) {
GPU_DEBUG_TRACE_DETAIL << order << ",";
}
GPU_DEBUG_TRACE_DETAIL << std::endl;
set_can_be_optimized(can_skip);
}

void primitive_inst::do_runtime_skip_strided_slice() {
@@ -0,0 +1,79 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_utils.h"

#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/permute.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/data.hpp>

#include "permute_inst.h"
#include "program_wrapper.h"

#include <cmath>
#include <algorithm>

using namespace cldnn;
using namespace ::tests;

namespace skip_permute_tests {

struct skip_permute_params {
layout input_layout_static;
std::vector<uint16_t> permute_order;
bool expected_result1;  // can_be_optimized right after network build, while the shape is still dynamic
bool expected_result2;  // can_be_optimized after execution with the static input shape
};

class skip_permute_at_runtime_test : public testing::TestWithParam<skip_permute_params> {};

TEST_P(skip_permute_at_runtime_test, runtime_skip) {
auto p = GetParam();
auto& engine = get_test_engine();
auto rank = p.input_layout_static.get_partial_shape().size();
auto input_layout_dynamic = layout {ov::PartialShape::dynamic(rank), data_types::f16, format::get_default_format(rank)};
topology topology(input_layout("input", input_layout_dynamic),
permute("permute", input_info("input"), p.permute_order),
reorder("reorder", input_info("permute"), format::get_default_format(rank), data_types::f32));

ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));

network network(engine, topology, config);
auto permute_inst = network.get_primitive("permute");
ASSERT_EQ(permute_inst->can_be_optimized(), p.expected_result1);

auto input_mem = engine.allocate_memory(p.input_layout_static);
network.set_input_data("input", input_mem);
auto outputs = network.execute();
outputs.begin()->second.get_memory();

ASSERT_EQ(permute_inst->can_be_optimized(), p.expected_result2);
}

INSTANTIATE_TEST_SUITE_P(smoke, skip_permute_at_runtime_test,
testing::ValuesIn(std::vector<skip_permute_params> {
{ layout{ov::PartialShape{8, 2, 8}, data_types::f16, format::bfyx}, {2, 1, 0}, true, false },
{ layout{ov::PartialShape{8, 2, 1}, data_types::f16, format::bfyx}, {2, 1, 0}, true, false },
{ layout{ov::PartialShape{1, 12, 1}, data_types::f16, format::bfyx}, {2, 1, 0}, true, true },
{ layout{ov::PartialShape{2, 3, 1, 14}, data_types::f16, format::bfyx}, {1, 0, 2, 3}, true, false },
{ layout{ov::PartialShape{1, 3, 1, 14}, data_types::f16, format::bfyx}, {1, 0, 2, 3}, true, true },
{ layout{ov::PartialShape{12, 3, 1, 14}, data_types::f16, format::bfyx}, {0, 2, 1, 3}, true, true },
{ layout{ov::PartialShape{12, 3, 2, 14}, data_types::f16, format::bfyx}, {0, 2, 1, 3}, true, false },
{ layout{ov::PartialShape{12, 1, 1, 14}, data_types::f16, format::bfyx}, {0, 3, 1, 2}, true, true },
{ layout{ov::PartialShape{12, 1, 1, 14}, data_types::f16, format::bfyx}, {0, 3, 1, 2}, true, true },
{ layout{ov::PartialShape{1, 1, 1, 14}, data_types::f16, format::bfyx}, {0, 3, 1, 2}, true, true },
{ layout{ov::PartialShape{1, 3, 2, 14}, data_types::f16, format::bfyx}, {0, 3, 1, 2}, true, false },
{ layout{ov::PartialShape{1, 1, 4, 14}, data_types::f16, format::bfyx}, {2, 0, 1, 3}, true, true },
{ layout{ov::PartialShape{1, 4, 4, 1}, data_types::f16, format::bfyx}, {2, 0, 1, 3}, true, false },
{ layout{ov::PartialShape{1, 10, 1, 1, 11}, data_types::f16, format::bfzyx}, {3, 2, 1, 0, 4}, true, true },
{ layout{ov::PartialShape{1, 10, 2, 1, 10}, data_types::f16, format::bfzyx}, {3, 2, 1, 0, 4}, true, false },
{ layout{ov::PartialShape{1, 4, 1, 3, 4, 1}, data_types::f16, format::bfwzyx}, {0, 2, 1, 3, 5, 4}, true, true },
{ layout{ov::PartialShape{1, 4, 2, 3, 4, 1}, data_types::f16, format::bfwzyx}, {0, 2, 1, 3, 5, 4}, true, false },
{ layout{ov::PartialShape{1, 1, 1, 1, 4, 1}, data_types::f16, format::bfwzyx}, {4, 5, 2, 3, 0, 1}, true, true },
{ layout{ov::PartialShape{1, 1, 1, 1, 4, 2}, data_types::f16, format::bfwzyx}, {4, 5, 2, 3, 0, 1}, true, false },
}));
}  // namespace skip_permute_tests
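The skippability rule added above can also be exercised in isolation. The following is a minimal standalone sketch, not part of the intel_gpu plugin; the helper name is_permute_skippable and the plain std::vector interface are assumptions made here for illustration. It mirrors the loop in do_runtime_skip_permute(): a permute is treated as a no-op only if every contiguous transpose range touches at most one dimension with extent greater than 1.

// Standalone sketch (assumed helper, not the plugin API) of the skippability rule
// implemented in do_runtime_skip_permute() above.
#include <cstddef>
#include <cstdint>
#include <algorithm>
#include <iostream>
#include <vector>

static bool is_permute_skippable(const std::vector<uint16_t>& order,
                                 const std::vector<size_t>& shape) {
    size_t range_max_dim = 0;   // largest target index seen in the current transpose range
    size_t count_not_one = 0;   // dims with extent > 1 in the current transpose range
    for (size_t dim = 0; dim < order.size(); ++dim) {
        auto target_dim = static_cast<size_t>(order[dim]);
        if (dim == target_dim && range_max_dim <= dim) {
            // A transpose range closes here: it may contain at most one non-1 dim
            if (count_not_one > 1)
                return false;
            count_not_one = 0;
            range_max_dim = 0;
            continue;
        }
        if (shape[dim] > 1)
            count_not_one++;
        range_max_dim = std::max(range_max_dim, target_dim);
    }
    // Check the trailing range in case it did not close inside the loop
    return count_not_one <= 1;
}

int main() {
    // Mirrors two of the parameterized cases above: {1, 12, 1} permuted by {2, 1, 0}
    // moves only size-1 dims and is skippable, while {8, 2, 8} is not.
    std::cout << std::boolalpha
              << is_permute_skippable({2, 1, 0}, {1, 12, 1}) << "\n"   // true
              << is_permute_skippable({2, 1, 0}, {8, 2, 8}) << "\n";   // false
    return 0;
}

Running this sketch against the parameter list above should reproduce the expected_result2 column, since it applies the same range check to the static input shapes.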
