[XPU] Add embedding plugin #56488

Merged: 26 commits, Aug 24, 2023

Changes from 23 commits

Commits (26)
ac8c61f
delete repeat ops: gather,squeeze,unsqueeze
csy0225 Jul 12, 2023
e82f825
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
csy0225 Jul 13, 2023
e2fea8c
add ut
csy0225 Jul 18, 2023
a965eed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
csy0225 Jul 18, 2023
fefe3bc
add transpose + matmul fuse
csy0225 Jul 28, 2023
365af8b
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
csy0225 Jul 28, 2023
00f264e
add ut
csy0225 Jul 28, 2023
fda5c8a
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
csy0225 Jul 31, 2023
c28e0e6
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
csy0225 Aug 2, 2023
73412c8
add conv2d trans filter case: dilation > 1
csy0225 Aug 4, 2023
ed0545a
merge develop into branch
csy0225 Aug 4, 2023
f6aefb0
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
csy0225 Aug 8, 2023
cef35c0
merge develop into branch
csy0225 Aug 8, 2023
ec44226
fix transfilter in fp16 mode
csy0225 Aug 8, 2023
204bf1f
add unit test
csy0225 Aug 9, 2023
521d2f2
support fp16 trans
csy0225 Aug 9, 2023
d807dca
fix comment
csy0225 Aug 9, 2023
b49efb4
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
csy0225 Aug 17, 2023
d557e8f
add xpu embedding tiny dict plugin
csy0225 Aug 21, 2023
659d682
merge develop into branch
heavengate Aug 21, 2023
a6af0d6
add cast pass
heavengate Aug 22, 2023
597127e
fix codestyle
heavengate Aug 22, 2023
6a08122
add unit test
heavengate Aug 22, 2023
67b90c2
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
heavengate Aug 22, 2023
3ef5492
fix comment
heavengate Aug 23, 2023
aab6c3b
fix comment
heavengate Aug 23, 2023
2 changes: 2 additions & 0 deletions paddle/fluid/framework/ir/CMakeLists.txt
@@ -239,6 +239,8 @@ if(WITH_XPU)
  pass_library(cast_mixed_precision_op_fuse_pass inference DIR xpu DEPS
               ${XPU_PASS_DEPS})
  pass_library(yolo_box_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
  pass_library(cast_embedding_trans_ids_to_int32_pass inference DIR xpu DEPS
               ${XPU_PASS_DEPS})
  pass_library(conv1d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
  pass_library(conv2d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
  pass_library(redundant_unsqueeze_squeeze_elimination_pass inference DIR xpu
133 changes: 133 additions & 0 deletions paddle/fluid/framework/ir/xpu/cast_embedding_trans_ids_to_int32_pass.cc
@@ -0,0 +1,133 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>

#include "glog/logging.h"

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"

namespace phi {
class DenseTensor;
} // namespace phi

namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle

namespace paddle {
namespace framework {
namespace ir {
namespace patterns {

struct CastEmbeddingTransIdsToInt32Pattern : public PatternBase {
  CastEmbeddingTransIdsToInt32Pattern(PDPattern* pattern,
                                      const std::string& name_scope);
  // declare operator node's name
  PATTERN_DECL_NODE(cast);
  PATTERN_DECL_NODE(embedding);
  // declare variable node's name
  PATTERN_DECL_NODE(cast_x);
  PATTERN_DECL_NODE(embedding_ids);
  PATTERN_DECL_NODE(embedding_w);
  PATTERN_DECL_NODE(embedding_out);
};

CastEmbeddingTransIdsToInt32Pattern::CastEmbeddingTransIdsToInt32Pattern(
    PDPattern* pattern, const std::string& name_scope)
    : PatternBase(pattern, name_scope, name_scope) {
  auto cast = pattern->NewNode(cast_repr())->assert_is_op("cast");
  auto cast_x = pattern->NewNode(cast_x_repr())
                    ->assert_is_op_input("cast", "X")
                    ->assert_var_not_persistable()
                    ->AsInput();
  auto embedding_ids = pattern->NewNode(embedding_ids_repr())
                           ->assert_is_op_output("cast", "Out")
                           ->assert_is_op_input("lookup_table_v2", "Ids")
                           ->assert_has_n_outputs(1);
  cast->LinksFrom({cast_x}).LinksTo({embedding_ids});
  auto embedding_w = pattern->NewNode(embedding_w_repr())
                         ->assert_is_op_input("lookup_table_v2", "W");
  auto embedding =
      pattern->NewNode(embedding_repr())->assert_is_op("lookup_table_v2");
  auto embedding_out = pattern->NewNode(embedding_out_repr())
                           ->assert_is_op_output("lookup_table_v2", "Out")
                           ->AsOutput();
  embedding->LinksFrom({embedding_ids, embedding_w}).LinksTo({embedding_out});
}

} // namespace patterns

class CastEmbeddingTransIdsToInt32Pass : public FusePassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;

 private:
  const std::string name_scope_{"cast_embedding_trans_ids_to_int32_pass"};
};
void CastEmbeddingTransIdsToInt32Pass::ApplyImpl(ir::Graph* graph) const {
  PADDLE_ENFORCE_NOT_NULL(
      graph, platform::errors::PreconditionNotMet("graph should not be null."));
  Init(name_scope_, graph);

  GraphPatternDetector gpd;
  patterns::CastEmbeddingTransIdsToInt32Pattern pattern(gpd.mutable_pattern(),
                                                        name_scope_);
  int found_subgraph_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* graph) {
    VLOG(4) << "handle CastEmbeddingTransIdsToInt32Pass";
    GET_IR_NODE(cast);
    GET_IR_NODE(embedding);
    GET_IR_NODE(embedding_ids);
    auto* block = cast->Op()->Block();
    auto cast_node_attr_in_dtype = cast->Op()->GetAttrIfExists<int>("in_dtype");
Contributor:

  1. Have you considered the case where in_dtype is int32 and out_dtype is int64? In theory the cast could simply be removed in that case.
  2. In theory in_dtype should not be restricted at all; isn't it enough to check that out_dtype == int64?

Contributor Author:

I don't think that case can show up in a cast + embedding combination: embedding already supports int32, so in theory the situation you describe won't occur.

Contributor Author:

I'll change this to guard against a model hitting that case; the current models don't contain it.

    auto cast_node_attr_out_dtype =
        cast->Op()->GetAttrIfExists<int>("out_dtype");
    if (cast_node_attr_in_dtype !=
            static_cast<int>(paddle::framework::proto::VarType::FP32) &&
        cast_node_attr_out_dtype !=
            static_cast<int>(paddle::framework::proto::VarType::INT64)) {
      return;
    }
    cast->Op()->SetAttr(
        "out_dtype",
        static_cast<int>(paddle::framework::proto::VarType::INT32));
    embedding_ids->Var()->SetDataType(paddle::framework::proto::VarType::INT32);
    embedding->Op()->Flush();
    found_subgraph_count++;
  };
  gpd(graph, handler);
  AddStatis(found_subgraph_count);
}

} // namespace ir
} // namespace framework
} // namespace paddle

REGISTER_PASS(cast_embedding_trans_ids_to_int32_pass,
              paddle::framework::ir::CastEmbeddingTransIdsToInt32Pass);

REGISTER_PASS_CAPABILITY(cast_embedding_trans_ids_to_int32_pass)
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination().LE(
            "lookup_table_v2", 1));
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -516,6 +516,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
      "reshape_unstack_concat_fuse_pass",
      "delete_op_device_pass",
      "constant_folding_pass",
      "cast_embedding_trans_ids_to_int32_pass",
      "delete_elementwise_mul_op_pass",
      "generate_sequence_xpu_fuse_pass",
      "embedding_with_eltwise_add_xpu_fuse_pass",
72 changes: 51 additions & 21 deletions paddle/phi/kernels/xpu/embedding_kernel.cc
@@ -44,18 +44,6 @@ void EmbeddingKernel(const Context &ctx,
  auto *table = table_t->data<T>();
  auto *output = dev_ctx.template Alloc<T>(output_t);

  xpu::ctx_guard RAII_GUARD(ctx.x_context());
  const int64_t *ids;
  if (ids_t->dtype() == phi::DataType::INT64) {
    ids = ids_t->data<int64_t>();
  } else {
    int64_t *ids_tt = RAII_GUARD.alloc_l3_or_gm<int64_t>(ids_t->numel());
    int r = xpu::cast<int32_t, int64_t>(
        ctx.x_context(), ids_t->data<int>(), ids_tt, ids_t->numel());
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
    ids = reinterpret_cast<const int64_t *>(ids_tt);
  }

  PADDLE_ENFORCE_EQ(
      ids_numel <= std::numeric_limits<int32_t>::max(),
      true,
@@ -68,15 +56,57 @@
  size_t xm = table_t->dims()[0];
  size_t n = table_t->dims()[1];

  int r = xpu::embedding<XPUType>(dev_ctx.x_context(),
                                  reinterpret_cast<const XPUType *>(table),
                                  ids,
                                  reinterpret_cast<XPUType *>(output),
                                  xm,
                                  n,
                                  ym,
                                  padding_idx);

  int r;
  xpu::ctx_guard RAII_GUARD(ctx.x_context());
  if (ids_t->dtype() == phi::DataType::INT64) {
#ifndef PADDLE_WITH_XPU_PLUGIN
    r = xpu::embedding<XPUType, int64_t>(
        dev_ctx.x_context(),
        reinterpret_cast<const XPUType *>(table),
        ids_t->data<int64_t>(),
        reinterpret_cast<XPUType *>(output),
        xm,
        n,
        ym,
        padding_idx);
#else
    r = xpu::plugin::embedding_tiny_dict<XPUType, int64_t>(
Contributor:

Let's unify the name as fast_embedding; the accelerated variants for the various cases can then be dispatched inside the fast_embedding wrapper.

Contributor Author:

I don't think "fast" is a good name, since it doesn't convey the concrete scenario the kernel targets. This one is a kernel optimization for small embedding tables, so the current name feels more appropriate. A single plugin could be renamed to fast, but if we later need to optimize embedding for other scenarios, naming them would get awkward.

        dev_ctx.x_context(),
        reinterpret_cast<const XPUType *>(table),
        ids_t->data<int64_t>(),
        reinterpret_cast<XPUType *>(output),
        xm,
        n,
        ym,
        padding_idx);
#endif
  } else {
#ifndef PADDLE_WITH_XPU_PLUGIN
    int64_t *ids_tt = RAII_GUARD.alloc_l3_or_gm<int64_t>(ids_t->numel());
Contributor:

Doesn't xpu::embedding support an int32 index?

Contributor Author:

This keeps the previous phi kernel logic: the kernel inserts a cast op to convert int32 to int64. As far as I can tell Kunlun 2 supports int32, but I'm not sure about Kunlun 1, so for compatibility I didn't change the original logic.

    r = xpu::cast<int32_t, int64_t>(
        ctx.x_context(), ids_t->data<int>(), ids_tt, ids_t->numel());
    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
    const int64_t *ids = reinterpret_cast<const int64_t *>(ids_tt);
    r = xpu::embedding<XPUType>(dev_ctx.x_context(),
                                reinterpret_cast<const XPUType *>(table),
                                ids,
                                reinterpret_cast<XPUType *>(output),
                                xm,
                                n,
                                ym,
                                padding_idx);
#else
    r = xpu::plugin::embedding_tiny_dict<XPUType, int>(
        dev_ctx.x_context(),
        reinterpret_cast<const XPUType *>(table),
        ids_t->data<int>(),
        reinterpret_cast<XPUType *>(output),
        xm,
        n,
        ym,
        padding_idx);
#endif
  }
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "embedding");
}

11 changes: 11 additions & 0 deletions paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h
@@ -104,6 +104,17 @@ DLL_EXPORT int fast_reduce_min(Context* ctx,
                               const std::vector<int>& xshape,
                               const std::vector<int>& rdims);

template <typename T, typename TID>
DLL_EXPORT int embedding_tiny_dict(Context* ctx,
                                   const T* x,
                                   const TID* indices,
                                   T* y,
                                   int64_t xm,
                                   int64_t n,
                                   int64_t ym,
                                   int64_t padding_idx,
                                   TID start_index = 0);

} // namespace plugin
} // namespace api
} // namespace xpu
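
The review thread on embedding_kernel.cc above debated folding case-specific speedups behind a single fast_embedding entry point instead of a scenario-named plugin. A minimal sketch of such a wrapper, assuming a hypothetical table-size cutoff and reusing the two call signatures shown in this PR (fast_embedding and kTinyDictRows are illustrative names, not part of the merged change):

template <typename T, typename TID>
int fast_embedding(Context* ctx,
                   const T* x,
                   const TID* indices,
                   T* y,
                   int64_t xm,
                   int64_t n,
                   int64_t ym,
                   int64_t padding_idx) {
  // Assumed cutoff for a "tiny" dictionary; the real heuristic would live
  // inside the wrapper, as suggested in review.
  constexpr int64_t kTinyDictRows = 8192;
  if (xm <= kTinyDictRows) {
    return embedding_tiny_dict<T, TID>(
        ctx, x, indices, y, xm, n, ym, padding_idx);
  }
  // Fall back to the stock XDNN embedding for larger tables.
  return xpu::embedding<T, TID>(ctx, x, indices, y, xm, n, ym, padding_idx);
}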