diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc
deleted file mode 100644
index b1a8e81a68add1..00000000000000
--- a/paddle/fluid/operators/kldiv_loss_op.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/infermeta/binary.h"
-
-namespace paddle {
-namespace operators {
-
-class KLDivLossOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-                          ctx.GetPlace());
-  }
-};
-
-class KLDivLossOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of KL divergence loss operator. "
-             "This is a tensor with shape of [N, *], where N is the "
-             "batch size, * means any number of additional dimensions. "
-             "The data type is float32 or flaot64");
-    AddInput("Target",
-             "The tensor of KL divergence loss operator. "
-             "This is a tensor with shape of Input(X). "
-             "The data type is same as Input(X)");
-    AddOutput(
-        "Loss",
-        "The output KL divergence loss tensor. if Attr(reduction) is "
-        "'none', this tensor should be in same shape of of Input(X), else "
-        "this tensor should be in shape of [1].");
-
-    AddAttr<std::string>(
-        "reduction",
-        "The reduction type to apply to the output, available types "
-        "are 'none' | 'batchmean' | 'mean' | 'sum', 'none' for no "
-        "reduction, 'batchmean' for the sum of output divided by "
-        "batch size, 'mean' for the average value of all output, "
-        "'sum' for the sum of the output.")
-        .SetDefault("mean");
-
-    AddComment(R"DOC(
-         This operator calculates the Kullback-Leibler divergence loss
-         between Input(X) and Input(Target). Notes that Input(X) is the
-         log-probability and Input(Target) is the probability.
-
-         KL divergence loss is calculated as follows:
-
-         $$l(x, y) = y * (\log(y) - x)$$
-
-         While :math:`x` is Input(X) and :math:`y` is Input(Target).
-
-         While :attr:`reduction` is :attr:`none`, output loss is in
-         the same shape as Input(X), loss in each point is calculated
-         separately and no reduction is applied.
-
-         While :attr:`reduction` is :attr:`mean`, output loss is in
-         shape of [1] and loss value is the mean value of all losses.
-
-         While :attr:`reduction` is :attr:`sum`, output loss is in
-         shape of [1] and loss value is the sum value of all losses.
-
-         While :attr:`reduction` is :attr:`batchmean`, output loss is
-         in shape of [1] and loss value is the sum value of all losses
-         divided by batch size.
-
-         )DOC");
-  }
-};
-
-class KLDivLossOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLossGrad");
-    OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLossGrad");
-    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Loss")),
-                   "Input",
-                   "Loss@GRAD",
-                   "KLDivLossGrad");
-    auto dim_x = ctx->GetInputDim("X");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
-    }
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
-                              ctx, framework::GradVarName("Loss")),
-                          ctx.GetPlace());
-  }
-};
-
-template <typename T>
-class KLDivLossOpGradMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("kldiv_loss_grad");
-    op->SetInput("X", this->Input("X"));
-    op->SetInput("Target", this->Input("Target"));
-    op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
-
-    op->SetAttrMap(this->Attrs());
-
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss,
-                            KLDivInferShapeFunctor,
-                            PD_INFER_META(phi::KLDivInferMeta));
-
-REGISTER_OPERATOR(kldiv_loss,
-                  ops::KLDivLossOp,
-                  ops::KLDivLossOpMaker,
-                  ops::KLDivLossOpGradMaker<paddle::framework::OpDesc>,
-                  ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>,
-                  KLDivInferShapeFunctor);
-REGISTER_OPERATOR(kldiv_loss_grad,
-                  ops::KLDivLossOpGrad,
-                  ops::KLDivLossGradNoNeedBufferVarInferer);
diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml
index c7d2bd40816de6..062c98f445b1be 100644
--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml
@@ -744,6 +744,17 @@
   kernel :
     func : inverse_grad
 
+- backward_op : kldiv_loss_grad
+  forward : kldiv_loss(Tensor x, Tensor label, str reduction="mean") -> Tensor(out)
+  args : (Tensor x, Tensor label, Tensor out_grad, str reduction)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param: [x]
+  kernel :
+    func : kldiv_loss_grad
+  no_need_buffer : x
+
 - backward_op : kron_grad
   forward : kron (Tensor x, Tensor y) -> Tensor(out)
   args : (Tensor x, Tensor y, Tensor out_grad)
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index 6f6da65453362a..3b2a3f4b6d6d70 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -597,17 +597,6 @@
   optional : scale
   backward : instance_norm_double_grad
 
-- backward_op : kldiv_loss_grad
-  forward : kldiv_loss(Tensor x, Tensor label, str reduction) -> Tensor(out)
-  args : (Tensor x, Tensor label, Tensor out_grad, str reduction)
-  output : Tensor(x_grad)
-  infer_meta :
-    func : UnchangedInferMeta
-    param: [x]
-  kernel :
-    func : kldiv_loss_grad
-  no_need_buffer : x
-
 - backward_op : layer_norm_grad
   forward : layer_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis) -> Tensor(out), Tensor(mean), Tensor(variance)
   args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, Tensor out_grad, float epsilon, int begin_norm_axis)
diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index f399ab6e577399..2b1a0c12348b42 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -853,16 +853,6 @@
   intermediate : saved_mean, saved_variance
   backward : instance_norm_grad
 
-- op : kldiv_loss
-  args : (Tensor x, Tensor label, str reduction)
-  output : Tensor(out)
-  infer_meta :
-    func : KLDivInferMeta
-  kernel :
-    func : kldiv_loss
-    data_type : x
-  backward : kldiv_loss_grad
-
 - op : layer_norm
   args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis)
   output : Tensor(out), Tensor(mean), Tensor(variance)
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 302d0ce4025b9b..7e470b5c2dae77 100644
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -964,6 +964,13 @@
   outputs :
     out : Out
 
+- op : kldiv_loss
+  backward : kldiv_loss_grad
+  inputs :
+    {x : X, label : Target}
+  outputs :
+    out : Loss
+
 - op : kron
   backward : kron_grad
   inputs :
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 03e2e3aef0d93d..b0510948d53d5a 100644
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -775,6 +775,16 @@
     func : isnan {dense -> dense},
           isnan_sr {selected_rows -> selected_rows}
 
+- op : kldiv_loss
+  args : (Tensor x, Tensor label, str reduction = "mean")
+  output : Tensor(out)
+  infer_meta :
+    func : KLDivInferMeta
+  kernel :
+    func : kldiv_loss
+    data_type : x
+  backward : kldiv_loss_grad
+
 - op : kron
   args : (Tensor x, Tensor y)
   output : Tensor
diff --git a/paddle/phi/ops/compat/kldiv_loss_sig.cc b/paddle/phi/ops/compat/kldiv_loss_sig.cc
deleted file mode 100644
index 8af0edd3164874..00000000000000
--- a/paddle/phi/ops/compat/kldiv_loss_sig.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature KLDivLossGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("kldiv_loss_grad",
-                         {"X", "Target", "Loss@GRAD"},
-                         {"reduction"},
-                         {"X@GRAD"});
-}
-
-}  // namespace phi
-PD_REGISTER_ARG_MAPPING_FN(kldiv_loss_grad,
-                           phi::KLDivLossGradOpArgumentMapping);
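
Note on the semantics this migration preserves: the deleted OpMaker doc defines the pointwise loss as l(x, y) = y * (log(y) - x), where X holds log-probabilities and Target (now "label" in the YAML signature) holds probabilities, reduced by one of 'none' | 'batchmean' | 'mean' | 'sum'; the only visible signature change is that `reduction` now carries an explicit default of "mean" in ops.yaml, matching the old attribute's SetDefault("mean"). The sketch below is a minimal standalone illustration of those semantics, not code from this patch or from Paddle's kernels: the name kldiv_loss_reference, the use of double, and the guard emitting 0 where the target probability is 0 are assumptions of the sketch.

// Illustration only: reference semantics of kldiv_loss as described in the
// deleted OpMaker doc. Assumes x holds log-probabilities, y probabilities,
// and that points where y == 0 contribute 0 to the loss.
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

std::vector<double> kldiv_loss_reference(const std::vector<double>& x,
                                         const std::vector<double>& y,
                                         std::size_t batch_size,
                                         const std::string& reduction) {
  assert(x.size() == y.size());
  // Pointwise loss: l(x, y) = y * (log(y) - x).
  std::vector<double> loss(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    loss[i] = (y[i] > 0.0) ? y[i] * (std::log(y[i]) - x[i]) : 0.0;
  }
  if (reduction == "none") return loss;  // same shape as Input(X)

  double sum = 0.0;
  for (double v : loss) sum += v;
  if (reduction == "sum") return {sum};                                  // shape [1]
  if (reduction == "mean") return {sum / static_cast<double>(loss.size())};
  if (reduction == "batchmean") return {sum / static_cast<double>(batch_size)};
  assert(false && "reduction must be none|batchmean|mean|sum");
  return {};
}

int main() {
  // Two target distributions over three classes (batch size 2), against a
  // uniform log-probability prediction.
  std::vector<double> target = {0.5, 0.5, 0.0, 0.25, 0.25, 0.5};
  std::vector<double> log_pred(target.size(), std::log(1.0 / 3.0));
  for (const char* r : {"none", "batchmean", "mean", "sum"}) {
    std::vector<double> out = kldiv_loss_reference(log_pred, target, 2, r);
    std::printf("%-10s -> %zu value(s), first = %f\n", r, out.size(), out[0]);
  }
  return 0;
}

The pointwise formula also explains the `no_need_buffer : x` entry in backward.yaml: since dl/dx = -y, the gradient kernel needs only `label` and `out_grad`, and `x` stays in the backward args purely so `UnchangedInferMeta` with `param: [x]` can propagate its shape to `x_grad`.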