diff --git a/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu
index 2ac6442967b38e..1e5b07d621b429 100644
--- a/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu
@@ -15,20 +15,23 @@
 #include "paddle/phi/kernels/label_smooth_grad_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 
 namespace phi {
 template <typename T>
 struct LabelSmoothGradFunctor {
-  T epsilon;
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  MT epsilon;
 
   __forceinline__ LabelSmoothGradFunctor(float epsilon_data) {
-    epsilon = static_cast<T>(epsilon_data);
+    epsilon = static_cast<MT>(epsilon_data);
   }
 
   __device__ __forceinline__ T operator()(const T x) const {
-    return static_cast<T>(1 - epsilon) * x;
+    return static_cast<T>((1 - epsilon) * static_cast<MT>(x));
   }
 };
 
@@ -52,4 +55,5 @@ PD_REGISTER_KERNEL(label_smooth_grad,
                    ALL_LAYOUT,
                    phi::LabelSmoothGradKernel,
                    float,
-                   double) {}
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/label_smooth_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_kernel.cu
index ff2fff4445174a..b56276dcd76542 100644
--- a/paddle/phi/kernels/gpu/label_smooth_kernel.cu
+++ b/paddle/phi/kernels/gpu/label_smooth_kernel.cu
@@ -17,6 +17,8 @@
 #include <vector>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
@@ -24,17 +26,18 @@ namespace phi {
 
 template <typename T>
 struct LabelSmoothFunctor {
-  T epsilon;
-  T label_dim;
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  MT epsilon;
+  MT label_dim;
 
   __forceinline__ LabelSmoothFunctor(float epsilon_data, int label_dim_data) {
-    epsilon = static_cast<T>(epsilon_data);
-    label_dim = static_cast<T>(label_dim_data);
+    epsilon = static_cast<MT>(epsilon_data);
+    label_dim = static_cast<MT>(label_dim_data);
   }
 
   __device__ __forceinline__ T operator()(const T x) const {
-    return (static_cast<T>(1 - epsilon) * x +
-            static_cast<T>(epsilon / label_dim));
+    return static_cast<T>((1 - epsilon) * static_cast<MT>(x) +
+                          epsilon / label_dim);
   }
 };
 
@@ -46,9 +49,11 @@ __global__ void LabelSmoothRunDistKernel(const int N,
                                          const T* dist_data,
                                          T* dst) {
   CUDA_KERNEL_LOOP(idx, N) {
+    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
     int dist_idx = idx % dist_numel;
-    dst[idx] = static_cast<T>(1 - epsilon) * src[idx] +
-               static_cast<T>(epsilon) * dist_data[dist_idx];
+    dst[idx] = static_cast<T>(
+        static_cast<MT>((1 - epsilon) * static_cast<MT>(src[idx])) +
+        static_cast<MT>(epsilon) * static_cast<MT>(dist_data[dist_idx]));
   }
 }
 
@@ -83,5 +88,10 @@ void LabelSmoothKernel(const Context& ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(
-    label_smooth, GPU, ALL_LAYOUT, phi::LabelSmoothKernel, float, double) {}
+PD_REGISTER_KERNEL(label_smooth,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LabelSmoothKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
index b62a75438a7fd4..8cff0bfa3a455b 100644
--- a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
+++ b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
@@ -18,15 +18,19 @@
 from op_test import OpTest
 
 import paddle
+import paddle.fluid.core as core
 
 
 class TestLabelSmoothOp(OpTest):
     def config(self):
+        self.init_data_type()
         self.op_type = "label_smooth"
         self.python_api = paddle.nn.functional.label_smooth
         self.epsilon = 0.1
         batch_size, self.label_dim = 10, 12
-        self.label = np.zeros((batch_size, self.label_dim)).astype("float64")
+        self.label = np.zeros((batch_size, self.label_dim)).astype(
+            self.data_type
+        )
         nonzero_index = np.random.randint(self.label_dim, size=(batch_size))
         self.label[np.arange(batch_size), nonzero_index] = 1
 
@@ -39,6 +43,9 @@ def setUp(self):
         self.attrs = {'epsilon': self.epsilon}
         self.outputs = {'Out': smoothed_label}
 
+    def init_data_type(self):
+        self.data_type = "float64"
+
     def test_check_output(self):
         self.check_output(check_eager=True)
 
@@ -78,6 +85,42 @@ def setUp(self):
         )
 
 
+class TestLabelSmoothFP16(TestLabelSmoothOp):
+    def init_data_type(self):
+        self.data_type = "float16"
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out', max_relative_error=1e-3)
+
+
+class TestLabelSmoothStaticFP16(unittest.TestCase):
+    def test_fp16(self):
+        paddle.enable_static()
+
+        epsilon = 0.1
+        shape = [10, 12]
+        batch_size, label_dim = shape
+        label = np.zeros((batch_size, label_dim)).astype("float16")
+        nonzero_index = np.random.randint(label_dim, size=(batch_size))
+        label[np.arange(batch_size), nonzero_index] = 1
+        output_np = (1 - epsilon) * label + epsilon / label_dim
+
+        with paddle.static.program_guard(paddle.static.Program()):
+            x_data = paddle.static.data(shape=shape, name='x', dtype='float16')
+            out = paddle.nn.functional.label_smooth(x_data, epsilon=epsilon)
+            if core.is_compiled_with_cuda():
+                place = paddle.CUDAPlace(0)
+                exe = paddle.static.Executor(place)
+                exe.run(paddle.static.default_startup_program())
+                output_pd = exe.run(feed={'x': label}, fetch_list=[out])[0]
+                np.testing.assert_allclose(
+                    output_pd, output_np, rtol=1e-3, atol=1e-3
+                )
+
+        paddle.disable_static()
+
+
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index b957eb369ad8ab..7fb130ab8cdbbc 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -1923,7 +1923,8 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None):
         label(Tensor): The input variable containing the label data. The
                         label data should use one-hot representation. It's
                         a multidimensional tensor with a shape of
-                        :math:`[N_1, ..., Depth]`, where Depth is class number. The dtype can be "float32" and "float64".
+                        :math:`[N_1, ..., Depth]`, where Depth is class number.
+                        The dtype can be "float16", "float32" and "float64".
         prior_dist(Tensor, optional): The prior distribution to be used to smooth
                         labels. If not provided, an uniform distribution
                         is used. It's a multidimensional tensor with a shape of
@@ -1965,7 +1966,7 @@
     )
 
     check_variable_and_dtype(
-        label, 'label', ['float32', 'float64'], 'label_smooth'
+        label, 'label', ['float16', 'float32', 'float64'], 'label_smooth'
     )
 
     helper = LayerHelper("label_smooth", **locals())
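
For anyone who wants to exercise the new dtype end to end, here is a minimal dygraph smoke test. It is not part of this PR and assumes a CUDA build of Paddle, since the float16 kernels above are registered for GPU only:

```python
# Minimal sketch (not in this PR): exercise the new fp16 label_smooth path.
import numpy as np
import paddle

paddle.set_device('gpu')  # the float16 kernel is GPU-only

epsilon, label_dim = 0.1, 12
# One-hot float16 labels, mirroring the unit tests above.
label = np.zeros((10, label_dim)).astype('float16')
label[np.arange(10), np.random.randint(label_dim, size=10)] = 1

out = paddle.nn.functional.label_smooth(
    paddle.to_tensor(label), epsilon=epsilon
)

# Reference result in numpy: (1 - eps) * y + eps / num_classes.
expected = (1 - epsilon) * label + epsilon / label_dim
np.testing.assert_allclose(out.numpy(), expected, rtol=1e-3, atol=1e-3)
```

Note that `MPTypeTrait<phi::dtype::float16>::Type` resolves to `float`, so the functors above perform the smoothing arithmetic in fp32 and cast back to fp16 once per element; the looser `1e-3` tolerances in the tests reflect that single rounding step.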