diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index 05de2abf071..9810278effe 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -61,7 +61,7 @@ add_kernel(strided_slice_compute_host Host extra SRCS strided_slice_compute.cc D add_kernel(tile_compute_host Host extra SRCS tile_compute.cc DEPS ${lite_kernel_deps}) add_kernel(topk_v2_compute_host Host extra SRCS topk_v2_compute.cc DEPS ${lite_kernel_deps}) add_kernel(fill_any_like_compute_host Host extra SRCS fill_any_like_compute.cc DEPS ${lite_kernel_deps}) - +add_kernel(tril_triu_compute_host Host extra SRCS tril_triu_compute.cc DEPS ${lite_kernel_deps}) if(LITE_BUILD_EXTRA AND LITE_WITH_x86) lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host) diff --git a/lite/kernels/host/expand_as_compute.cc b/lite/kernels/host/expand_as_compute.cc index a291068abe3..4edb9504c54 100644 --- a/lite/kernels/host/expand_as_compute.cc +++ b/lite/kernels/host/expand_as_compute.cc @@ -30,7 +30,6 @@ void ExpandAsCompute::Run() { const T* src = x->template data(); T* dst = out->template mutable_data(); - // int dims = expand_times.size(); for (int i = 0; i < target->dims().size(); ++i) { int times = target->dims()[i] / x->dims()[i]; expand_times.push_back(times); @@ -75,12 +74,29 @@ REGISTER_LITE_KERNEL(expand_as, kHost, kFloat, kAny, expand_as_float, def) {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny))}) - .BindInput("Target", + .BindInput("target_tensor", {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kFloat), + PRECISION(kAny), DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny))}) .Finalize(); + +using expand_as_int64 = + paddle::lite::kernels::host::ExpandAsCompute; +REGISTER_LITE_KERNEL(expand_as, kHost, kFloat, kAny, expand_as_int64, int64) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + 
DATALAYOUT(kAny))}) + .BindInput("target_tensor", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + DATALAYOUT(kAny))}) + .Finalize(); diff --git a/lite/kernels/host/tril_triu_compute.cc b/lite/kernels/host/tril_triu_compute.cc new file mode 100644 index 00000000000..c88dec80be0 --- /dev/null +++ b/lite/kernels/host/tril_triu_compute.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/tril_triu_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +void TrilTriu(const T* in, + const int64_t diagonal, + const bool lower, + const int64_t h, + const int64_t w, + T* out) { + int64_t size = h * w; + for (int64_t idx = 0; idx < size; idx++) { + const int64_t row = idx / w; + const int64_t col = idx % w; + const bool mask = lower ? (col - row > diagonal) : (col - row < diagonal); + out[idx] = mask ? 
0 : in[idx]; + } + return; +} + +template +void TrilTriuCompute::Run() { + auto& param = this->template Param(); + const lite::Tensor* x = param.x; + lite::Tensor* out = param.out; + int64_t diagonal = param.diagonal; + bool lower = param.lower; + + const T* x_data = x->template data(); + T* out_data = out->template mutable_data(); + auto x_dims = x->dims(); + int64_t h = x_dims[x_dims.size() - 2]; + int64_t w = x_dims[x_dims.size() - 1]; + int64_t n = x_dims.production() / h / w; + + for (int64_t i = 0; i < n; i++) { + TrilTriu(x_data, diagonal, lower, h, w, out_data); + x_data += h * w; + out_data += h * w; + } + return; +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +using TrilTriuFloat32 = paddle::lite::kernels::host::TrilTriuCompute; +REGISTER_LITE_KERNEL(tril_triu, kHost, kAny, kNCHW, TrilTriuFloat32, float32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/host/tril_triu_compute.h b/lite/kernels/host/tril_triu_compute.h new file mode 100644 index 00000000000..461beba2f1c --- /dev/null +++ b/lite/kernels/host/tril_triu_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +class TrilTriuCompute : public KernelLite { + public: + using param_t = operators::TrilTriuParam; + + void Run() override; + + virtual ~TrilTriuCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 6da165c9473..b591fb99371 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -144,6 +144,7 @@ add_operator(tensor_array_to_tensor_op extra SRCS tensor_array_to_tensor_op.cc D add_operator(expand_v2_op_lite extra SRCS expand_v2_op.cc DEPS ${op_DEPS}) add_operator(tile_op extra SRCS tile_op.cc DEPS ${op_DEPS}) add_operator(sum_op extra SRCS sum_op.cc DEPS ${op_DEPS}) +add_operator(tril_triu_op extra SRCS tril_triu_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) diff --git a/lite/operators/expand_as_op.cc b/lite/operators/expand_as_op.cc index 992e987d901..0f4203635d7 100644 --- a/lite/operators/expand_as_op.cc +++ b/lite/operators/expand_as_op.cc @@ -48,7 +48,7 @@ bool ExpandAsOpLite::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { auto Out_name = opdesc.Output("Out").front(); param_.X = GetVar(scope, X_name); param_.Out = GetMutableVar(scope, Out_name); - auto Target_name = opdesc.Input("Target").front(); + auto Target_name = opdesc.Input("target_tensor").front(); param_.Target = GetVar(scope, Target_name); return true; } diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index b1d777d1bf0..6ef08baaf53 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -646,6 +646,14 @@ struct TransposeParam : ParamBase { } }; +struct TrilTriuParam : ParamBase { + const lite::Tensor* x{nullptr}; + lite::Tensor* out{nullptr}; + + int 
diagonal{0}; + bool lower{true}; +}; + /// ----------------------- element wise operators ---------------------- struct ElementwiseParam : ParamBase { const lite::Tensor* X{}; diff --git a/lite/operators/tril_triu_op.cc b/lite/operators/tril_triu_op.cc new file mode 100644 index 00000000000..819bffe7b99 --- /dev/null +++ b/lite/operators/tril_triu_op.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/tril_triu_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool TrilTriuOp::CheckShape() const { + CHECK(param_.x); + CHECK(param_.out); + return true; +} + +bool TrilTriuOp::InferShapeImpl() const { + CHECK_GE(param_.x->dims().size(), 2UL); + param_.out->Resize(param_.x->dims()); + param_.out->set_lod(param_.x->lod()); + return true; +} + +bool TrilTriuOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + param_.x = scope->FindTensor(op_desc.Input("X").front()); + param_.out = scope->FindMutableTensor(op_desc.Output("Out").front()); + + param_.diagonal = op_desc.GetAttr("diagonal"); + param_.lower = op_desc.GetAttr("lower"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(tril_triu, paddle::lite::operators::TrilTriuOp); diff --git a/lite/operators/tril_triu_op.h b/lite/operators/tril_triu_op.h new file mode 100644 index 00000000000..90d38397f65 --- /dev/null +++ b/lite/operators/tril_triu_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class TrilTriuOp : public OpLite { + public: + TrilTriuOp() {} + explicit TrilTriuOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "tril_triu"; } + + private: + mutable TrilTriuParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 9585143448f..7bbefba8db1 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -99,6 +99,7 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_sequence_expand_as_compute SRCS sequence_expand_as_compute_test.cc DEPS ${test_kernel_deps}) lite_cc_test(test_kernel_sin_compute SRCS sin_compute_test.cc DEPS arena_framework ${test_kernel_deps}) lite_cc_test(test_kernel_cos_compute SRCS cos_compute_test.cc DEPS arena_framework ${test_kernel_deps}) + lite_cc_test(test_kernel_tril_triu_compute SRCS tril_triu_compute_test.cc DEPS arena_framework ${test_kernel_deps}) lite_cc_test(test_kernel_pad3d_compute SRCS pad3d_compute_test.cc DEPS arena_framework ${test_kernel_deps}) lite_cc_test(test_kernel_select_input_compute SRCS select_input_compute_test.cc DEPS arena_framework ${test_kernel_deps}) # lite_cc_test(test_kernel_tensor_array_to_tensor_compute SRCS tensor_array_to_tensor_compute_test.cc DEPS arena_framework ${test_kernel_deps}) diff --git a/lite/tests/kernels/expand_as_compute_test.cc b/lite/tests/kernels/expand_as_compute_test.cc index 193a8fd59e5..252aea323a2 100644 --- a/lite/tests/kernels/expand_as_compute_test.cc +++ 
b/lite/tests/kernels/expand_as_compute_test.cc @@ -16,10 +16,12 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { +template class ExpandAsComputeTester : public arena::TestCase { protected: // common attributes for this op. @@ -55,8 +57,8 @@ class ExpandAsComputeTester : public arena::TestCase { out_shape[i] *= expand_times_[i]; } out->Resize(out_shape); - float* out_data = out->mutable_data(); - const float* input_data = input->data(); + T* out_data = out->template mutable_data(); + const T* input_data = input->template data(); std::vector in_stride(in_shape.size(), 1), out_stride(out_shape.size(), 1); for (int i = in_shape.size() - 2; i >= 0; --i) { @@ -78,30 +80,49 @@ class ExpandAsComputeTester : public arena::TestCase { void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType("expand_as"); op_desc->SetInput("X", {x_}); - op_desc->SetInput("Target", {target_}); + op_desc->SetInput("target_tensor", {target_}); op_desc->SetOutput("Out", {out_}); } void PrepareData() override { - std::vector in_data(dims_.production()); - std::vector target_data(target_dims_.production()); - for (int i = 0; i < dims_.production(); ++i) { - in_data[i] = i; - } - for (int i = 0; i < target_dims_.production(); ++i) { - target_data[i] = i; - } + std::vector in_data(dims_.production()); + fill_data_rand(in_data.data(), + static_cast(-10), + static_cast(10), + dims_.production()); SetCommonTensor(x_, dims_, in_data.data()); + + std::vector target_data(target_dims_.production()); + fill_data_rand(target_data.data(), + static_cast(-10), + static_cast(10), + target_dims_.production()); SetCommonTensor(target_, target_dims_, target_data.data()); + return; } }; +template void test_expand_as_3dim(Place place, float abs_error) { + auto precision = lite_api::PrecisionTypeTrait::Type(); + std::string alias("def"); + switch (precision) { + case 
lite_api::PrecisionType::kFloat: + alias = std::string("def"); + break; + case lite_api::PrecisionType::kInt64: + alias = std::string("int64"); + break; + default: + LOG(FATAL) << "unsupported precision: " + << lite_api::PrecisionToStr(precision); + } + for (int C : {3}) { for (int H : {2}) { for (int W : {4}) { - std::unique_ptr tester(new ExpandAsComputeTester( - place, "def", DDim({C, H, W}), DDim({C * 2, H * 3, W * 1}))); + std::unique_ptr tester(new ExpandAsComputeTester( + place, alias, DDim({C, H, W}), DDim({C * 2, H * 3, W * 1}))); arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } @@ -109,16 +130,31 @@ void test_expand_as_3dim(Place place, float abs_error) { } } +template void test_expand_as_4dim(Place place, float abs_error) { + auto precision = lite_api::PrecisionTypeTrait::Type(); + std::string alias("def"); + switch (precision) { + case lite_api::PrecisionType::kFloat: + alias = std::string("def"); + break; + case lite_api::PrecisionType::kInt64: + alias = std::string("int64"); + break; + default: + LOG(FATAL) << "unsupported precision: " + << lite_api::PrecisionToStr(precision); + } + for (int N : {2}) { for (int C : {3}) { for (int H : {2}) { for (int W : {4}) { std::unique_ptr tester( - new ExpandAsComputeTester(place, - "def", - DDim({N, C, H, W}), - DDim({N * 2, C * 3, H * 1, W * 4}))); + new ExpandAsComputeTester(place, + alias, + DDim({N, C, H, W}), + DDim({N * 2, C * 3, H * 1, W * 4}))); arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } @@ -130,19 +166,17 @@ void test_expand_as_4dim(Place place, float abs_error) { TEST(ExpandAs, precision) { float abs_error = 1e-5; Place place; -#if defined(LITE_WITH_NPU) - place = TARGET(kNPU); - abs_error = 1e-2; // Using fp16 in NPU -#elif defined(LITE_WITH_ARM) - place = TARGET(kHost); -#elif defined(LITE_WITH_X86) +#if defined(LITE_WITH_ARM) || defined(LITE_WITH_X86) place = TARGET(kHost); #else return; #endif - test_expand_as_3dim(place, 
abs_error); - test_expand_as_4dim(place, abs_error); + test_expand_as_3dim(place, abs_error); + test_expand_as_4dim(place, abs_error); + + test_expand_as_3dim(place, abs_error); + test_expand_as_4dim(place, abs_error); } } // namespace lite diff --git a/lite/tests/kernels/tril_triu_compute_test.cc b/lite/tests/kernels/tril_triu_compute_test.cc new file mode 100644 index 00000000000..e88a1cb3b51 --- /dev/null +++ b/lite/tests/kernels/tril_triu_compute_test.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +template +class TrilTriuComputeTester : public arena::TestCase { + protected: + std::string x_ = "X"; + std::string out_ = "Out"; + DDim x_dims_; + int diagonal_{0}; + bool lower_{true}; + + public: + TrilTriuComputeTester(const Place& place, + const std::string& alias, + const DDim& x_dims, + const int diagonal = 0, + const bool lower = true) + : TestCase(place, alias), + x_dims_(x_dims), + diagonal_(diagonal), + lower_(lower) {} + + void RunBaseline(Scope* scope) override { + auto* out = scope->NewTensor(out_); + auto* x = scope->FindTensor(x_); + out->Resize(x_dims_); + out->set_lod(x->lod()); + + auto* x_data = x->template data(); + auto* out_data = out->template mutable_data(); + auto h = x_dims_[x_dims_.size() - 2]; + auto w = x_dims_[x_dims_.size() - 1]; + auto n = x_dims_.production() / h / w; + + for (int64_t i = 0; i < n; i++) { + for (int64_t idx = 0; idx < h * w; idx++) { + auto row = idx / w; + auto col = idx % w; + bool mask = lower_ ? (col - row > diagonal_) : (col - row < diagonal_); + out_data[idx] = mask ? 
0 : x_data[idx]; + } + x_data += h * w; + out_data += h * w; + } + return; + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("tril_triu"); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("diagonal", diagonal_); + op_desc->SetAttr("lower", lower_); + return; + } + + void PrepareData() override { + std::vector din(x_dims_.production()); + fill_data_rand(din.data(), + static_cast(-10), + static_cast(10), + x_dims_.production()); + SetCommonTensor(x_, x_dims_, din.data()); + return; + } +}; + +template +void TestTrilTriuHelper(Place place, + float abs_error, + const std::vector x_dims, + const int diagonal = 0, + const bool lower = true) { + auto precision = lite_api::PrecisionTypeTrait::Type(); + std::string alias("def"); + switch (precision) { + case lite_api::PrecisionType::kFloat: + alias = std::string("float32"); + break; + default: + LOG(FATAL) << "unsupported precision: " + << lite_api::PrecisionToStr(precision); + } + + std::unique_ptr tester(new TrilTriuComputeTester( + place, alias, DDim(x_dims), diagonal, lower)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); +} + +TEST(tril_triu, precision) { + Place place; + float abs_error = 1e-5; +#if defined(LITE_WITH_ARM) || defined(LITE_WITH_X86) + place = TARGET(kHost); +#else + return; +#endif + + for (auto x_shape : + std::vector>{{3, 4}, {5, 6, 7}, {5, 6, 7, 8}}) { + for (auto lower : {true, false}) { + for (auto diagonal : {-1, 0, 2}) { + TestTrilTriuHelper(place, abs_error, x_shape, diagonal, lower); + } + } + } +} + +} // namespace lite +} // namespace paddle