PaddlePaddle · Wangzheee · Dec 8, 2023 · Dec 6, 2023 · Dec 6, 2023
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
@@ -216,6 +216,23 @@ void TensorRTEngine::FreezeNetwork() {
     }
   }
 
+  if (precision() == phi::DataType::BFLOAT16) {  // BF16 requested: builder flag depends on the TensorRT version
+#if IS_TRT_VERSION_GE(9000)  // TensorRT >= 9.0: use the native BF16 builder flag
+    infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kBF16);
+    LOG(INFO) << "Run Paddle-TRT BF16 mode";
+#else  // TensorRT < 9.0: the FP16 builder flag is set instead
+    infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);  // set unconditionally, before the check below
+    bool support_fp16 = infer_builder_->platformHasFastFp16();  // result only selects which message is logged
+    if (!support_fp16) {  // NOTE(review): kFP16 stays set on this path although the log says FP32 - confirm intent
+      LOG(INFO) << "Because the version of TensorRT is less than 9.0, and the "
+                   "hardware do not support FP16, run Paddle-TRT FP32 mode";
+    } else {
+      LOG(INFO) << "Because the version of TensorRT is less than 9.0, run "
+                   "Paddle-TRT FP16 mode";
+    }
+#endif  // IS_TRT_VERSION_GE(9000)
+  }
+
   bool enable_int8 = (precision() == phi::DataType::INT8);
   if (enable_int8) {
     if (!use_dla()) {

diff --git a/test/ir/inference/test_trt_matmul.py b/test/ir/inference/test_trt_matmul.py
@@ -167,5 +167,49 @@ def test_check_output(self):
             )
 
 
+class TensorRTMatMulBroadcastBF16Test(InferencePassTest):  # matmul -> scale -> batch_norm graph checked with TRT Bfloat16 precision
+    def setUp(self):  # builds the static program, the feed data and the TRT parameters
+        self.set_params()  # must run first: the graph below reads transpose_x/transpose_y/alpha
+        place = base.CPUPlace()  # NOTE(review): `place` is not used again in this method
+        with base.program_guard(self.main_program, self.startup_program):
+            data_x = paddle.static.data(
+                name="data_x", shape=[-1, 6, 24], dtype="float32"  # 3-D input, leading dim declared as -1
+            )
+            data_y = paddle.static.data(
+                name="data_y", shape=[24, 16], dtype="float32"  # 2-D input; its 24 rows match data_x's last dim
+            )
+            matmul_out = paddle.matmul(  # 3-D x against 2-D y
+                x=data_x,
+                y=data_y,
+                transpose_x=self.transpose_x,
+                transpose_y=self.transpose_y,
+            )
+            matmul_out = paddle.scale(matmul_out, scale=self.alpha)  # alpha comes from set_params()
+            out = nn.batch_norm(matmul_out, is_test=True)  # batch_norm built with is_test=True
+
+        self.feeds = {  # all-ones inputs; data_x is fed with 2 as its leading dim
+            "data_x": np.ones([2, 6, 24]).astype("float32"),
+            "data_y": np.ones([24, 16]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTMatMulBroadcastTest.TensorRTParam(  # NOTE(review): reached via a class defined outside this hunk
+            1 << 30, 32, 0, AnalysisConfig.Precision.Bfloat16, False, False  # presumably workspace size, max batch, min subgraph size, precision, use_static, use_calib_mode - confirm
+        )
+        self.fetch_list = [out]  # only the batch_norm output is fetched
+
+    def set_params(self):  # matmul/scale settings consumed by setUp()
+        self.transpose_x = False
+        self.transpose_y = False
+        self.alpha = 1.0  # passed as `scale` to paddle.scale in setUp()
+
+    def test_check_output(self):  # body is skipped entirely when Paddle is not compiled with CUDA
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)  # helper defined outside this view, presumably on InferencePassTest
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')  # asserts pass-version compatibility for the TRT subgraph pass
+            )
+
+
 if __name__ == "__main__":
     unittest.main()