diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 73bd1cb5e6c76b..d23e65303bcbab 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -272,6 +272,9 @@ struct Argument {
                       TensorRtAllowBuildAtRuntime,
                       bool);
   DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_inspector_serialize,
+                      TensorRtInspectorSerialize,
+                      bool);
   DECL_ARGUMENT_FIELD(tensorrt_use_explicit_quantization,
                       TensorRtUseExplicitQuantization,
                       bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index ab3f1de01bd86e..7f65b36a7f408e 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -179,6 +179,7 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_cuda_graph",
                 new bool(argument->tensorrt_use_cuda_graph()));
       bool use_static_engine = argument->tensorrt_use_static_engine();
+      bool inspector_serialize = argument->tensorrt_inspector_serialize();
       bool model_from_memory = argument->model_from_memory();
       std::string optim_cache_dir = argument->optim_cache_dir();
       bool int8_valid = !(model_from_memory && optim_cache_dir.empty() &&
@@ -212,7 +213,8 @@
                                     optim_cache_dir));
         }
         pass->Set("model_opt_cache_dir", new std::string(optim_cache_dir));
-      } else if (use_static_engine || enable_int8 || with_dynamic_shape) {
+      } else if (use_static_engine || enable_int8 || with_dynamic_shape ||
+                 inspector_serialize) {
         std::string model_opt_cache_dir =
             argument->Has("model_dir")
                 ? argument->model_dir()
@@ -224,6 +226,8 @@
       pass->Set("use_static_engine", new bool(use_static_engine));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
       pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector()));
+      pass->Set("inspector_serialize",
+                new bool(argument->tensorrt_inspector_serialize()));

       // tuned trt dynamic_shape
       pass->Set("trt_shape_range_info_path",
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 99f0c74770cffe..6f57b7db54c912 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -569,6 +569,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   auto use_dla = Get<bool>("trt_use_dla");
   auto dla_core = Get<int>("trt_dla_core");
   auto use_inspector = Get<bool>("use_inspector");
+  auto inspector_serialize = Get<bool>("inspector_serialize");
   auto disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
   auto context_memory_sharing = Get<bool>("context_memory_sharing");
   auto enable_low_precision_io = Get<bool>("enable_low_precision_io");
@@ -592,7 +593,6 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetAttr("parameters", parameters);
   op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
   op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
-  op_desc->SetAttr("use_inspector", use_inspector);
   op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape);
   op_desc->SetAttr("enable_low_precision_io", enable_low_precision_io);

@@ -688,6 +688,16 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetAttr("context_memory_sharing", context_memory_sharing);
   std::string trt_engine_serialized_data;
   op_desc->SetAttr("engine_serialized_data", trt_engine_serialized_data);
+
+  // Where to serialize the engine info produced by the TensorRT inspector.
+  std::string engine_info_path;
+  if (inspector_serialize) {
+    engine_info_path = Get<std::string>("model_opt_cache_dir") +
+                       "engine_info_" + engine_key + ".json";
+    LOG(INFO) << "Serialize engine info to " << engine_info_path;
+  }
+  op_desc->SetAttr("use_inspector", use_inspector);
+  op_desc->SetAttr("engine_info_path", engine_info_path);
   op_desc->Flush();

   std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
@@ -739,6 +749,8 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
                          framework::ir::Agent(node).subgraph()->end());
   framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);

+  // When adding a new parameter here, set the op attribute with
+  // "op_desc->SetAttr()" first and keep tensorrt_engine_op.h in sync.
   tensorrt::TensorRTEngine::ConstructionParams params;
   params.max_batch_size = max_batch_size;
   params.max_workspace_size = workspace_size;
@@ -761,6 +773,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   params.tensorrt_transformer_maskid = tensorrt_transformer_maskid;
   params.context_memory_sharing = context_memory_sharing;
   params.use_inspector = use_inspector;
+  params.engine_info_path = engine_info_path;
   params.enable_low_precision_io = enable_low_precision_io;

   tensorrt::TensorRTEngine *trt_engine =
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index efd450f2bf5caa..8598805c01787c 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -478,6 +478,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(collect_shape_range_info_);
   CP_MEMBER(shape_range_info_path_);
   CP_MEMBER(trt_use_inspector_);
+  CP_MEMBER(trt_inspector_serialize_);
   CP_MEMBER(trt_use_explicit_quantization_);
   CP_MEMBER(trt_engine_memory_sharing_);
   CP_MEMBER(trt_engine_memory_sharing_identifier_);
@@ -841,7 +842,10 @@ void AnalysisConfig::EnableTensorRtDLA(int dla_core) {
   trt_dla_core_ = dla_core;
 }

-void AnalysisConfig::EnableTensorRtInspector() { trt_use_inspector_ = true; }
+void AnalysisConfig::EnableTensorRtInspector(bool inspector_serialize) {
+  trt_use_inspector_ = true;
+  trt_inspector_serialize_ = inspector_serialize;
+}

 void AnalysisConfig::EnableTensorRtExplicitQuantization() {
   trt_use_explicit_quantization_ = true;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 83f75c1ae07039..0f60f58c58adfe 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1405,6 +1405,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetTensorRtAllowBuildAtRuntime(
         config_.trt_allow_build_at_runtime());
     argument_->SetTensorRtUseInspector(config_.trt_use_inspector_);
+    argument_->SetTensorRtInspectorSerialize(config_.trt_inspector_serialize_);
     argument_->SetTensorRtUseExplicitQuantization(
         config_.trt_use_explicit_quantization_);
     argument_->SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 7348418d6e5629..807690b2f17b67 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -850,7 +850,7 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   /// \return bool Whether to show TensorRT inspector information.
   ///
-  void EnableTensorRtInspector();
+  void EnableTensorRtInspector(bool inspector_serialize = false);
   bool tensorrt_inspector_enabled() { return trt_use_inspector_; }

   ///
@@ -1253,6 +1253,7 @@ struct PD_INFER_DECL AnalysisConfig {
   // tune to get dynamic_shape info.
   bool trt_tuned_dynamic_shape_{false};
   bool trt_use_inspector_{false};
+  bool trt_inspector_serialize_{false};
   bool trt_use_explicit_quantization_{false};

   // In CollectShapeInfo mode, we will collect the shape information of
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 7803989aa38c5f..eb8f4ccb894a95 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -393,7 +393,7 @@ void TensorRTEngine::FreezeNetwork() {
                                   predictor_id_per_thread);
   }
   if (params_.use_inspector) {
-    GetEngineInfo();
+    GetEngineInfo(params_.engine_info_path);
   }
 }

@@ -608,7 +608,7 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
                                   predictor_id_per_thread);
   }
   if (params_.use_inspector) {
-    GetEngineInfo();
+    GetEngineInfo(params_.engine_info_path);
   }
 }

@@ -862,18 +862,34 @@ void TensorRTEngine::FreshDeviceId() {
   platform::SetDeviceId(device_id());
 }

-void TensorRTEngine::GetEngineInfo() {
+void TensorRTEngine::GetEngineInfo(const std::string &engine_info_path) {
 #if IS_TRT_VERSION_GE(8200)
-  LOG(INFO) << "====== engine info ======";
   std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
       infer_engine_->createEngineInspector());
   auto *infer_context = context();
   infer_inspector->setExecutionContext(infer_context);
-  for (int i = 0; i < infer_engine_->getNbLayers(); ++i) {
-    LOG(INFO) << infer_inspector->getLayerInformation(
-        i, nvinfer1::LayerInformationFormat::kJSON);
+  if (engine_info_path.empty()) {
+    LOG(INFO) << "====== engine info ======";
+    for (int i = 0; i < infer_engine_->getNbLayers(); ++i) {
+      LOG(INFO) << infer_inspector->getLayerInformation(
+          i, nvinfer1::LayerInformationFormat::kJSON);
+    }
+    LOG(INFO) << "====== engine info end ======";
+  } else {
+    std::fstream out_file;
+    out_file.open(engine_info_path, std::ios_base::out);
+    out_file << "[";
+    for (int i = 0; i < infer_engine_->getNbLayers(); ++i) {
+      out_file << infer_inspector->getLayerInformation(
+                      i, nvinfer1::LayerInformationFormat::kJSON)
+               << "\n";
+      if (i != infer_engine_->getNbLayers() - 1) {
+        out_file << ",";
+      }
+    }
+    out_file << "]";
+    out_file.close();
   }
-  LOG(INFO) << "====== engine info end ======";
 #else
   LOG(INFO) << "Inspector needs TensorRT version 8.2 and after.";
 #endif
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 3ad2ca30a4cff9..16f7e4903d0ce8 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -154,6 +154,7 @@ class TensorRTEngine {
     ShapeMapType optim_shape_tensor;

     bool use_inspector{false};
+    std::string engine_info_path{""};

     //
     // From tensorrt_subgraph_pass, only used for OpConverter.
@@ -531,7 +532,7 @@ class TensorRTEngine {
   // FreshDeviceId().
   void FreshDeviceId();

-  void GetEngineInfo();
+  void GetEngineInfo(const std::string& engine_info_path);

   int device_id() { return params_.device_id; }

diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 3ebf16410fe6cd..0d170eae31cfb1 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -841,6 +841,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     params.disable_trt_plugin_fp16 = Attr<bool>("disable_trt_plugin_fp16");
     params.enable_low_precision_io = Attr<bool>("enable_low_precision_io");
     params.use_inspector = Attr<bool>("use_inspector");
+    params.engine_info_path = Attr<std::string>("engine_info_path");

     if (!shape_range_info_path_.empty()) {
       inference::DeserializeShapeRangeInfo(shape_range_info_path_,
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index f6c20061de0010..23ccf702685577 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -148,6 +148,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
   engine_op_desc.SetAttr("disable_trt_plugin_fp16", false);
   engine_op_desc.SetAttr("enable_low_precision_io", false);
   engine_op_desc.SetAttr("use_inspector", false);
+  engine_op_desc.SetAttr("engine_info_path", std::string(""));
   engine_op_desc.SetAttr("use_dla", false);
   engine_op_desc.SetAttr("dla_core", 0);

@@ -297,6 +298,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("disable_trt_plugin_fp16", false);
   engine_op_desc.SetAttr("enable_low_precision_io", false);
   engine_op_desc.SetAttr("use_inspector", false);
+  engine_op_desc.SetAttr("engine_info_path", std::string(""));
   engine_op_desc.SetAttr("use_dla", false);
   engine_op_desc.SetAttr("dla_core", 0);

diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 1690d738a2c60d..aeef83c04f226a 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -919,7 +919,8 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("dla_core") = 0)
       .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled)
       .def("enable_tensorrt_inspector",
-           &AnalysisConfig::EnableTensorRtInspector)
+           &AnalysisConfig::EnableTensorRtInspector,
+           py::arg("inspector_serialize") = false)
       .def("tensorrt_inspector_enabled",
            &AnalysisConfig::tensorrt_inspector_enabled)
       .def("enable_tensorrt_explicit_quantization",
diff --git a/test/ir/inference/inference_pass_test.py b/test/ir/inference/inference_pass_test.py
index 8d8a484f77cf12..48b1603728a1be 100644
--- a/test/ir/inference/inference_pass_test.py
+++ b/test/ir/inference/inference_pass_test.py
@@ -157,7 +157,9 @@ def _get_analysis_config(
                     self.trt_parameters.use_calib_mode,
                 )
                 if self.trt_parameters.use_inspector:
-                    config.enable_tensorrt_inspector()
+                    config.enable_tensorrt_inspector(
+                        self.trt_parameters.inspector_serialize
+                    )
                     self.assertTrue(
                         config.tensorrt_inspector_enabled(),
                         "The inspector option is not set correctly.",
@@ -319,6 +321,7 @@ def __init__(
             use_static,
             use_calib_mode,
             use_inspector=False,
+            inspector_serialize=False,
         ):
             self.workspace_size = workspace_size
             self.max_batch_size = max_batch_size
@@ -327,6 +330,7 @@ def __init__(
             self.use_static = use_static
             self.use_calib_mode = use_calib_mode
             self.use_inspector = use_inspector
+            self.inspector_serialize = inspector_serialize

     class DynamicShapeParam:
         '''
diff --git a/test/ir/inference/test_trt_inspector.py b/test/ir/inference/test_trt_inspector.py
index 52d02fd1213cb4..ff118c2ee082bc 100644
--- a/test/ir/inference/test_trt_inspector.py
+++ b/test/ir/inference/test_trt_inspector.py
@@ -25,7 +25,7 @@
 from paddle.base.core import AnalysisConfig


-class TensorRTInspectorTest(InferencePassTest):
+class TensorRTInspectorTest1(InferencePassTest):
     def setUp(self):
         self.set_params()
         with base.program_guard(self.main_program, self.startup_program):
@@ -58,7 +58,7 @@ def set_params(self):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             build_engine = subprocess.run(
-                [sys.executable, 'test_trt_inspector.py', '--build-engine'],
+                [sys.executable, 'test_trt_inspector.py', '--build-engine1'],
                 stderr=subprocess.PIPE,
             )
             engine_info = build_engine.stderr.decode('ascii')
@@ -73,7 +73,65 @@ def test_check_output(self):
                 self.assertTrue('====== engine info end ======' in engine_info)
                 self.assertTrue('matmul' in engine_info)
                 self.assertTrue('"LayerType": "Scale"' in engine_info)
-                self.assertTrue('batch_norm' in engine_info)
+            else:
+                self.assertTrue(
+                    'Inspector needs TensorRT version 8.2 and after.'
+                    in engine_info
+                )
+
+
+class TensorRTInspectorTest2(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+        with base.program_guard(self.main_program, self.startup_program):
+            data = paddle.static.data(
+                name="data", shape=[1, 16, 16], dtype="float32"
+            )
+            matmul_out = paddle.matmul(
+                x=data,
+                y=data,
+                transpose_x=self.transpose_x,
+                transpose_y=self.transpose_y,
+            )
+            matmul_out = paddle.scale(matmul_out, scale=self.alpha)
+            out = paddle.static.nn.batch_norm(matmul_out, is_test=True)
+
+        self.feeds = {
+            "data": np.ones([1, 16, 16]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = InferencePassTest.TensorRTParam(
+            1 << 30,
+            1,
+            0,
+            AnalysisConfig.Precision.Float32,
+            False,
+            False,
+            True,
+            True,
+        )
+        self.fetch_list = [out]
+
+    def set_params(self):
+        self.transpose_x = True
+        self.transpose_y = True
+        self.alpha = 2.0
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            build_engine = subprocess.run(
+                [sys.executable, 'test_trt_inspector.py', '--build-engine2'],
+                stderr=subprocess.PIPE,
+            )
+            engine_info = build_engine.stderr.decode('ascii')
+            trt_compile_version = paddle.inference.get_trt_compile_version()
+            trt_runtime_version = paddle.inference.get_trt_runtime_version()
+            valid_version = (8, 2, 0)
+            if (
+                trt_compile_version >= valid_version
+                and trt_runtime_version >= valid_version
+            ):
+                self.assertTrue('Serialize engine info to' in engine_info)
             else:
                 self.assertTrue(
                     'Inspector needs TensorRT version 8.2 and after.'
                     in engine_info
@@ -82,10 +140,15 @@


 if __name__ == "__main__":
-    if '--build-engine' in sys.argv:
-        test = TensorRTInspectorTest()
-        test.setUp()
+    if '--build-engine1' in sys.argv:
+        test1 = TensorRTInspectorTest1()
+        test1.setUp()
+        use_gpu = True
+        test1.check_output_with_option(use_gpu)
+    elif '--build-engine2' in sys.argv:
+        test2 = TensorRTInspectorTest2()
+        test2.setUp()
         use_gpu = True
-        test.check_output_with_option(use_gpu)
+        test2.check_output_with_option(use_gpu)
     else:
         unittest.main()
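Usage note (not part of the patch): a minimal sketch of driving the new inspector_serialize option from the Python inference API bound above. The model files and cache directory are hypothetical placeholders; with inspector_serialize left at its default (False) the layer information is only printed to the log, and the inspector itself still requires TensorRT 8.2 or newer. When serialization is enabled, the pass change above writes the layer information to an engine_info_<engine_key>.json file derived from the model optimization cache directory.

    # Minimal usage sketch; not part of the patch. All paths are placeholders.
    import paddle.inference as paddle_infer

    config = paddle_infer.Config(
        "./sample_model/inference.pdmodel",    # hypothetical model file
        "./sample_model/inference.pdiparams",  # hypothetical params file
    )
    config.enable_use_gpu(256, 0)
    # Hypothetical cache dir; the serialized engine info is placed relative to
    # the model optimization cache (see tensorrt_subgraph_pass.cc above).
    config.set_optim_cache_dir("./sample_model/_opt_cache")
    config.enable_tensorrt_engine(
        workspace_size=1 << 30,
        max_batch_size=1,
        min_subgraph_size=3,
        precision_mode=paddle_infer.PrecisionType.Float32,
        use_static=False,
        use_calib_mode=False,
    )
    # New keyword added by this patch; defaults to False (log-only output).
    config.enable_tensorrt_inspector(inspector_serialize=True)

    predictor = paddle_infer.create_predictor(config)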