modify API for serializing engine info #56952

Merged: 12 commits, Sep 18, 2023
3 changes: 3 additions & 0 deletions paddle/fluid/inference/analysis/argument.h
@@ -272,6 +272,9 @@ struct Argument {
TensorRtAllowBuildAtRuntime,
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool);
DECL_ARGUMENT_FIELD(tensorrt_inspector_serialize,
TensorRtInspectorSerialize,
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_explicit_quantization,
TensorRtUseExplicitQuantization,
bool);
6 changes: 5 additions & 1 deletion paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -179,6 +179,7 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("use_cuda_graph",
new bool(argument->tensorrt_use_cuda_graph()));
bool use_static_engine = argument->tensorrt_use_static_engine();
bool inspector_serialize = argument->tensorrt_inspector_serialize();
bool model_from_memory = argument->model_from_memory();
std::string optim_cache_dir = argument->optim_cache_dir();
bool int8_valid = !(model_from_memory && optim_cache_dir.empty() &&
@@ -212,7 +213,8 @@ void IRPassManager::CreatePasses(Argument *argument,
optim_cache_dir));
}
pass->Set("model_opt_cache_dir", new std::string(optim_cache_dir));
} else if (use_static_engine || enable_int8 || with_dynamic_shape) {
} else if (use_static_engine || enable_int8 || with_dynamic_shape ||
inspector_serialize) {
std::string model_opt_cache_dir =
argument->Has("model_dir")
? argument->model_dir()
@@ -224,6 +226,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("use_static_engine", new bool(use_static_engine));
pass->Set("model_from_memory", new bool(argument->model_from_memory()));
pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector()));
pass->Set("inspector_serialize",
new bool(argument->tensorrt_inspector_serialize()));

// tuned trt dynamic_shape
pass->Set("trt_shape_range_info_path",
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -569,6 +569,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
auto use_dla = Get<bool>("trt_use_dla");
auto dla_core = Get<int>("trt_dla_core");
auto use_inspector = Get<bool>("use_inspector");
auto inspector_serialize = Get<bool>("inspector_serialize");
auto disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
auto context_memory_sharing = Get<bool>("context_memory_sharing");
auto enable_low_precision_io = Get<bool>("enable_low_precision_io");
@@ -592,7 +593,6 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
op_desc->SetAttr("parameters", parameters);
op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
op_desc->SetAttr("use_inspector", use_inspector);
op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape);
op_desc->SetAttr("enable_low_precision_io", enable_low_precision_io);

@@ -688,6 +688,16 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
op_desc->SetAttr("context_memory_sharing", context_memory_sharing);
std::string trt_engine_serialized_data;
op_desc->SetAttr("engine_serialized_data", trt_engine_serialized_data);

  // Serialize engine info
std::string engine_info_path;
if (inspector_serialize) {
engine_info_path = Get<std::string>("model_opt_cache_dir") +
"engine_info_" + engine_key + ".json";
LOG(INFO) << "Serialize engine info to " << engine_info_path;
}
op_desc->SetAttr("use_inspector", use_inspector);
Contributor commented:
Hasn't use_inspector already been set somewhere else?

Contributor (author) replied:
It was moved from its previous position to here so that it sits together with the inspector_serialize functionality.

op_desc->SetAttr("engine_info_path", engine_info_path);
op_desc->Flush();

std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
@@ -739,6 +749,8 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
framework::ir::Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);

  // When adding new parameters, first set a new op attribute via
  // "op_desc->SetAttr()" above and sync it to tensorrt_engine_op.h
tensorrt::TensorRTEngine::ConstructionParams params;
params.max_batch_size = max_batch_size;
params.max_workspace_size = workspace_size;
Expand All @@ -761,6 +773,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
params.tensorrt_transformer_maskid = tensorrt_transformer_maskid;
params.context_memory_sharing = context_memory_sharing;
params.use_inspector = use_inspector;
params.engine_info_path = engine_info_path;
params.enable_low_precision_io = enable_low_precision_io;

tensorrt::TensorRTEngine *trt_engine =
6 changes: 5 additions & 1 deletion paddle/fluid/inference/api/analysis_config.cc
@@ -478,6 +478,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(collect_shape_range_info_);
CP_MEMBER(shape_range_info_path_);
CP_MEMBER(trt_use_inspector_);
CP_MEMBER(trt_inspector_serialize_);
CP_MEMBER(trt_use_explicit_quantization_);
CP_MEMBER(trt_engine_memory_sharing_);
CP_MEMBER(trt_engine_memory_sharing_identifier_);
@@ -841,7 +842,10 @@ void AnalysisConfig::EnableTensorRtDLA(int dla_core) {
trt_dla_core_ = dla_core;
}

void AnalysisConfig::EnableTensorRtInspector() { trt_use_inspector_ = true; }
void AnalysisConfig::EnableTensorRtInspector(bool inspector_serialize) {
trt_use_inspector_ = true;
trt_inspector_serialize_ = inspector_serialize;
}

void AnalysisConfig::EnableTensorRtExplicitQuantization() {
trt_use_explicit_quantization_ = true;
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -1405,6 +1405,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetTensorRtAllowBuildAtRuntime(
config_.trt_allow_build_at_runtime());
argument_->SetTensorRtUseInspector(config_.trt_use_inspector_);
argument_->SetTensorRtInspectorSerialize(config_.trt_inspector_serialize_);
argument_->SetTensorRtUseExplicitQuantization(
config_.trt_use_explicit_quantization_);
argument_->SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
3 changes: 2 additions & 1 deletion paddle/fluid/inference/api/paddle_analysis_config.h
@@ -850,7 +850,7 @@ struct PD_INFER_DECL AnalysisConfig {
///
/// \return bool Whether to show TensorRT inspector information.
///
void EnableTensorRtInspector();
void EnableTensorRtInspector(bool inspector_serialize = false);
bool tensorrt_inspector_enabled() { return trt_use_inspector_; }

///
Expand Down Expand Up @@ -1253,6 +1253,7 @@ struct PD_INFER_DECL AnalysisConfig {
// tune to get dynamic_shape info.
bool trt_tuned_dynamic_shape_{false};
bool trt_use_inspector_{false};
bool trt_inspector_serialize_{false};
bool trt_use_explicit_quantization_{false};

// In CollectShapeInfo mode, we will collect the shape information of
32 changes: 24 additions & 8 deletions paddle/fluid/inference/tensorrt/engine.cc
@@ -393,7 +393,7 @@ void TensorRTEngine::FreezeNetwork() {
predictor_id_per_thread);
}
if (params_.use_inspector) {
GetEngineInfo();
GetEngineInfo(params_.engine_info_path);
}
}

@@ -608,7 +608,7 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
predictor_id_per_thread);
}
if (params_.use_inspector) {
GetEngineInfo();
GetEngineInfo(params_.engine_info_path);
}
}

@@ -862,18 +862,34 @@ void TensorRTEngine::FreshDeviceId() {
platform::SetDeviceId(device_id());
}

void TensorRTEngine::GetEngineInfo() {
void TensorRTEngine::GetEngineInfo(const std::string &engine_info_path) {
#if IS_TRT_VERSION_GE(8200)
LOG(INFO) << "====== engine info ======";
std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
infer_engine_->createEngineInspector());
auto *infer_context = context();
infer_inspector->setExecutionContext(infer_context);
for (int i = 0; i < infer_engine_->getNbLayers(); ++i) {
LOG(INFO) << infer_inspector->getLayerInformation(
i, nvinfer1::LayerInformationFormat::kJSON);
if (engine_info_path.empty()) {
LOG(INFO) << "====== engine info ======";
for (int i = 0; i < infer_engine_->getNbLayers(); ++i) {
LOG(INFO) << infer_inspector->getLayerInformation(
i, nvinfer1::LayerInformationFormat::kJSON);
}
LOG(INFO) << "====== engine info end ======";
} else {
std::fstream out_file;
out_file.open(engine_info_path, std::ios_base::out);
out_file << "[";
for (int i = 0; i < infer_engine_->getNbLayers(); ++i) {
out_file << infer_inspector->getLayerInformation(
i, nvinfer1::LayerInformationFormat::kJSON)
<< "\n";
if (i != infer_engine_->getNbLayers() - 1) {
out_file << ",";
}
}
out_file << "]";
out_file.close();
}
LOG(INFO) << "====== engine info end ======";
#else
LOG(INFO) << "Inspector needs TensorRT version 8.2 and after.";
#endif
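
Note (not part of the diff): based on the write loop above, the serialized file is a JSON array with one object per layer. A minimal readback sketch in Python follows; the file name is illustrative (the pass writes engine_info_<engine_key>.json into the model optimization cache directory), and field names other than "LayerType" (which the test below checks for) are assumptions about TensorRT's kJSON layer-information format:

import json

# Hypothetical path for illustration; the actual file is
# <model_opt_cache_dir>engine_info_<engine_key>.json.
with open("engine_info_demo.json") as f:
    layers = json.load(f)  # a list with one dict per TensorRT layer

for layer in layers:
    # "LayerType" appears in the test expectations in this PR; "Name" is an
    # assumed field of the kJSON layer-information output.
    print(layer.get("Name"), layer.get("LayerType"))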
3 changes: 2 additions & 1 deletion paddle/fluid/inference/tensorrt/engine.h
@@ -154,6 +154,7 @@ class TensorRTEngine {
ShapeMapType optim_shape_tensor;

bool use_inspector{false};
std::string engine_info_path{""};

//
// From tensorrt_subgraph_pass, only used for OpConverter.
@@ -531,7 +532,7 @@ class TensorRTEngine {
// FreshDeviceId().
void FreshDeviceId();

void GetEngineInfo();
void GetEngineInfo(const std::string& engine_info_path);

int device_id() { return params_.device_id; }

1 change: 1 addition & 0 deletions paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -841,6 +841,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
params.disable_trt_plugin_fp16 = Attr<bool>("disable_trt_plugin_fp16");
params.enable_low_precision_io = Attr<bool>("enable_low_precision_io");
params.use_inspector = Attr<bool>("use_inspector");
params.engine_info_path = Attr<std::string>("engine_info_path");

if (!shape_range_info_path_.empty()) {
inference::DeserializeShapeRangeInfo(shape_range_info_path_,
2 changes: 2 additions & 0 deletions paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -148,6 +148,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
engine_op_desc.SetAttr("disable_trt_plugin_fp16", false);
engine_op_desc.SetAttr("enable_low_precision_io", false);
engine_op_desc.SetAttr("use_inspector", false);
engine_op_desc.SetAttr("engine_info_path", std::string(""));
engine_op_desc.SetAttr("use_dla", false);
engine_op_desc.SetAttr("dla_core", 0);

@@ -297,6 +298,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
engine_op_desc.SetAttr("disable_trt_plugin_fp16", false);
engine_op_desc.SetAttr("enable_low_precision_io", false);
engine_op_desc.SetAttr("use_inspector", false);
engine_op_desc.SetAttr("engine_info_path", std::string(""));
engine_op_desc.SetAttr("use_dla", false);
engine_op_desc.SetAttr("dla_core", 0);

3 changes: 2 additions & 1 deletion paddle/fluid/pybind/inference_api.cc
@@ -919,7 +919,8 @@ void BindAnalysisConfig(py::module *m) {
py::arg("dla_core") = 0)
.def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled)
.def("enable_tensorrt_inspector",
&AnalysisConfig::EnableTensorRtInspector)
&AnalysisConfig::EnableTensorRtInspector,
py::arg("inspector_serialize") = false)
.def("tensorrt_inspector_enabled",
&AnalysisConfig::tensorrt_inspector_enabled)
.def("enable_tensorrt_explicit_quantization",
6 changes: 5 additions & 1 deletion test/ir/inference/inference_pass_test.py
@@ -157,7 +157,9 @@ def _get_analysis_config(
self.trt_parameters.use_calib_mode,
)
if self.trt_parameters.use_inspector:
config.enable_tensorrt_inspector()
config.enable_tensorrt_inspector(
self.trt_parameters.inspector_serialize
)
self.assertTrue(
config.tensorrt_inspector_enabled(),
"The inspector option is not set correctly.",
@@ -319,6 +321,7 @@ def __init__(
use_static,
use_calib_mode,
use_inspector=False,
inspector_serialize=False,
):
self.workspace_size = workspace_size
self.max_batch_size = max_batch_size
@@ -327,6 +330,7 @@
self.use_static = use_static
self.use_calib_mode = use_calib_mode
self.use_inspector = use_inspector
self.inspector_serialize = inspector_serialize

class DynamicShapeParam:
'''
77 changes: 70 additions & 7 deletions test/ir/inference/test_trt_inspector.py
@@ -25,7 +25,7 @@
from paddle.base.core import AnalysisConfig


class TensorRTInspectorTest(InferencePassTest):
class TensorRTInspectorTest1(InferencePassTest):
def setUp(self):
self.set_params()
with base.program_guard(self.main_program, self.startup_program):
@@ -58,7 +58,7 @@ def set_params(self):
def test_check_output(self):
if core.is_compiled_with_cuda():
build_engine = subprocess.run(
[sys.executable, 'test_trt_inspector.py', '--build-engine'],
[sys.executable, 'test_trt_inspector.py', '--build-engine1'],
stderr=subprocess.PIPE,
)
engine_info = build_engine.stderr.decode('ascii')
@@ -73,7 +73,65 @@ def test_check_output(self):
self.assertTrue('====== engine info end ======' in engine_info)
self.assertTrue('matmul' in engine_info)
self.assertTrue('"LayerType": "Scale"' in engine_info)
self.assertTrue('batch_norm' in engine_info)
else:
self.assertTrue(
'Inspector needs TensorRT version 8.2 and after.'
in engine_info
)


class TensorRTInspectorTest2(InferencePassTest):
def setUp(self):
self.set_params()
with base.program_guard(self.main_program, self.startup_program):
data = paddle.static.data(
name="data", shape=[1, 16, 16], dtype="float32"
)
matmul_out = paddle.matmul(
x=data,
y=data,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y,
)
matmul_out = paddle.scale(matmul_out, scale=self.alpha)
out = paddle.static.nn.batch_norm(matmul_out, is_test=True)

self.feeds = {
"data": np.ones([1, 16, 16]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = InferencePassTest.TensorRTParam(
1 << 30,
1,
0,
AnalysisConfig.Precision.Float32,
False,
False,
True,
True,
)
self.fetch_list = [out]

def set_params(self):
self.transpose_x = True
self.transpose_y = True
self.alpha = 2.0

def test_check_output(self):
if core.is_compiled_with_cuda():
build_engine = subprocess.run(
[sys.executable, 'test_trt_inspector.py', '--build-engine2'],
stderr=subprocess.PIPE,
)
engine_info = build_engine.stderr.decode('ascii')
trt_compile_version = paddle.inference.get_trt_compile_version()
trt_runtime_version = paddle.inference.get_trt_runtime_version()
valid_version = (8, 2, 0)
if (
trt_compile_version >= valid_version
and trt_runtime_version >= valid_version
):
self.assertTrue('Serialize engine info to' in engine_info)
else:
self.assertTrue(
'Inspector needs TensorRT version 8.2 and after.'
Expand All @@ -82,10 +140,15 @@ def test_check_output(self):


if __name__ == "__main__":
if '--build-engine' in sys.argv:
test = TensorRTInspectorTest()
test.setUp()
if '--build-engine1' in sys.argv:
test1 = TensorRTInspectorTest1()
test1.setUp()
use_gpu = True
test1.check_output_with_option(use_gpu)
elif '--build-engine2' in sys.argv:
test2 = TensorRTInspectorTest2()
test2.setUp()
use_gpu = True
test.check_output_with_option(use_gpu)
test2.check_output_with_option(use_gpu)
else:
unittest.main()