Skip to content

Commit

Permalink
fix(//core/partitioning): Fixing support for partially compiling
Browse files Browse the repository at this point in the history
graphs with FP16 weights

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
  • Loading branch information
narendasan committed Oct 19, 2021
1 parent 8927e77 commit 748ecf3
Show file tree
Hide file tree
Showing 63 changed files with 791 additions and 593 deletions.
148 changes: 72 additions & 76 deletions core/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,22 +128,6 @@ bool CheckMethodOperatorSupport(const torch::jit::script::Module& mod, std::stri
return conversion::VerifyConverterSupportForBlock(g->block());
}

/// Lowers the graph of `method_name` in `mod` and converts the whole lowered
/// block into a single serialized TensorRT engine.
///
/// @param mod Module that owns the method to convert.
/// @param method_name Name of the method whose graph is lowered and converted.
/// @param cfg Compile settings; `cfg.lower_info` is handed to lowering and
///        `cfg.convert_info` is moved out and handed to conversion.
/// @return The engine string produced by conversion::ConvertBlockToEngine.
std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
  // Go through Lowering to simplify graph and extract weight parameters
  auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);

  auto convert_cfg = std::move(cfg.convert_info);
  auto g = graph_and_parameters.first;

  // Pair the lowered graph's inputs with the parameter values returned by lowering.
  auto params = graph_and_parameters.second;
  auto named_params = conversion::get_named_params(g->inputs(), params);

  LOG_INFO(*g << "(CompileGraph)\n");

  auto engine = conversion::ConvertBlockToEngine(g->block(), convert_cfg, named_params);
  // Return the local by name: wrapping it in std::move would inhibit copy
  // elision and is redundant, since a returned local is already moved.
  return engine;
}

void AddSegmentedBlockToGraph(
std::shared_ptr<torch::jit::Graph>& g,
partitioning::SegmentedBlock& seg,
Expand Down Expand Up @@ -237,15 +221,15 @@ void AddIfBlockToGraph(
GraphAndMapping ConstructFallbackGraph(
torch::jit::script::Module& new_mod,
torch::jit::Block* block,
std::unordered_map<torch::jit::Value*, torch::jit::IValue> input_ivalues_map,
std::unordered_map<const torch::jit::Value*, torch::jit::IValue> example_tensor_map,
CompileSpec cfg,
conversion::GraphParams named_params) {
ir::StaticParams static_params) {
auto convert_cfg = cfg.convert_info;
auto partition_info = cfg.partition_info;

auto new_g = std::make_shared<torch::jit::Graph>();

auto segmented_blocks = partitioning::Partition(block, input_ivalues_map, partition_info);
auto segmented_blocks = partitioning::Partition(block, example_tensor_map, partition_info);

// the mapping from lowering graph => fallback global graph
std::unordered_map<torch::jit::Value*, torch::jit::Value*> old_to_new_g;
Expand All @@ -259,13 +243,17 @@ GraphAndMapping ConstructFallbackGraph(
trt_engine_id << reinterpret_cast<const int*>(&seg_block);

if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) {
auto shapes = seg_block.in_shapes();
auto types = seg_block.in_types();
std::vector<ir::Input> inputs;
for (auto& shape : seg_block.in_shape()) {
inputs.push_back(ir::Input(shape));
for (size_t i = 0; i < shapes.size(); i++) {
auto in = ir::Input(shapes[i]);
in.dtype = util::ScalarTypeToTRTDataType(types[i]);
inputs.push_back(in);
}
// update the input ranges for each segments
convert_cfg.inputs = inputs;
auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params);
convert_cfg.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params);
auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, static_params);
auto temp_g = std::make_shared<torch::jit::Graph>();
auto device_spec = convert_cfg.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
Expand All @@ -281,7 +269,7 @@ GraphAndMapping ConstructFallbackGraph(
std::vector<GraphAndMapping> graph_and_mappings;
for (auto cur_block : if_node->blocks()) {
graph_and_mappings.push_back(
ConstructFallbackGraph(new_mod, cur_block, input_ivalues_map, cfg, named_params));
ConstructFallbackGraph(new_mod, cur_block, example_tensor_map, cfg, static_params));
}
AddIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g);

Expand All @@ -299,54 +287,28 @@ GraphAndMapping ConstructFallbackGraph(
return {new_g, old_to_new_g};
}

torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Module& mod, CompileSpec cfg) {
// TODO: Should be doing a functional transform but need PR #31978
// [jit] More robust mangling
// torch::jit::script::Module new_mod = mod.clone();
torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
for (const torch::jit::script::Method& method : mod.get_methods()) {
// Compile only forward methods. forward method contains the entire graph.
if (method.name().compare("forward") == 0) {
auto new_g = std::make_shared<torch::jit::Graph>();
auto graph_and_parameters = lowering::Lower(mod, method.name(), cfg.lower_info);
std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
// Go through Lowering to simplify graph and extract weight parameters
auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);

auto g = graph_and_parameters.first;
auto params = graph_and_parameters.second;
auto named_params = conversion::get_named_params(g->inputs(), params);
LOG_INFO("(LoweredGraph)\n" << *g);
auto convert_cfg = std::move(cfg.convert_info);
auto g = graph_and_parameters.first;

std::unordered_map<torch::jit::Value*, ir::Input> inputs;
for (size_t i = 0; i < g->inputs().size(); ++i) {
inputs.insert({g->inputs()[i], cfg.convert_info.inputs[i]});
}
auto input_ivalues_map = partitioning::generateRandomInputs(inputs);
auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, named_params);
new_g = graph_and_mapping.first;
LOG_INFO("(FallbackGraph)\n" << *new_g);
auto params = graph_and_parameters.second;
auto static_params = ir::get_static_params(g->inputs(), params);

// if there is no tensorrt engine self in fallback graph, there is no conversion, we just return the initial
// module
if (new_g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) {
LOG_WARNING("Didn't generate any TensorRT engines, the compiler did nothing\n");
return mod;
}
LOG_INFO(*g << "(CompileGraph)\n");

auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
new_mod.type()->addMethod(new_method);
new_method->setSchema(schema);
}
}
// Move the user defined inputs to the convert_cfg since some might be static;
convert_cfg.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params));

return new_mod;
auto engine = conversion::ConvertBlockToEngine(g->block(), convert_cfg, static_params);
return std::move(engine);
}

torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, CompileSpec cfg) {
// TODO: not sure how to deal with duplicated code here, so just cut out a branch temporally
if (cfg.partition_info.enabled) {
return CompileGraphWithFallback(mod, cfg);
}
torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");

auto device_spec = cfg.convert_info.engine_settings.device;

// GPU default WS size : 1 GB
Expand All @@ -362,25 +324,59 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
}
}

// TODO: Should be doing a functional transform but need PR #31978
// [jit] More robust mangling
// torch::jit::script::Module new_mod = mod.clone();
torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
for (const torch::jit::script::Method& method : mod.get_methods()) {
// Compile only forward methods. forward method contains the entire graph.
for (const torch::jit::Method& method : mod.get_methods()) {
if (method.name().compare("forward") == 0) {
auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
auto new_g = std::make_shared<torch::jit::Graph>();
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
AddEngineToGraph(new_mod, new_g, engine, cuda_device);

auto graph_and_parameters = lowering::Lower(mod, method.name(), cfg.lower_info);

auto g = graph_and_parameters.first;
LOG_INFO("Lowered Graph: " << *g);
auto params = graph_and_parameters.second;
auto static_params = ir::get_static_params(g->inputs(), params);

cfg.convert_info.inputs = std::move(ir::associate_specs_with_inputs(g, cfg.inputs, static_params));

// If the user did not explicitly set the input type, then use the first
// tensor calculation to infer type.
auto first_use_types = util::get_block_first_calc_dtypes_opt(g->block());
for (auto& in : g->inputs()) {
auto est_type_opt = first_use_types[in];
ir::Input& spec = cfg.convert_info.inputs.find(in)->second;
if (est_type_opt && !spec.dtype_is_user_defined) {
spec.dtype = util::ScalarTypeToTRTDataType(est_type_opt.value());
} else if (!est_type_opt && !spec.dtype_is_user_defined) {
LOG_WARNING(
"Cannot deterime input type from calcuations in graph for input "
<< in->debugName() << ". Assuming it is Float32. If not, specify input type explicity");
spec.dtype = nvinfer1::DataType::kFLOAT;
}
}

if (cfg.partition_info.enabled) {
auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types);
auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params);
new_g = graph_and_mapping.first;
LOG_INFO("Segmented Graph: " << *new_g);

// if there is no tensorrt engine self in fallback graph, there is no conversion, we just return the initial
// module
if (new_g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) {
LOG_WARNING("Didn't generate any TensorRT engines, the compiler did nothing\n");
return mod;
}
} else {
auto engine = conversion::ConvertBlockToEngine(g->block(), cfg.convert_info, static_params);
auto device_spec = cfg.convert_info.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
AddEngineToGraph(new_mod, new_g, engine, cuda_device);
}
auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
new_mod.type()->addMethod(new_method);
new_method->setSchema(schema);
}
}

return new_mod;
}

Expand Down
3 changes: 2 additions & 1 deletion core/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ namespace trtorch {
namespace core {

struct CompileSpec {
CompileSpec(std::vector<ir::Input> inputs) : convert_info(std::move(inputs)) {}
CompileSpec(std::vector<ir::Input> inputs) : inputs(inputs) {}
std::vector<ir::Input> inputs;
conversion::ConversionInfo convert_info;
lowering::LowerInfo lower_info;
partitioning::PartitionInfo partition_info;
Expand Down
1 change: 0 additions & 1 deletion core/conversion/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ config_setting(
cc_library(
name = "conversion",
srcs = [
"InterfaceTypes.cpp",
"conversion.cpp",
"conversion_ignorelist.cpp",
],
Expand Down
41 changes: 23 additions & 18 deletions core/conversion/conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,10 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) {
<< "please report this error to https://www.github.com/NVIDIA/TRTorch/issues");
}

void AddInputs(ConversionCtx* ctx, at::ArrayRef<const torch::jit::Value*> inputs, std::vector<ir::Input>& input_specs) {
void AddInputs(
ConversionCtx* ctx,
c10::ArrayRef<const torch::jit::Value*> inputs,
std::unordered_map<const torch::jit::Value*, ir::Input>& input_specs) {
std::vector<const torch::jit::Value*> input_tensors;
for (auto in : inputs) {
// Disregarding inputs that are not tensors
Expand All @@ -143,24 +146,23 @@ void AddInputs(ConversionCtx* ctx, at::ArrayRef<const torch::jit::Value*> inputs
}

std::stringstream ss;
ss << "Input Dimension Specs: [\n";
ss << "Input Dimension Specs: {" << std::endl;
for (auto i : input_specs) {
ss << " " << i << ",";
ss << " " << i.first->debugName() << " : " << i.second << ",";
}
ss << ']';
LOG_DEBUG(ctx->logger, ss.str());

TRTORCH_CHECK(
input_tensors.size() == input_specs.size(),
"Expected dimension specifications for all input tensors"
<< ", but found " << input_tensors.size() << " input tensors and " << input_specs.size()
<< " dimension specs (conversion.AddInputs)");
ss << '}';
auto dbg_str = ss.str();
LOG_DEBUG(ctx->logger, dbg_str);

auto profile = ctx->builder->createOptimizationProfile();

for (size_t i = 0; i < input_tensors.size(); i++) {
auto in = input_tensors[i];
auto spec = input_specs[i];
for (auto input : input_tensors) {
const torch::jit::Value* in = input;
TRTORCH_CHECK(
input_specs.find(in) != input_specs.end(),
"Cannot find an input spec associated with input: " << in->debugName());
ir::Input& spec = input_specs.find(in)->second;

std::string name = std::string("input_") + std::to_string(ctx->num_inputs);
LOG_INFO(
ctx->logger,
Expand Down Expand Up @@ -226,7 +228,7 @@ void MarkOutputs(ConversionCtx* ctx, at::ArrayRef<const torch::jit::Value*> outp
}
}

void AddParamsToCtxValueMap(ConversionCtx* ctx, GraphParams& params) {
void AddParamsToCtxValueMap(ConversionCtx* ctx, ir::StaticParams& params) {
for (auto p : params) {
ctx->evaluated_value_map[p.first] = std::move(p.second);
}
Expand Down Expand Up @@ -358,8 +360,8 @@ void EvaluateLoopBlock(ConversionCtx* ctx, const torch::jit::Node* n) {
void ConvertBlockToNetDef(
ConversionCtx* ctx,
const torch::jit::Block* b,
ConversionInfo build_info,
GraphParams& static_params) {
ConversionInfo& build_info,
ir::StaticParams& static_params) {
LOG_INFO(ctx->logger, "Converting Block");

auto inputs = b->inputs();
Expand Down Expand Up @@ -435,7 +437,10 @@ void ConvertBlockToNetDef(
// a serialized TensorRT engine that can be deserialized and run

// Probably should consolidate these two functions
std::string ConvertBlockToEngine(const torch::jit::Block* b, ConversionInfo build_info, GraphParams& static_params) {
std::string ConvertBlockToEngine(
const torch::jit::Block* b,
ConversionInfo build_info,
ir::StaticParams& static_params) {
ConversionCtx ctx(build_info.engine_settings);
ConvertBlockToNetDef(&ctx, b, build_info, static_params);
std::string engine = ctx.SerializeEngine();
Expand Down
14 changes: 5 additions & 9 deletions core/conversion/conversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,16 @@ namespace core {
namespace conversion {

struct ConversionInfo {
std::vector<ir::Input> inputs;
ir::InputSpecMap inputs;
BuilderSettings engine_settings;
ConversionInfo(std::vector<ir::Input> inputs) : inputs(std::move(inputs)), engine_settings(BuilderSettings()) {}
};

// TODO: REMOVE GRAPH AND PARAMS AND MOVE FULLY TO INLINED CONSTANTS

using GraphParams = std::map<torch::jit::Value*, torch::jit::IValue>;

GraphParams get_named_params(c10::ArrayRef<torch::jit::Value*> inputs, std::vector<torch::jit::IValue> params);

// Converts an already lowered block (blocks with no sub blocks) to
// a serialized TensorRT engine that can be deserialized and run
std::string ConvertBlockToEngine(const torch::jit::Block* b, ConversionInfo build_info, GraphParams& static_params);
std::string ConvertBlockToEngine(
const torch::jit::Block* b,
ConversionInfo build_info,
ir::StaticParams& static_params);

bool OpSupported(const torch::jit::Node* n);

Expand Down
4 changes: 3 additions & 1 deletion core/ir/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ cc_library(
"ir.h"
],
srcs = [
"Input.cpp"
"ir.cpp",
"Input.cpp",
"StaticParams.cpp"
],
deps = [
"@tensorrt//:nvinfer",
Expand Down
11 changes: 9 additions & 2 deletions core/ir/Input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ bool valid_input_dtype(nvinfer1::DataType dtype) {
}
}

Input::Input(std::vector<int64_t> shape, nvinfer1::DataType dtype, nvinfer1::TensorFormat format) {
Input::Input(
std::vector<int64_t> shape,
nvinfer1::DataType dtype,
nvinfer1::TensorFormat format,
bool dtype_is_user_defined) {
if (shape.size() > 5) {
LOG_WARNING("Verify that this dim size is accepted");
}
Expand All @@ -81,14 +85,16 @@ Input::Input(std::vector<int64_t> shape, nvinfer1::DataType dtype, nvinfer1::Ten
<< dtype << ", " << format
<< "), TRTorch only supports contiguous format (NCHW) except with input type Float32 where channel last (NHWC) is also supported");
this->format = format;
this->dtype_is_user_defined = dtype_is_user_defined;
}

Input::Input(
std::vector<int64_t> min_shape,
std::vector<int64_t> opt_shape,
std::vector<int64_t> max_shape,
nvinfer1::DataType dtype,
nvinfer1::TensorFormat format) {
nvinfer1::TensorFormat format,
bool dtype_is_user_defined) {
if (min_shape.size() > 5 || opt_shape.size() > 5 || max_shape.size() > 5) {
LOG_WARNING("Verify that this dim size is accepted");
}
Expand Down Expand Up @@ -132,6 +138,7 @@ Input::Input(
<< dtype << ", " << format
<< "), TRTorch only supports contiguous format (NCHW) except with input type Float32 where channel last (NHWC) is also supported");
this->format = format;
this->dtype_is_user_defined = dtype_is_user_defined;
}

std::ostream& operator<<(std::ostream& os, const Input& input) {
Expand Down
Loading

0 comments on commit 748ecf3

Please sign in to comment.