Commit b47e926

Merge pull request #658 from guoruoqian/fix_runtime_thread_safety
Make TRTorch runtime thread safe
2 parents: 13293b3 + dcf14a7

File tree

4 files changed: 101 additions, 0 deletions

core/runtime/register_trt_op.cpp
core/runtime/runtime.h
tests/cpp/BUILD
tests/cpp/test_runtime_thread_safety.cpp

core/runtime/register_trt_op.cpp

Lines changed: 3 additions & 0 deletions
@@ -112,6 +112,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   }

   c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index());
+
+  // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex for it.
+  std::unique_lock<std::mutex> lock(compiled_engine->mu);
   compiled_engine->exec_ctx->enqueueV2(gpu_handles.data(), stream, nullptr);

   return outputs;
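For context: the comment added above reflects that a single nvinfer1::IExecutionContext is not safe to drive from multiple threads at once, so the patch holds a per-engine lock for the duration of the enqueue. Below is a minimal, self-contained sketch of the same locking pattern; the Engine struct and its enqueue method are hypothetical stand-ins for TRTEngine and enqueueV2, not TRTorch's actual API.

#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

// Hypothetical stand-in for TRTEngine: the mutex lives on the object
// whose underlying call (enqueueV2 in the real code) is not thread safe.
struct Engine {
  std::mutex mu;
  void enqueue(int thread_id) {
    // Same pattern as the patch: the lock is released automatically
    // when it goes out of scope at the end of the call.
    std::unique_lock<std::mutex> lock(mu);
    std::cout << "thread " << thread_id << " enqueued\n";
  }
};

int main() {
  Engine engine;  // one engine shared by many threads
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; i++) {
    threads.emplace_back([&engine, i] { engine.enqueue(i); });
  }
  for (auto& t : threads) {
    t.join();
  }
  return 0;
}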

core/runtime/runtime.h

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,7 @@
 #pragma once
 #include <map>
 #include <memory>
+#include <mutex>
 #include <utility>
 #include "ATen/core/function_schema.h"
 #include "NvInfer.h"
@@ -47,6 +48,7 @@ struct TRTEngine : torch::CustomClassHolder {
   std::shared_ptr<nvinfer1::IExecutionContext> exec_ctx;
   std::pair<uint64_t, uint64_t> num_io;
   std::string name;
+  std::mutex mu;
   CudaDevice device_info;

   std::unordered_map<uint64_t, uint64_t> in_binding_map;
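A note on the design choice here: since mu is a member of TRTEngine rather than a global lock, serialization is per engine instance. Concurrent forward calls on the same compiled engine queue up at enqueueV2, while independent engines remain free to run concurrently.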

tests/cpp/BUILD

Lines changed: 13 additions & 0 deletions
@@ -14,6 +14,7 @@ test_suite(
         ":test_default_input_types",
         ":test_compiled_modules",
         ":test_modules_as_engines",
+        ":test_runtime_thread_safety",
         ":test_multiple_registered_engines",
         ":test_serialization",
         ":test_module_fallback",
@@ -27,6 +28,7 @@ test_suite(
         ":test_default_input_types",
         ":test_compiled_modules",
         ":test_modules_as_engines",
+        ":test_runtime_thread_safety",
         ":test_multiple_registered_engines",
         ":test_serialization",
         ":test_module_fallback",
@@ -95,6 +97,17 @@ cc_test(
     timeout="long"
 )

+cc_test(
+    name = "test_runtime_thread_safety",
+    srcs = ["test_runtime_thread_safety.cpp"],
+    data = [
+        "//tests/modules:jit_models",
+    ],
+    deps = [
+        ":cpp_api_test",
+    ]
+)
+
 cc_test(
     name = "test_module_fallback",
     srcs = ["test_module_fallback.cpp"],
tests/cpp/test_runtime_thread_safety.cpp

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+#include <string>
+#include <thread>
+#include "gtest/gtest.h"
+#include "tests/util/util.h"
+#include "torch/script.h"
+#include "trtorch/trtorch.h"
+
+void run_infer(
+    int thread_id,
+    torch::jit::Module& mod,
+    torch::jit::Module& trt_mod,
+    const std::vector<torch::jit::IValue> inputs,
+    const std::vector<torch::jit::IValue> inputs_trt,
+    std::vector<torch::jit::IValue>& out_vec,
+    std::vector<torch::jit::IValue>& trt_out_vec) {
+  int count = 10;
+  while (count-- > 0) {
+    out_vec[thread_id] = mod.forward(inputs);
+    trt_out_vec[thread_id] = trt_mod.forward(inputs_trt);
+  }
+}
+
+TEST(CppAPITests, RuntimeThreadSafety) {
+  std::string path = "tests/modules/resnet50_traced.jit.pt";
+  torch::jit::Module mod;
+  try {
+    // Deserialize the ScriptModule from a file using torch::jit::load().
+    mod = torch::jit::load(path);
+  } catch (const c10::Error& e) {
+    std::cerr << "error loading the model\n";
+  }
+  mod.eval();
+  mod.to(torch::kCUDA);
+
+  torch::Tensor in_jit = at::randint(5, {1, 3, 224, 224}, torch::kCUDA).to(torch::kFloat);
+  torch::Tensor in_trt = in_jit.clone().to(torch::kFloat);
+
+  std::vector<torch::jit::IValue> inputs_jit;
+  std::vector<torch::jit::IValue> inputs_trt;
+  inputs_jit.push_back(in_jit.clone());
+  inputs_trt.push_back(in_trt.clone());
+
+  std::vector<trtorch::CompileSpec::Input> input_ranges;
+  for (auto in : inputs_trt) {
+    input_ranges.push_back({std::vector<int64_t>{1, 3, 224, 224},
+                            std::vector<int64_t>{1, 3, 224, 224},
+                            std::vector<int64_t>{16, 3, 224, 224},
+                            torch::kFloat});
+  }
+  auto compile_settings = trtorch::CompileSpec(input_ranges);
+
+  // FP32 execution
+  compile_settings.enabled_precisions = {torch::kFloat};
+  compile_settings.strict_types = true;
+  auto trt_mod = trtorch::CompileGraph(mod, compile_settings);
+  std::cout << "trtorch::CompileGraph" << std::endl;
+
+  int num_threads = 10;
+  std::vector<torch::jit::IValue> out_vec(num_threads), trt_out_vec(num_threads);
+  std::vector<std::thread> threads;
+  for (int i = 0; i < num_threads; i++) {
+    threads.push_back(std::thread(
+        run_infer,
+        i,
+        std::ref(mod),
+        std::ref(trt_mod),
+        inputs_jit,
+        inputs_trt,
+        std::ref(out_vec),
+        std::ref(trt_out_vec)));
+  }
+
+  for (int i = 0; i < num_threads; i++) {
+    threads[i].join();
+  }
+
+  bool flag = true;
+  for (int i = 0; i < num_threads; i++) {
+    bool f = trtorch::tests::util::almostEqual(out_vec[i].toTensor(), trt_out_vec[i].toTensor(), 1e-2);
+    flag = flag && f;
+  }
+  ASSERT_TRUE(flag);
+}
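In short, the test spawns 10 threads, each running 10 forward passes through both the original TorchScript module and the TRTorch-compiled module, then asserts that every thread's final TorchScript and TensorRT outputs agree within a 1e-2 tolerance. Without the mutex, concurrent enqueueV2 calls on the shared execution context could race and produce corrupted outputs.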
