Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Inference] Update switch stream logic. #53589

Merged
merged 19 commits into from
May 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 74 additions & 62 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,61 @@
#endif

namespace paddle {
namespace {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Wire an InferGPUContext to the per-stream handles and device properties
// owned by a GPUContextResource, so the context performs all work on the
// resource's stream. Allocators are re-fetched for (place_, stream) so that
// allocations are ordered on that same stream.
//
// gpu_context  - context to (re)configure; not owned here.
// gpu_resource - source of stream, handles and cached device properties.
// place_       - GPU place whose allocators/generator are bound.
void UpdatePrivateDeviceContext(InferGPUContext *gpu_context,
                                GPUContextResource *gpu_resource,
                                Place place_) {
  // Device allocator must match the resource's stream for stream-ordered
  // allocation.
  gpu_context->SetAllocator(memory::allocation::AllocatorFacade::Instance()
                                .GetAllocator(place_, gpu_resource->GetStream())
                                .get());
  gpu_context->SetPinnedAllocator(
      memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CUDAPinnedPlace())
          .get());
  gpu_context->SetHostAllocator(memory::allocation::AllocatorFacade::Instance()
                                    .GetAllocator(platform::CPUPlace())
                                    .get());
  gpu_context->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
                                    .GetZeroAllocator(place_)
                                    .get());
  gpu_context->SetHostZeroAllocator(
      memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(platform::CPUPlace())
          .get());
  gpu_context->SetGenerator(
      phi::DefaultCUDAGenerator(place_.GetDeviceId()).get());
  gpu_context->SetHostGenerator(phi::DefaultCPUGenerator().get());

  // Rebind the stream and every library handle (creators are lazy — the
  // handle is built on first use, already attached to the new stream).
  gpu_context->SetStream(gpu_resource->GetStream());
  gpu_context->SetBlasHandle(gpu_resource->GetBlasHandleCreator());
  gpu_context->SetBlasTensorCoreHandle(
      gpu_resource->GetBlasTensorCoreHandleCreator());
  gpu_context->SetBlasTF32Handle(
      gpu_resource->GetBlasTF32TensorCoreHandleCreator());
  gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator());
  gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator());
  gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator());
  gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice());

  // Propagate cached device properties so kernels can pick launch configs
  // without querying the driver again.
  gpu_context->SetComputeCapability(gpu_resource->GetGpuComputeCapability());
  gpu_context->SetMaxThreadsPerBlock(gpu_resource->GetGpuMaxThreadsPerBlock());
  gpu_context->SetMaxThreadsPerMultiProcessor(
      gpu_resource->GetGpuMaxThreadsPerMp());
  gpu_context->SetMaxGridDimSize(gpu_resource->GetGpuMaxGridDimSize());
  gpu_context->SetMultiProcessors(gpu_resource->GetGPUMultiProcessors());
  gpu_context->SetDriverVersion(gpu_resource->GetGpuDriverVersion());
  gpu_context->SetRuntimeVersion(gpu_resource->GetGpuRuntimeVersion());
  // Fixed log-message typo: "allotor" -> "allocator".
  VLOG(1) << "thread id is " << std::this_thread::get_id() << ", stream id is "
          << reinterpret_cast<void *>(gpu_resource->GetStream())
          << ", allocator ptr is "
          << reinterpret_cast<void *>(
                 memory::allocation::AllocatorFacade::Instance()
                     .GetAllocator(place_, gpu_resource->GetStream())
                     .get());
}
#endif
} // namespace

using inference::Singleton;
#ifdef PADDLE_WITH_TENSORRT
Expand Down Expand Up @@ -451,60 +506,7 @@ void AnalysisPredictor::InitDeviceContexts() {
auto *gpu_resource =
ResourceManager::Instance().GetGPUResource(predictor_stream_);
auto *gpu_context = new InferGPUContext(place_);
gpu_context->SetAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, gpu_resource->GetStream())
.get());
gpu_context->SetPinnedAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
.get());
gpu_context->SetHostAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
gpu_context->SetZeroAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_)
.get());
gpu_context->SetHostZeroAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
gpu_context->SetGenerator(
phi::DefaultCUDAGenerator(place_.GetDeviceId()).get());
gpu_context->SetHostGenerator(phi::DefaultCPUGenerator().get());

gpu_context->SetStream(gpu_resource->GetStream());
gpu_context->SetBlasHandle(gpu_resource->GetBlasHandleCreator());
gpu_context->SetBlasTensorCoreHandle(
gpu_resource->GetBlasTensorCoreHandleCreator());
gpu_context->SetBlasTF32Handle(
gpu_resource->GetBlasTF32TensorCoreHandleCreator());
gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator());
gpu_context->SetSolverHandle(
gpu_resource->GetSolverDnHandleCreator());
gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator());
gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDeviceCreator());
gpu_context->SetComputeCapability(
gpu_resource->GetGpuComputeCapability());
gpu_context->SetMaxThreadsPerBlock(
gpu_resource->GetGpuMaxThreadsPerBlock());
gpu_context->SetMaxThreadsPerMultiProcessor(
gpu_resource->GetGpuMaxThreadsPerMp());
gpu_context->SetMaxGridDimSize(gpu_resource->GetGpuMaxGridDimSize());
gpu_context->SetMultiProcessors(
gpu_resource->GetGPUMultiProcessors());
gpu_context->SetDriverVersion(gpu_resource->GetGpuDriverVersion());
gpu_context->SetRuntimeVersion(gpu_resource->GetGpuRuntimeVersion());
VLOG(1) << "thread id is " << std::this_thread::get_id()
<< ", stream id is "
<< reinterpret_cast<void *>(gpu_resource->GetStream())
<< ", allotor ptr is "
<< reinterpret_cast<void *>(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, gpu_resource->GetStream())
.get());
UpdatePrivateDeviceContext(gpu_context, gpu_resource, place_);
return std::unique_ptr<phi::DeviceContext>(gpu_context);
}));
}
Expand Down Expand Up @@ -2085,17 +2087,27 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
#else
cudaStreamSynchronize(static_cast<gpuStream_t>(predictor_stream_));
#endif
ResourceManager::Instance().GpuResourceReBindStream(predictor_stream_,
ResourceManager::Instance().GpuResourceSwitchStream(predictor_stream_,
stream);
predictor_stream_ = stream;

auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
this->GetDeviceContexts());
auto *dev_ctx =
static_cast<InferGPUContext *>(dev_ctxs->at(place_).get().get());
dev_ctx->SetStream(stream);
auto *dev_ctxs = const_cast<
std::map<phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
reinterpret_cast<const std::map<
phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
this->GetDeviceContexts()));

dev_ctxs->erase(place_);
dev_ctxs->emplace(
place_, std::async(std::launch::deferred, [=] {
auto *gpu_resource =
ResourceManager::Instance().GetGPUResource(predictor_stream_);
auto *gpu_context = new InferGPUContext(place_);
UpdatePrivateDeviceContext(gpu_context, gpu_resource, place_);
return std::unique_ptr<phi::DeviceContext>(gpu_context);
}));
}

return ZeroCopyRun();
Expand Down
137 changes: 21 additions & 116 deletions paddle/fluid/inference/api/resource_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ void GPUContextResource::InitGPUResource(void* stream) {
}

InitGpuProperties();
InitGpuEigenDevice();
}

void GPUContextResource::DestroyGPUResource() {
Expand Down Expand Up @@ -361,90 +362,6 @@ std::array<int, 3> GPUContextResource::GetGpuMaxGridDimSize() const {
return max_grid_dim_size_;
}

// Point this resource at an externally owned stream. Ownership is dropped
// (owned_stream_ = false) so the previous stream will not be destroyed by
// this resource.
void GPUContextResource::ReBindStream(gpuStream_t stream) {
  owned_stream_ = false;
  stream_ = stream;
}

// Re-attach the cuDNN (or MIOpen on HIP) handle to `stream`.
// No-op if the handle was never created (lazy initialization).
void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const {
  if (dnn_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::miopenSetStream(dnn_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cudnnSetStream(dnn_handle_, stream));
#endif
  }
}

// Re-attach the default cuBLAS (rocBLAS on HIP) handle to `stream`.
// No-op if the handle was never created.
void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const {
  if (blas_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::rocblas_set_stream(blas_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_handle_, stream));
#endif
  }
}

// Re-attach the tensor-core cuBLAS (rocBLAS on HIP) handle to `stream`.
// No-op if the handle was never created.
void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const {
  if (blas_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream));
#endif
  }
}

// Re-attach the TF32 tensor-core cuBLAS (rocBLAS on HIP) handle to `stream`.
// No-op if the handle was never created.
void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const {
  if (blas_tf32_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_set_stream(
        blas_tf32_tensor_core_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream));
#endif
  }
}

// Re-attach the cuSOLVER handle to `stream`. CUDA only — there is no
// HIP counterpart here, so on HIP builds this is a no-op.
void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const {
  if (solver_handle_) {
#ifndef PADDLE_WITH_HIP
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cusolverDnSetStream(solver_handle_, stream));
#endif
  }
}

// Re-attach the cuSPARSE handle to `stream`. Only compiled in for
// CUDA >= 11.0 (where the generic cuSPARSE APIs are available).
void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
  if (sparse_handle_) {
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 11000
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cusparseSetStream(sparse_handle_, stream));
#endif
#endif
  }
}

// Reinitialize the Eigen GPU stream wrapper with the new stream.
// NOTE(review): the allocator is fetched for the member `place_` while
// Reinitialize receives the `place` parameter — presumably these are always
// the same place; confirm against callers.
void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
                                           GPUPlace place) const {
  if (eigen_stream_) {
    auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
                          .GetAllocator(place_)
                          .get();
    eigen_stream_->Reinitialize(stream, allocator, place);
  }
}

#endif

void ResourceManager::InitCPUResource() {
Expand Down Expand Up @@ -486,24 +403,16 @@ void ResourceManager::DestroyGPUResource(void* stream) {
}

// Drop one reference on `stream`. Unknown streams are ignored rather than
// enforced: a stream may already have been switched away by another
// predictor sharing it. When the last reference is released, both the
// ref-count entry and (if still present) the stream's GPU resource are
// destroyed.
// NOTE: this span contained diff residue (old PADDLE_ENFORCE variant mixed
// with the new tolerant variant); this is the coherent post-change version.
void ResourceManager::Decrease(void* stream) {
  if (ref_count_.count(stream) == 0) return;
  --ref_count_[stream];

  if (ref_count_[stream] == 0) {
    ref_count_.erase(stream);
    if (gpu_resources_.count(stream) > 0) gpu_resources_.erase(stream);
  }
}

// Add one reference on `stream`; the counter is value-initialized to zero
// on first use, so a previously unseen stream simply starts at 1.
// NOTE: this span contained diff residue (old PADDLE_ENFORCE variant mixed
// with the new one-liner); this is the coherent post-change version.
void ResourceManager::Increase(void* stream) { ++ref_count_[stream]; }

GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
PADDLE_ENFORCE_EQ(gpu_resources_.count(stream),
Expand All @@ -513,33 +422,29 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
return gpu_resources_.at(stream).get();
}

void ResourceManager::GpuResourceReBindStream(void* old_stream,
void ResourceManager::GpuResourceSwitchStream(void* old_stream,
void* new_stream) {
// NOTE: add lock to support stream rebind in multi-thread
std::lock_guard<std::mutex> lock_gurad(gpu_mutex_);
if (old_stream == new_stream) return;
PADDLE_ENFORCE_EQ(
gpu_resources_.count(old_stream),
true,
platform::errors::InvalidArgument(
"The stream[%p] not found in gpu_resources.", old_stream));
auto gpu_resource = std::move(gpu_resources_.at(old_stream));
DestroyGPUResource(old_stream);
PADDLE_ENFORCE_EQ(
ref_count_.count(old_stream),
0,
platform::errors::Fatal("gpu resources rebind stream failed."));

gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindBlasTensorCoreHandle(
static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
gpu_resource->Place());

ref_count_[new_stream]++;
gpu_resources_.emplace(new_stream, std::move(gpu_resource));
// NOTE: stream may be used by multiple predictor, skip resource
// operation if resource of new_stream is already exists
bool new_stream_existed = gpu_resources_.count(new_stream) > 0;
if (!new_stream_existed) {
auto place = gpu_resources_.at(old_stream)->Place();
std::unique_ptr<GPUContextResource> resource{
new GPUContextResource(place, new_stream)};
gpu_resources_.emplace(new_stream, std::move(resource));
}

Decrease(old_stream);
Increase(new_stream);
}

int ResourceManager::RefCount(void* stream) const {
Expand Down
12 changes: 1 addition & 11 deletions paddle/fluid/inference/api/resource_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,6 @@ class GPUContextResource {
int GetGpuMaxThreadsPerBlock() const;
std::array<int, 3> GetGpuMaxGridDimSize() const;

// If the stream changes, we need to rebind all handles to the new stream.
void ReBindStream(gpuStream_t stream);
void ReBindDnnHandle(gpuStream_t stream) const;
void ReBindBlasHandle(gpuStream_t stream) const;
void ReBindBlasTensorCoreHandle(gpuStream_t stream) const;
void ReBindBlasTF32Handle(gpuStream_t stream) const;
void ReBindSolverDnHandle(gpuStream_t stream) const;
void ReBindSparseHandle(gpuStream_t stream) const;
void ReBindEigenDevice(gpuStream_t stream, GPUPlace place) const;

private:
void InitGPUResource(void* stream);
void DestroyGPUResource();
Expand Down Expand Up @@ -186,7 +176,7 @@ class ResourceManager {
void DestroyGPUResource(void* stream);
GPUContextResource* GetGPUResource(void* stream) const;
int RefCount(void* stream) const;
void GpuResourceReBindStream(void* old_stream, void* new_stream);
void GpuResourceSwitchStream(void* old_stream, void* new_stream);

private:
void Decrease(void* stream);
Expand Down
16 changes: 16 additions & 0 deletions test/cpp/inference/api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,18 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c)
endif()

#TODO(inference): windows encounter a SEH error, we need to fix it.
if(NOT WIN32)
inference_analysis_test(
trt_rebind_stream_test
SRCS
trt_rebind_stream_test.cc
EXTRA_DEPS
paddle_inference_shared
ARGS
--infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
endif()

set(TRT_MODEL_QUANT_RESNET_DIR
"${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
if(NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz)
Expand Down Expand Up @@ -1378,6 +1390,10 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
endif()
if(WITH_GPU AND TENSORRT_FOUND)
set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 240)
if(NOT WIN32)
set_tests_properties(trt_rebind_stream_test
PROPERTIES TIMEOUT 360 LABELS "RUN_TYPE=EXCLUSIVE")
endif()
if(WITH_MKLDNN)
set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT
120)
Expand Down
Loading