Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Inference] Update switch stream logic. #53589

Merged
merged 19 commits into from
May 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 74 additions & 62 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,61 @@
#endif

namespace paddle {
namespace {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Wire an InferGPUContext to the per-stream handles and device properties
// owned by a GPUContextResource, so the context performs all work on the
// resource's stream. Allocators are re-fetched for (place_, stream) so that
// allocations are ordered on that same stream.
//
// gpu_context  - context to (re)configure; not owned here.
// gpu_resource - source of stream, handles and cached device properties.
// place_       - GPU place whose allocators/generator are bound.
void UpdatePrivateDeviceContext(InferGPUContext *gpu_context,
                                GPUContextResource *gpu_resource,
                                Place place_) {
  // Device allocator must match the resource's stream for stream-ordered
  // allocation.
  gpu_context->SetAllocator(memory::allocation::AllocatorFacade::Instance()
                                .GetAllocator(place_, gpu_resource->GetStream())
                                .get());
  gpu_context->SetPinnedAllocator(
      memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(paddle::platform::CUDAPinnedPlace())
          .get());
  gpu_context->SetHostAllocator(memory::allocation::AllocatorFacade::Instance()
                                    .GetAllocator(platform::CPUPlace())
                                    .get());
  gpu_context->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
                                    .GetZeroAllocator(place_)
                                    .get());
  gpu_context->SetHostZeroAllocator(
      memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(platform::CPUPlace())
          .get());
  gpu_context->SetGenerator(
      phi::DefaultCUDAGenerator(place_.GetDeviceId()).get());
  gpu_context->SetHostGenerator(phi::DefaultCPUGenerator().get());

  // Rebind the stream and every library handle (creators are lazy — the
  // handle is built on first use, already attached to the new stream).
  gpu_context->SetStream(gpu_resource->GetStream());
  gpu_context->SetBlasHandle(gpu_resource->GetBlasHandleCreator());
  gpu_context->SetBlasTensorCoreHandle(
      gpu_resource->GetBlasTensorCoreHandleCreator());
  gpu_context->SetBlasTF32Handle(
      gpu_resource->GetBlasTF32TensorCoreHandleCreator());
  gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator());
  gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator());
  gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator());
  gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice());

  // Propagate cached device properties so kernels can pick launch configs
  // without querying the driver again.
  gpu_context->SetComputeCapability(gpu_resource->GetGpuComputeCapability());
  gpu_context->SetMaxThreadsPerBlock(gpu_resource->GetGpuMaxThreadsPerBlock());
  gpu_context->SetMaxThreadsPerMultiProcessor(
      gpu_resource->GetGpuMaxThreadsPerMp());
  gpu_context->SetMaxGridDimSize(gpu_resource->GetGpuMaxGridDimSize());
  gpu_context->SetMultiProcessors(gpu_resource->GetGPUMultiProcessors());
  gpu_context->SetDriverVersion(gpu_resource->GetGpuDriverVersion());
  gpu_context->SetRuntimeVersion(gpu_resource->GetGpuRuntimeVersion());
  // Fixed log-message typo: "allotor" -> "allocator".
  VLOG(1) << "thread id is " << std::this_thread::get_id() << ", stream id is "
          << reinterpret_cast<void *>(gpu_resource->GetStream())
          << ", allocator ptr is "
          << reinterpret_cast<void *>(
                 memory::allocation::AllocatorFacade::Instance()
                     .GetAllocator(place_, gpu_resource->GetStream())
                     .get());
}
#endif
} // namespace

using inference::Singleton;
#ifdef PADDLE_WITH_TENSORRT
Expand Down Expand Up @@ -451,60 +506,7 @@ void AnalysisPredictor::InitDeviceContexts() {
auto *gpu_resource =
ResourceManager::Instance().GetGPUResource(predictor_stream_);
auto *gpu_context = new InferGPUContext(place_);
gpu_context->SetAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, gpu_resource->GetStream())
.get());
gpu_context->SetPinnedAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
.get());
gpu_context->SetHostAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
gpu_context->SetZeroAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_)
.get());
gpu_context->SetHostZeroAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
gpu_context->SetGenerator(
phi::DefaultCUDAGenerator(place_.GetDeviceId()).get());
gpu_context->SetHostGenerator(phi::DefaultCPUGenerator().get());

gpu_context->SetStream(gpu_resource->GetStream());
gpu_context->SetBlasHandle(gpu_resource->GetBlasHandleCreator());
gpu_context->SetBlasTensorCoreHandle(
gpu_resource->GetBlasTensorCoreHandleCreator());
gpu_context->SetBlasTF32Handle(
gpu_resource->GetBlasTF32TensorCoreHandleCreator());
gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator());
gpu_context->SetSolverHandle(
gpu_resource->GetSolverDnHandleCreator());
gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator());
gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDeviceCreator());
gpu_context->SetComputeCapability(
gpu_resource->GetGpuComputeCapability());
gpu_context->SetMaxThreadsPerBlock(
gpu_resource->GetGpuMaxThreadsPerBlock());
gpu_context->SetMaxThreadsPerMultiProcessor(
gpu_resource->GetGpuMaxThreadsPerMp());
gpu_context->SetMaxGridDimSize(gpu_resource->GetGpuMaxGridDimSize());
gpu_context->SetMultiProcessors(
gpu_resource->GetGPUMultiProcessors());
gpu_context->SetDriverVersion(gpu_resource->GetGpuDriverVersion());
gpu_context->SetRuntimeVersion(gpu_resource->GetGpuRuntimeVersion());
VLOG(1) << "thread id is " << std::this_thread::get_id()
<< ", stream id is "
<< reinterpret_cast<void *>(gpu_resource->GetStream())
<< ", allotor ptr is "
<< reinterpret_cast<void *>(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_, gpu_resource->GetStream())
.get());
UpdatePrivateDeviceContext(gpu_context, gpu_resource, place_);
return std::unique_ptr<phi::DeviceContext>(gpu_context);
}));
}
Expand Down Expand Up @@ -2085,17 +2087,27 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
#else
cudaStreamSynchronize(static_cast<gpuStream_t>(predictor_stream_));
#endif
ResourceManager::Instance().GpuResourceReBindStream(predictor_stream_,
ResourceManager::Instance().GpuResourceSwitchStream(predictor_stream_,
stream);
predictor_stream_ = stream;

auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
this->GetDeviceContexts());
auto *dev_ctx =
static_cast<InferGPUContext *>(dev_ctxs->at(place_).get().get());
dev_ctx->SetStream(stream);
auto *dev_ctxs = const_cast<
std::map<phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
reinterpret_cast<const std::map<
phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
this->GetDeviceContexts()));

dev_ctxs->erase(place_);
dev_ctxs->emplace(
place_, std::async(std::launch::deferred, [=] {
auto *gpu_resource =
ResourceManager::Instance().GetGPUResource(predictor_stream_);
auto *gpu_context = new InferGPUContext(place_);
UpdatePrivateDeviceContext(gpu_context, gpu_resource, place_);
return std::unique_ptr<phi::DeviceContext>(gpu_context);
}));
}

return ZeroCopyRun();
Expand Down
137 changes: 21 additions & 116 deletions paddle/fluid/inference/api/resource_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ void GPUContextResource::InitGPUResource(void* stream) {
}

InitGpuProperties();
InitGpuEigenDevice();
}

void GPUContextResource::DestroyGPUResource() {
Expand Down Expand Up @@ -361,90 +362,6 @@ std::array<int, 3> GPUContextResource::GetGpuMaxGridDimSize() const {
return max_grid_dim_size_;
}

// Point this resource at an externally owned stream. Ownership is dropped
// (owned_stream_ = false) so the previous stream will not be destroyed by
// this resource.
void GPUContextResource::ReBindStream(gpuStream_t stream) {
  owned_stream_ = false;
  stream_ = stream;
}

// Re-attach the cuDNN (or MIOpen on HIP) handle to `stream`.
// No-op if the handle was never created (lazy initialization).
void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const {
  if (dnn_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::miopenSetStream(dnn_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cudnnSetStream(dnn_handle_, stream));
#endif
  }
}

// Re-attach the default cuBLAS (rocBLAS on HIP) handle to `stream`.
// No-op if the handle was never created.
void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const {
  if (blas_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::rocblas_set_stream(blas_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_handle_, stream));
#endif
  }
}

// Re-attach the tensor-core cuBLAS (rocBLAS on HIP) handle to `stream`.
// No-op if the handle was never created.
void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const {
  if (blas_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream));
#endif
  }
}

// Re-attach the TF32 tensor-core cuBLAS (rocBLAS on HIP) handle to `stream`.
// No-op if the handle was never created.
void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const {
  if (blas_tf32_tensor_core_handle_) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_set_stream(
        blas_tf32_tensor_core_handle_, stream));
#else
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream));
#endif
  }
}

// Re-attach the cuSOLVER handle to `stream`. CUDA only — there is no
// HIP counterpart here, so on HIP builds this is a no-op.
void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const {
  if (solver_handle_) {
#ifndef PADDLE_WITH_HIP
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cusolverDnSetStream(solver_handle_, stream));
#endif
  }
}

// Re-attach the cuSPARSE handle to `stream`. Only compiled in for
// CUDA >= 11.0 (where the generic cuSPARSE APIs are available).
void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
  if (sparse_handle_) {
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 11000
    PADDLE_RETRY_CUDA_SUCCESS(
        phi::dynload::cusparseSetStream(sparse_handle_, stream));
#endif
#endif
  }
}

// Reinitialize the Eigen GPU stream wrapper with the new stream.
// NOTE(review): the allocator is fetched for the member `place_` while
// Reinitialize receives the `place` parameter — presumably these are always
// the same place; confirm against callers.
void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
                                           GPUPlace place) const {
  if (eigen_stream_) {
    auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
                          .GetAllocator(place_)
                          .get();
    eigen_stream_->Reinitialize(stream, allocator, place);
  }
}

#endif

void ResourceManager::InitCPUResource() {
Expand Down Expand Up @@ -486,24 +403,16 @@ void ResourceManager::DestroyGPUResource(void* stream) {
}

// Drop one reference on `stream`. Unknown streams are ignored rather than
// enforced: a stream may already have been switched away by another
// predictor sharing it. When the last reference is released, both the
// ref-count entry and (if still present) the stream's GPU resource are
// destroyed.
// NOTE: this span contained diff residue (old PADDLE_ENFORCE variant mixed
// with the new tolerant variant); this is the coherent post-change version.
void ResourceManager::Decrease(void* stream) {
  if (ref_count_.count(stream) == 0) return;
  --ref_count_[stream];

  if (ref_count_[stream] == 0) {
    ref_count_.erase(stream);
    if (gpu_resources_.count(stream) > 0) gpu_resources_.erase(stream);
  }
}

// Add one reference on `stream`; the counter is value-initialized to zero
// on first use, so a previously unseen stream simply starts at 1.
// NOTE: this span contained diff residue (old PADDLE_ENFORCE variant mixed
// with the new one-liner); this is the coherent post-change version.
void ResourceManager::Increase(void* stream) { ++ref_count_[stream]; }

GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
PADDLE_ENFORCE_EQ(gpu_resources_.count(stream),
Expand All @@ -513,33 +422,29 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
return gpu_resources_.at(stream).get();
}

void ResourceManager::GpuResourceReBindStream(void* old_stream,
void ResourceManager::GpuResourceSwitchStream(void* old_stream,
void* new_stream) {
// NOTE: add lock to support stream rebind in multi-thread
std::lock_guard<std::mutex> lock_gurad(gpu_mutex_);
if (old_stream == new_stream) return;
PADDLE_ENFORCE_EQ(
gpu_resources_.count(old_stream),
true,
platform::errors::InvalidArgument(
"The stream[%p] not found in gpu_resources.", old_stream));
auto gpu_resource = std::move(gpu_resources_.at(old_stream));
DestroyGPUResource(old_stream);
PADDLE_ENFORCE_EQ(
ref_count_.count(old_stream),
0,
platform::errors::Fatal("gpu resources rebind stream failed."));

gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindBlasTensorCoreHandle(
static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
gpu_resource->Place());

ref_count_[new_stream]++;
gpu_resources_.emplace(new_stream, std::move(gpu_resource));
// NOTE: stream may be used by multiple predictor, skip resource
// operation if resource of new_stream is already exists
bool new_stream_existed = gpu_resources_.count(new_stream) > 0;
if (!new_stream_existed) {
auto place = gpu_resources_.at(old_stream)->Place();
std::unique_ptr<GPUContextResource> resource{
new GPUContextResource(place, new_stream)};
gpu_resources_.emplace(new_stream, std::move(resource));
}

Decrease(old_stream);
Increase(new_stream);
}

int ResourceManager::RefCount(void* stream) const {
Expand Down
12 changes: 1 addition & 11 deletions paddle/fluid/inference/api/resource_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,6 @@ class GPUContextResource {
int GetGpuMaxThreadsPerBlock() const;
std::array<int, 3> GetGpuMaxGridDimSize() const;

// If the stream changes, we need to rebind all handles to the new stream.
void ReBindStream(gpuStream_t stream);
void ReBindDnnHandle(gpuStream_t stream) const;
void ReBindBlasHandle(gpuStream_t stream) const;
void ReBindBlasTensorCoreHandle(gpuStream_t stream) const;
void ReBindBlasTF32Handle(gpuStream_t stream) const;
void ReBindSolverDnHandle(gpuStream_t stream) const;
void ReBindSparseHandle(gpuStream_t stream) const;
void ReBindEigenDevice(gpuStream_t stream, GPUPlace place) const;

private:
void InitGPUResource(void* stream);
void DestroyGPUResource();
Expand Down Expand Up @@ -186,7 +176,7 @@ class ResourceManager {
void DestroyGPUResource(void* stream);
GPUContextResource* GetGPUResource(void* stream) const;
int RefCount(void* stream) const;
void GpuResourceReBindStream(void* old_stream, void* new_stream);
void GpuResourceSwitchStream(void* old_stream, void* new_stream);

private:
void Decrease(void* stream);
Expand Down
16 changes: 16 additions & 0 deletions test/cpp/inference/api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,18 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c)
endif()

#TODO(inference): windows encounter a SEH error, we need to fix it.
if(NOT WIN32)
inference_analysis_test(
trt_rebind_stream_test
SRCS
trt_rebind_stream_test.cc
EXTRA_DEPS
paddle_inference_shared
ARGS
--infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
endif()

set(TRT_MODEL_QUANT_RESNET_DIR
"${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
if(NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz)
Expand Down Expand Up @@ -1378,6 +1390,10 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
endif()
if(WITH_GPU AND TENSORRT_FOUND)
set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 240)
if(NOT WIN32)
set_tests_properties(trt_rebind_stream_test
PROPERTIES TIMEOUT 360 LABELS "RUN_TYPE=EXCLUSIVE")
endif()
if(WITH_MKLDNN)
set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT
120)
Expand Down
Loading