Skip to content

Commit

Permalink
add batch stream (PaddlePaddle#44524)
Browse files Browse the repository at this point in the history
  • Loading branch information
jiweibo authored and Aurelius84 committed Jul 29, 2022
1 parent e2f3b50 commit 0357e66
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 2 deletions.
29 changes: 27 additions & 2 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/resource_manager.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/inference/utils/model_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
Expand All @@ -56,6 +57,7 @@
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/string/split.h"

#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
Expand Down Expand Up @@ -1618,8 +1620,31 @@ bool AnalysisPredictor::ZeroCopyRun() {

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. "
"Please use config.SetExecStream instead.";
if (!private_context_) {
PADDLE_THROW(platform::errors::Fatal(
"Please use config.SetExecStream to init gpu resources, and then we "
"will bind gpu resources to execution stream."));
}

if (stream != predictor_stream_) {
#ifdef PADDLE_WITH_HIP
hipStreamSynchronize(static_cast<gpuStream_t>(predictor_stream_));
#else
cudaStreamSynchronize(static_cast<gpuStream_t>(predictor_stream_));
#endif
ResourceManager::Instance().GpuResourceReBindStream(predictor_stream_,
stream);
predictor_stream_ = stream;

auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
this->GetDeviceContexts());
auto *dev_ctx =
static_cast<InferGPUContext *>(dev_ctxs->at(place_).get().get());
dev_ctx->SetStream(stream);
}

return ZeroCopyRun();
}
#endif
Expand Down
112 changes: 112 additions & 0 deletions paddle/fluid/inference/api/resource_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,29 @@
#include <memory>
#include <mutex>
#include <unordered_map>
#include <utility>

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/phi/backends/gpu/forwards.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_resources.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/errors.h"
#include "paddle/phi/core/generator.h"
#include "unsupported/Eigen/CXX11/Tensor"

#include "paddle/fluid/platform/enforce.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/dynload/cublas.h"
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/dynload/cusparse.h"
#endif // PADDLE_WITH_CUDA

namespace paddle {
namespace internal {

Expand Down Expand Up @@ -237,6 +249,8 @@ void GPUContextResource::DestroySparseHandle() {
phi::DestroySparseHandle(sparse_handle_);
}

phi::Place GPUContextResource::Place() const { return place_; }

gpuStream_t GPUContextResource::GetStream() const { return stream_; }

dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; }
Expand Down Expand Up @@ -291,6 +305,75 @@ std::array<int, 3> GPUContextResource::GetGpuMaxGridDimSize() const {
return max_grid_dim_size_;
}

void GPUContextResource::ReBindStream(gpuStream_t stream) {
owned_stream_ = false;
stream_ = stream;
}

void GPUContextResource::ReBindDnnHandle(gpuStream_t stream) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenSetStream(dnn_handle_, stream));
#else
PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnSetStream(dnn_handle_, stream));
#endif
}

void GPUContextResource::ReBindBlasHandle(gpuStream_t stream) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::rocblas_set_stream(blas_handle_, stream));
#else
PADDLE_RETRY_CUDA_SUCCESS(
phi::dynload::cublasSetStream(blas_handle_, stream));
#endif
}

void GPUContextResource::ReBindBlasTensorCoreHandle(gpuStream_t stream) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::rocblas_set_stream(blas_tensor_core_handle_, stream));
#else
PADDLE_RETRY_CUDA_SUCCESS(
phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream));
#endif
}

void GPUContextResource::ReBindBlasTF32Handle(gpuStream_t stream) const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::rocblas_set_stream(blas_tf32_tensor_core_handle_, stream));
#else
PADDLE_RETRY_CUDA_SUCCESS(
phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream));
#endif
}

void GPUContextResource::ReBindSolverDnHandle(gpuStream_t stream) const {
#ifndef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(
phi::dynload::cusolverDnSetStream(solver_handle_, stream));
#endif
}

void GPUContextResource::ReBindSparseHandle(gpuStream_t stream) const {
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(
phi::dynload::cusparseSetStream(sparse_handle_, stream));
#endif
#endif
}

void GPUContextResource::ReBindEigenDevice(gpuStream_t stream,
GPUPlace place) const {
auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place_)
.get();
eigen_stream_->Reinitialize(stream, allocator, place);
}

#endif

void ResourceManager::InitCPUResource() {
Expand Down Expand Up @@ -359,6 +442,35 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
return gpu_resources_.at(stream).get();
}

void ResourceManager::GpuResourceReBindStream(void* old_stream,
void* new_stream) {
PADDLE_ENFORCE_EQ(
gpu_resources_.count(old_stream),
true,
platform::errors::InvalidArgument(
"The stream[%p] not found in gpu_resources.", old_stream));
auto gpu_resource = std::move(gpu_resources_.at(old_stream));
DestroyGPUResource(old_stream);
PADDLE_ENFORCE_EQ(
ref_count_.count(old_stream),
0,
platform::errors::Fatal("gpu resources rebind stream failed."));

gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindBlasTensorCoreHandle(
static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
gpu_resource->Place());

ref_count_[new_stream]++;
gpu_resources_.emplace(new_stream, std::move(gpu_resource));
}

int ResourceManager::RefCount(void* stream) const {
if (ref_count_.count(stream) == 0) return 0;
return ref_count_.at(stream);
Expand Down
13 changes: 13 additions & 0 deletions paddle/fluid/inference/api/resource_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/backends/cpu/forwards.h"
#include "paddle/phi/common/place.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
Expand Down Expand Up @@ -52,6 +53,7 @@ class GPUContextResource {
public:
explicit GPUContextResource(const phi::Place& place, void* stream);
~GPUContextResource();
phi::Place Place() const;

gpuStream_t GetStream() const;
dnnHandle_t GetDnnHandle() const;
Expand All @@ -70,6 +72,16 @@ class GPUContextResource {
int GetGpuMaxThreadsPerBlock() const;
std::array<int, 3> GetGpuMaxGridDimSize() const;

// If stream changes, we need to rebind all handle to new stream.
void ReBindStream(gpuStream_t stream);
void ReBindDnnHandle(gpuStream_t stream) const;
void ReBindBlasHandle(gpuStream_t stream) const;
void ReBindBlasTensorCoreHandle(gpuStream_t stream) const;
void ReBindBlasTF32Handle(gpuStream_t stream) const;
void ReBindSolverDnHandle(gpuStream_t stream) const;
void ReBindSparseHandle(gpuStream_t stream) const;
void ReBindEigenDevice(gpuStream_t stream, GPUPlace place) const;

private:
void InitGPUResource(void* stream);
void DestroyGPUResource();
Expand Down Expand Up @@ -138,6 +150,7 @@ class ResourceManager {
void DestroyGPUResource(void* stream);
GPUContextResource* GetGPUResource(void* stream) const;
int RefCount(void* stream) const;
void GpuResourceReBindStream(void* old_stream, void* new_stream);

private:
void Decrease(void* stream);
Expand Down

0 comments on commit 0357e66

Please sign in to comment.