Add basic support for CUDA Graph #36190

Merged: 7 commits, Sep 29, 2021
6 changes: 5 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
@@ -82,7 +82,11 @@ endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy )
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
+
+if (WITH_GPU)
+  target_link_libraries(allocator_facade cuda_graph)
+endif()

cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
if (WITH_TESTING)
147 changes: 137 additions & 10 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -32,6 +32,9 @@
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu/xpu_info.h"
#endif
@@ -47,17 +50,64 @@ PADDLE_DEFINE_EXPORTED_bool(
"Whether to use system allocator to allocate CPU and GPU memory. "
"Only used for unittests.");

DECLARE_string(allocator_strategy);

namespace paddle {
namespace memory {
namespace allocation {

#ifdef PADDLE_WITH_CUDA
class CUDAGraphAllocator
: public Allocator,
public std::enable_shared_from_this<CUDAGraphAllocator> {
private:
class PrivateAllocation : public Allocation {
public:
PrivateAllocation(CUDAGraphAllocator* allocator,
AllocationPtr underlying_allocation)
: Allocation(underlying_allocation->ptr(),
underlying_allocation->size(),
underlying_allocation->place()),
allocator_(allocator->shared_from_this()),
underlying_allocation_(std::move(underlying_allocation)) {}

private:
std::shared_ptr<Allocator> allocator_;
AllocationPtr underlying_allocation_;
};

explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
: underlying_allocator_(allocator) {}

public:
static std::shared_ptr<Allocator> Create(
const std::shared_ptr<Allocator>& allocator) {
return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
}

protected:
Allocation* AllocateImpl(size_t size) {
VLOG(10) << "Allocate " << size << " for CUDA Graph";
return new PrivateAllocation(this, underlying_allocator_->Allocate(size));
}

void FreeImpl(Allocation* allocation) {
VLOG(10) << "delete for CUDA Graph";
delete allocation;
}

private:
std::shared_ptr<Allocator> underlying_allocator_;
};
#endif
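Note: CUDAGraphAllocator is a thin decorator over each place's normal allocator. Every PrivateAllocation keeps a shared_ptr obtained through shared_from_this(), so the graph's memory pool cannot be destroyed while any block handed out during capture is still alive. A minimal sketch of that pinning pattern in plain C++ (illustrative only, not Paddle's Allocator API):

#include <cstddef>
#include <memory>

// A block owns a reference back to its allocator, so the allocator (and the
// pool behind it) outlives every block it produced.
struct Block {
  void* ptr;
  std::shared_ptr<void> pool;  // pins the producing allocator
};

class RetainingAllocator
    : public std::enable_shared_from_this<RetainingAllocator> {
 public:
  Block Allocate(std::size_t size) {
    // shared_from_this() requires that this object is already owned by a
    // shared_ptr, e.g. auto a = std::make_shared<RetainingAllocator>();
    return Block{::operator new(size), shared_from_this()};
  }
  void Free(Block* b) { ::operator delete(b->ptr); }
};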

class AllocatorFacadePrivate {
public:
using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

-AllocatorFacadePrivate() {
-auto strategy = GetAllocatorStrategy();
-switch (strategy) {
+explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
+strategy_ = GetAllocatorStrategy();
+switch (strategy_) {
case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
@@ -91,7 +141,8 @@ class AllocatorFacadePrivate {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
-InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id));
+InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
+allow_free_idle_chunk);
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
@@ -117,7 +168,7 @@

default: {
PADDLE_THROW(platform::errors::InvalidArgument(
-"Unsupported allocator strategy: %d", static_cast<int>(strategy)));
+"Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
}
}
InitZeroSizeAllocators();
@@ -130,11 +181,29 @@
CheckAllocThreadSafe();
}

inline const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_allocator_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
return iter->second->allocators_;
} else {
return allocators_;
}
#else
return allocators_;
#endif
}
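Note: GetAllocatorMap() is the central dispatch point of this PR: while a capture is in flight, every allocation request is transparently redirected to the pool registered under the capturing graph's ID, so op kernels need no changes. A usage sketch of the surrounding Prepare/Remove hooks follows at the end of this file.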

inline const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, size_t size) {
const auto& allocators =
(size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
-: allocators_)
+: GetAllocatorMap())
: zero_size_allocators_);
auto iter = allocators.find(place);
PADDLE_ENFORCE_NE(iter, allocators.end(),
@@ -145,6 +214,7 @@

private:
void InitSystemAllocators() {
if (!system_allocators_.empty()) return;
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
@@ -183,10 +253,11 @@
allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
}

-void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) {
+void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
+bool allow_free_idle_chunk) {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-cuda_allocator, platform::GpuMinChunkSize());
+cuda_allocator, platform::GpuMinChunkSize(), /*chunk_size=*/0,
+allow_free_idle_chunk);
}
#endif

@@ -226,6 +297,7 @@
};

void InitZeroSizeAllocators() {
if (!zero_size_allocators_.empty()) return;
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -279,12 +351,57 @@
}
}

#ifdef PADDLE_WITH_CUDA

public:
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when the "
"FLAGS_allocator_strategy=\"auto_growth\", but got "
"FLAGS_allocator_strategy=\"%s\"",
FLAGS_allocator_strategy));
auto& allocator = cuda_graph_allocator_map_[id];
PADDLE_ENFORCE_EQ(
allocator.get(), nullptr,
platform::errors::InvalidArgument(
"The memory pool of the CUDA Graph with ID %d has already been prepared.",
id));
allocator.reset(
new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
for (auto& item : allocator->allocators_) {
auto& old_allocator = item.second;
old_allocator = CUDAGraphAllocator::Create(old_allocator);
}
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}

void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
platform::errors::InvalidArgument(
"Cannot find CUDA Graph with ID = %d", id));
cuda_graph_allocator_map_.erase(iter);
VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif

private:
AllocatorMap allocators_;
-AllocatorMap zero_size_allocators_;
-AllocatorMap system_allocators_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_allocator_map_;
#endif
AllocatorStrategy strategy_;

+static AllocatorMap zero_size_allocators_;
+static AllocatorMap system_allocators_;
};

AllocatorFacadePrivate::AllocatorMap
AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;

// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// delete m_ may cause core dump when the destructor of python in conflict with
@@ -316,6 +433,16 @@
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
return m_->PrepareMemoryPoolForCUDAGraph(id);
}

void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
return m_->RemoveMemoryPoolOfCUDAGraph(id);
}
#endif

} // namespace allocation
} // namespace memory
} // namespace paddle
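Note: a minimal sketch of how the two new hooks bracket a capture. It assumes the facade's singleton accessor AllocatorFacade::Instance() and a CUDAGraphID obtained from the capture API in paddle/fluid/platform/cuda_graph.h; neither appears in this excerpt, and the capture/replay steps are elided:

#include "paddle/fluid/memory/allocation/allocator_facade.h"

// Hypothetical flow: give the graph a private memory pool before capture
// starts, and drop that pool only once the graph itself has been released.
void RunWithPrivatePool(CUDAGraphID id) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  facade.PrepareMemoryPoolForCUDAGraph(id);  // capture-time allocations now
                                             // come from a dedicated pool
  // ... begin stream capture, run the region, end capture, replay ...
  facade.RemoveMemoryPoolOfCUDAGraph(id);    // after the graph is destroyed
}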
8 changes: 8 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.h
@@ -18,6 +18,9 @@
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif
#include "paddle/fluid/platform/place.h"

namespace paddle {
@@ -54,6 +57,11 @@ class AllocatorFacade {
uint64_t Release(const platform::Place& place);
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

#ifdef PADDLE_WITH_CUDA
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
#endif

// TODO(yy): Allocate a Copy-On-Write allocation?
private:
AllocatorFacade();
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -39,11 +39,12 @@ namespace allocation {

AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
-size_t chunk_size)
+size_t chunk_size, bool allow_free_idle_chunk)
: underlying_allocator_(
std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
alignment_(alignment),
-chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)) {}
+chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
+allow_free_idle_chunk_(allow_free_idle_chunk) {}

Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
size = AlignedSize(size, alignment_);
@@ -139,6 +140,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
}

uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
if (!allow_free_idle_chunk_) {
return 0;
}
uint64_t bytes = 0;
for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) {
auto &blocks = chunk_it->blocks_;
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
@@ -31,7 +31,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
public:
AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
-size_t chunk_size = 0);
+size_t chunk_size = 0, bool allow_free_idle_chunk = true);

bool IsAllocThreadSafe() const override { return true; }

@@ -86,6 +86,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
std::list<Chunk> chunks_;
size_t alignment_;
size_t chunk_size_;
bool allow_free_idle_chunk_;

SpinLock spinlock_;
};
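Note: allow_free_idle_chunk exists because a captured graph bakes raw device pointers into its nodes; if the auto-growth allocator returned idle chunks to the driver between replays, those pointers would dangle. A pool meant for capture therefore keeps its chunks, e.g. (a sketch mirroring InitAutoGrowthCUDAAllocator above, with device 0 chosen arbitrarily):

auto cuda_allocator = std::make_shared<CUDAAllocator>(platform::CUDAPlace(0));
auto pool = std::make_shared<AutoGrowthBestFitAllocator>(
    cuda_allocator, platform::GpuMinChunkSize(),
    /*chunk_size=*/0, /*allow_free_idle_chunk=*/false);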
5 changes: 5 additions & 0 deletions paddle/fluid/platform/CMakeLists.txt
@@ -59,9 +59,14 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

IF(WITH_GPU)
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph)
ELSE()
cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade)
ENDIF()

IF(WITH_ROCM)
hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
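Note: building cuda_graph_with_memory_pool in both branches lets callers include a single header whether or not WITH_GPU is set. A hypothetical sketch of that conditional-compilation pattern (the helper name is illustrative; only CUDAGraph::IsCapturing() appears in this diff):

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif

// Same symbol in every build; only GPU builds can ever report a live capture.
inline bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
  return paddle::platform::CUDAGraph::IsCapturing();
#else
  return false;
#endif
}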