Add basic support for CUDA Graph #36190

Merged: 7 commits, Sep 29, 2021
6 changes: 5 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
@@ -82,7 +82,11 @@ endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy )
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
+
+if (WITH_GPU)
+  target_link_libraries(allocator_facade cuda_graph)
+endif()

cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
if (WITH_TESTING)
147 changes: 137 additions & 10 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -32,6 +32,9 @@
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu/xpu_info.h"
#endif
@@ -47,17 +50,64 @@ PADDLE_DEFINE_EXPORTED_bool(
"Whether to use system allocator to allocate CPU and GPU memory. "
"Only used for unittests.");

DECLARE_string(allocator_strategy);

namespace paddle {
namespace memory {
namespace allocation {

#ifdef PADDLE_WITH_CUDA
class CUDAGraphAllocator
: public Allocator,
public std::enable_shared_from_this<CUDAGraphAllocator> {
private:
class PrivateAllocation : public Allocation {
public:
PrivateAllocation(CUDAGraphAllocator* allocator,
AllocationPtr underlying_allocation)
: Allocation(underlying_allocation->ptr(),
underlying_allocation->size(),
underlying_allocation->place()),
allocator_(allocator->shared_from_this()),
underlying_allocation_(std::move(underlying_allocation)) {}

private:
std::shared_ptr<Allocator> allocator_;
AllocationPtr underlying_allocation_;
};

explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
: underlying_allocator_(allocator) {}

public:
static std::shared_ptr<Allocator> Create(
const std::shared_ptr<Allocator>& allocator) {
return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
}

protected:
Allocation* AllocateImpl(size_t size) {
VLOG(10) << "Allocate " << size << " for CUDA Graph";
return new PrivateAllocation(this, underlying_allocator_->Allocate(size));
}

void FreeImpl(Allocation* allocation) {
VLOG(10) << "delete for CUDA Graph";
delete allocation;
}

private:
std::shared_ptr<Allocator> underlying_allocator_;
};
#endif
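Note: CUDAGraphAllocator is a thin decorator over each place's normal allocator. Every PrivateAllocation keeps a shared_ptr obtained through shared_from_this(), so the graph's memory pool cannot be destroyed while any block handed out during capture is still alive. A minimal sketch of that pinning pattern in plain C++ (illustrative only, not Paddle's Allocator API):

#include <cstddef>
#include <memory>

// A block owns a reference back to its allocator, so the allocator (and the
// pool behind it) outlives every block it produced.
struct Block {
  void* ptr;
  std::shared_ptr<void> pool;  // pins the producing allocator
};

class RetainingAllocator
    : public std::enable_shared_from_this<RetainingAllocator> {
 public:
  Block Allocate(std::size_t size) {
    // shared_from_this() requires that this object is already owned by a
    // shared_ptr, e.g. auto a = std::make_shared<RetainingAllocator>();
    return Block{::operator new(size), shared_from_this()};
  }
  void Free(Block* b) { ::operator delete(b->ptr); }
};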

class AllocatorFacadePrivate {
public:
using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

-AllocatorFacadePrivate() {
-auto strategy = GetAllocatorStrategy();
-switch (strategy) {
+explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
+strategy_ = GetAllocatorStrategy();
+switch (strategy_) {
case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
@@ -91,7 +141,8 @@ class AllocatorFacadePrivate {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
-InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id));
+InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
+allow_free_idle_chunk);
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
@@ -117,7 +168,7 @@

default: {
PADDLE_THROW(platform::errors::InvalidArgument(
-"Unsupported allocator strategy: %d", static_cast<int>(strategy)));
+"Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
}
}
InitZeroSizeAllocators();
@@ -130,11 +181,29 @@
CheckAllocThreadSafe();
}

inline const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_allocator_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
return iter->second->allocators_;
} else {
return allocators_;
}
#else
return allocators_;
#endif
}
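Note: GetAllocatorMap() is the central dispatch point of this PR: while a capture is in flight, every allocation request is transparently redirected to the pool registered under the capturing graph's ID, so op kernels need no changes. A usage sketch of the surrounding Prepare/Remove hooks follows at the end of this file.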

inline const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, size_t size) {
const auto& allocators =
(size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
-: allocators_)
+: GetAllocatorMap())
: zero_size_allocators_);
auto iter = allocators.find(place);
PADDLE_ENFORCE_NE(iter, allocators.end(),
@@ -145,6 +214,7 @@

private:
void InitSystemAllocators() {
if (!system_allocators_.empty()) return;
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
@@ -183,10 +253,11 @@
allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
}

-void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) {
+void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
+bool allow_free_idle_chunk) {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-cuda_allocator, platform::GpuMinChunkSize());
+cuda_allocator, platform::GpuMinChunkSize(), /*chunk_size=*/0,
+allow_free_idle_chunk);
}
#endif

@@ -226,6 +297,7 @@
};

void InitZeroSizeAllocators() {
if (!zero_size_allocators_.empty()) return;
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -279,12 +351,57 @@
}
}

#ifdef PADDLE_WITH_CUDA

public:
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when the "
"FLAGS_allocator_strategy=\"auto_growth\", but got "
"FLAGS_allocator_strategy=\"%s\"",
FLAGS_allocator_strategy));
auto& allocator = cuda_graph_allocator_map_[id];
PADDLE_ENFORCE_EQ(
allocator.get(), nullptr,
platform::errors::InvalidArgument(
"The memory pool of the CUDA Graph with ID %d has already been prepared.",
id));
allocator.reset(
new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
for (auto& item : allocator->allocators_) {
auto& old_allocator = item.second;
old_allocator = CUDAGraphAllocator::Create(old_allocator);
}
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}

void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
platform::errors::InvalidArgument(
"Cannot find CUDA Graph with ID = %d", id));
cuda_graph_allocator_map_.erase(iter);
VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif

private:
AllocatorMap allocators_;
-AllocatorMap zero_size_allocators_;
-AllocatorMap system_allocators_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_allocator_map_;
#endif
AllocatorStrategy strategy_;

+static AllocatorMap zero_size_allocators_;
+static AllocatorMap system_allocators_;
};

AllocatorFacadePrivate::AllocatorMap
AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;

// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// delete m_ may cause core dump when the destructor of python in conflict with
@@ -316,6 +433,16 @@
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
return m_->PrepareMemoryPoolForCUDAGraph(id);
}

void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
return m_->RemoveMemoryPoolOfCUDAGraph(id);
}
#endif

} // namespace allocation
} // namespace memory
} // namespace paddle
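Note: a minimal sketch of how the two new hooks bracket a capture. It assumes the facade's singleton accessor AllocatorFacade::Instance() and a CUDAGraphID obtained from the capture API in paddle/fluid/platform/cuda_graph.h; neither appears in this excerpt, and the capture/replay steps are elided:

#include "paddle/fluid/memory/allocation/allocator_facade.h"

// Hypothetical flow: give the graph a private memory pool before capture
// starts, and drop that pool only once the graph itself has been released.
void RunWithPrivatePool(CUDAGraphID id) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  facade.PrepareMemoryPoolForCUDAGraph(id);  // capture-time allocations now
                                             // come from a dedicated pool
  // ... begin stream capture, run the region, end capture, replay ...
  facade.RemoveMemoryPoolOfCUDAGraph(id);    // after the graph is destroyed
}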
8 changes: 8 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.h
@@ -18,6 +18,9 @@
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif
#include "paddle/fluid/platform/place.h"

namespace paddle {
@@ -54,6 +57,11 @@ class AllocatorFacade {
uint64_t Release(const platform::Place& place);
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

#ifdef PADDLE_WITH_CUDA
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
#endif

// TODO(yy): Allocate a Copy-On-Write allocation?
private:
AllocatorFacade();
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -39,11 +39,12 @@ namespace allocation {

AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
-size_t chunk_size)
+size_t chunk_size, bool allow_free_idle_chunk)
: underlying_allocator_(
std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
alignment_(alignment),
-chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)) {}
+chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
+allow_free_idle_chunk_(allow_free_idle_chunk) {}

Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
size = AlignedSize(size, alignment_);
@@ -139,6 +140,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
}

uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
if (!allow_free_idle_chunk_) {
return 0;
}
uint64_t bytes = 0;
for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) {
auto &blocks = chunk_it->blocks_;
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
@@ -31,7 +31,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
public:
AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
-size_t chunk_size = 0);
+size_t chunk_size = 0, bool allow_free_idle_chunk = true);

bool IsAllocThreadSafe() const override { return true; }

@@ -86,6 +86,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
std::list<Chunk> chunks_;
size_t alignment_;
size_t chunk_size_;
bool allow_free_idle_chunk_;

SpinLock spinlock_;
};
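Note: allow_free_idle_chunk exists because a captured graph bakes raw device pointers into its nodes; if the auto-growth allocator returned idle chunks to the driver between replays, those pointers would dangle. A pool meant for capture therefore keeps its chunks, e.g. (a sketch mirroring InitAutoGrowthCUDAAllocator above, with device 0 chosen arbitrarily):

auto cuda_allocator = std::make_shared<CUDAAllocator>(platform::CUDAPlace(0));
auto pool = std::make_shared<AutoGrowthBestFitAllocator>(
    cuda_allocator, platform::GpuMinChunkSize(),
    /*chunk_size=*/0, /*allow_free_idle_chunk=*/false);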
5 changes: 5 additions & 0 deletions paddle/fluid/platform/CMakeLists.txt
@@ -59,9 +59,14 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

IF(WITH_GPU)
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph)
ELSE()
cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade)
ENDIF()

IF(WITH_ROCM)
hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
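Note: building cuda_graph_with_memory_pool in both branches lets callers include a single header whether or not WITH_GPU is set. A hypothetical sketch of that conditional-compilation pattern (the helper name is illustrative; only CUDAGraph::IsCapturing() appears in this diff):

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif

// Same symbol in every build; only GPU builds can ever report a live capture.
inline bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
  return paddle::platform::CUDAGraph::IsCapturing();
#else
  return false;
#endif
}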