move device_memory_aligment from fluid to phi
huangjiyi committed Dec 4, 2022
1 parent 2a8fc38 commit b92fcd3
Showing 22 changed files with 158 additions and 140 deletions.
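The commit retires paddle/fluid/platform/device_memory_aligment.h and routes all call sites to a new paddle/phi/backends/device_memory_aligment.h, renaming the helper from platform::Alignment to phi::Alignment. The new phi header itself is not among the hunks rendered below; as orientation, here is a hypothetical sketch of what it presumably provides, inferred only from the call sites in this diff and from the min-chunk-size helpers that move alongside it (a sketch under those assumptions, not the committed file):

// Sketch only; names and structure are inferred from the call sites below.
#pragma once

#include <cstddef>

#include "paddle/phi/backends/cpu/cpu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/backends/gpu/gpu_info.h"
#endif
#include "paddle/phi/common/place.h"

namespace phi {

// Round `size` bytes up to the device's minimum chunk size (or to an explicit
// `align_size` when the caller passes one), as used by coalesce_tensor and the
// fused all-reduce op handle in this commit.
inline size_t Alignment(size_t size, const phi::Place &place, int align_size = -1) {
  size_t alignment;
  if (align_size > 0) {
    alignment = static_cast<size_t>(align_size);
  } else if (place.GetType() == phi::AllocationType::CPU) {
    alignment = phi::backends::cpu::CpuMinChunkSize();  // 4 KB
  } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    alignment = phi::backends::gpu::GpuMinChunkSize();  // 256 bytes
#else
    alignment = phi::backends::cpu::CpuMinChunkSize();
#endif
  }
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

}  // namespace phi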
12 changes: 3 additions & 9 deletions paddle/fluid/framework/details/CMakeLists.txt
@@ -92,8 +92,7 @@ if(WITH_GPU)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   nv_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -105,7 +104,6 @@ if(WITH_GPU)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)

@@ -170,8 +168,7 @@ elseif(WITH_ROCM)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   hip_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -183,7 +180,6 @@ elseif(WITH_ROCM)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)

@@ -233,8 +229,7 @@ else()
       ddim
       memory
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   cc_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -245,7 +240,6 @@ else()
       memory
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
   if(WITH_DISTRIBUTE)
7 changes: 3 additions & 4 deletions paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -16,9 +16,9 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/device_memory_aligment.h"

 DEFINE_bool(skip_fused_all_reduce_check, false, "");
 DECLARE_bool(allreduce_record_one_event);
@@ -247,7 +247,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
     for (size_t k = 1; k < g_tensor.size(); ++k) {
       const void *cur_address = g_tensor.at(k - 1).second->data();
       int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
+      auto offset = phi::Alignment(len * size_of_dtype, places_[0]);
       void *infer_next_address = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(cur_address) + offset);
       const void *next_address = g_tensor.at(k).second->data();
@@ -400,8 +400,7 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
             "The size of grad tensors of fused_all_reduce_op_handle "
             "must be > 0, but got %d.",
             len));
-    *numel +=
-        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel += phi::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }

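To make the accumulation in this hunk concrete, here is a small standalone illustration with hypothetical numbers (not taken from the commit), assuming a CPU place whose minimum chunk size is 4 KB as defined in the phi cpu_info.h change further down:

// Hypothetical illustration only: how an aligned byte count feeds back into an
// element count, mirroring `*numel += phi::Alignment(...) / size_of_dtype`.
#include <cstddef>

constexpr size_t kCpuMinChunkSize = 1 << 12;  // 4 KB, see phi CpuMinChunkSize()

// Round a byte count up to the next multiple of `alignment`.
constexpr size_t AlignUp(size_t bytes, size_t alignment) {
  return (bytes + alignment - 1) / alignment * alignment;
}

// A float32 gradient with 1000 elements occupies 4000 bytes, which is padded
// to 4096 bytes; its contribution to *numel is therefore 1024 elements.
static_assert(AlignUp(1000 * sizeof(float), kCpuMinChunkSize) == 4096, "");
static_assert(AlignUp(1000 * sizeof(float), kCpuMinChunkSize) / sizeof(float) == 1024, "");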
3 changes: 1 addition & 2 deletions paddle/fluid/operators/CMakeLists.txt
@@ -153,7 +153,7 @@ cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_

 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
     lod_tensor maxouting unpooling pooling lod_rank_table context_project
-    sequence_pooling executor device_memory_aligment generator)
+    sequence_pooling executor generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve)
@@ -167,7 +167,6 @@ if(WITH_XPU)
   cc_test(beam_search_decode_op_xpu_test SRCS beam_search_decode_op_xpu_test.cc DEPS lod_tensor)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib)
 endif()
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry)
28 changes: 14 additions & 14 deletions paddle/fluid/operators/coalesce_tensor_op.cc
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
@@ -250,9 +250,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       framework::TensorCopy(
           *in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);

-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -274,9 +274,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(
             *out_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
       }
-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -296,7 +296,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
             static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
             .Resize(dim);
     len = use_align
-              ? platform::Alignment(
+              ? phi::Alignment(
                     len * size_of_dtype, context.GetPlace(), align_size) /
                     size_of_dtype
               : len;
@@ -342,12 +342,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         0,
         platform::errors::InvalidArgument(
             "The number of tensor `%s`'s elements is 0.", var_names[i]));
-    auto len = use_align ? platform::Alignment(
-                               static_cast<size_t>(size) * size_of_dtype,
-                               place,
-                               align_size) /
-                               size_of_dtype
-                         : static_cast<size_t>(size);
+    auto len = use_align
+                   ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place,
+                                    align_size) /
+                         size_of_dtype
+                   : static_cast<size_t>(size);
     const void *ptr =
         lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
     VLOG(4) << size << " " << len;
12 changes: 0 additions & 12 deletions paddle/fluid/platform/CMakeLists.txt
@@ -378,10 +378,6 @@ if(WITH_GPU)
       stats
       op_proto_maker
       shape_inference)
-  nv_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 elseif(WITH_ROCM)
   hip_library(
     profiler
@@ -394,10 +390,6 @@ elseif(WITH_ROCM)
       stats
       op_proto_maker
       shape_inference)
-  hip_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 else()
   cc_library(
     profiler
@@ -409,10 +401,6 @@ else()
       stats
       op_proto_maker
       shape_inference)
-  cc_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info place)
 endif()

 cc_test(
5 changes: 0 additions & 5 deletions paddle/fluid/platform/cpu_info.cc
@@ -79,11 +79,6 @@ size_t CpuMaxAllocSize() {
   return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }

-size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  return 1 << 12;
-}
-
 size_t CpuMaxChunkSize() {
   // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
   // or the initial_cpu_memory_in_mb.
3 changes: 1 addition & 2 deletions paddle/fluid/platform/cpu_info.h
@@ -63,8 +63,7 @@ size_t CpuMaxAllocSize();
 //! Get the maximum allocation size for a machine.
 size_t CUDAPinnedMaxAllocSize();

-//! Get the minimum chunk size for buddy allocator.
-size_t CpuMinChunkSize();
+using phi::backends::cpu::CpuMinChunkSize;

 //! Get the maximum chunk size for buddy allocator.
 size_t CpuMaxChunkSize();
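The `using` alias above (and the matching aliases added for the GPU, MLU, and NPU headers further down) keeps existing fluid call sites compiling against the phi implementation. A minimal sketch of a hypothetical caller, assuming the alias sits inside namespace paddle::platform like the surrounding declarations:

// Hypothetical fluid-side caller; nothing here changes with this commit because
// paddle/fluid/platform/cpu_info.h now re-exports the phi function via
// `using phi::backends::cpu::CpuMinChunkSize;`.
#include <cstddef>

#include "paddle/fluid/platform/cpu_info.h"

size_t MinBuddyChunkBytes() {
  // Resolves to phi::backends::cpu::CpuMinChunkSize(), i.e. 1 << 12 bytes.
  return paddle::platform::CpuMinChunkSize();
}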
9 changes: 2 additions & 7 deletions paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -124,11 +124,6 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }

 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }

-size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
@@ -410,8 +405,8 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id) {  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id) {
   return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
       handle, size, prop, flags);
 }
8 changes: 4 additions & 4 deletions paddle/fluid/platform/device/gpu/gpu_info.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <vector>

 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"

 namespace paddle {
 namespace platform {
@@ -81,8 +82,7 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();

-//! Get the minimum chunk size for GPU buddy allocator.
-size_t GpuMinChunkSize();
+using phi::backends::gpu::GpuMinChunkSize;

 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
@@ -140,8 +140,8 @@ gpuError_t GpuGetLastError();
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id);  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id);

 //! cuMemRelease with recorded info
 CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
5 changes: 0 additions & 5 deletions paddle/fluid/platform/device/mlu/mlu_info.cc
@@ -226,11 +226,6 @@ size_t MLUInitAllocSize() { return MLUAllocSize(/* realloc = */ false); }

 size_t MLUReallocSize() { return MLUAllocSize(/* realloc = */ true); }

-size_t MLUMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t MLUMaxChunkSize() {
   size_t max_chunk_size = MLUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
4 changes: 2 additions & 2 deletions paddle/fluid/platform/device/mlu/mlu_info.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <cncl.h>
 #endif
 #include <vector>
+#include "paddle/phi/backends/mlu/mlu_info.h"

 namespace paddle {

@@ -89,8 +90,7 @@ size_t MLUInitAllocSize();
 //! Get the re-allocation size of current MLU device.
 size_t MLUReallocSize();

-//! Get the minimum chunk size for MLU buddy allocator.
-size_t MLUMinChunkSize();
+using phi::backends::mlu::MLUMinChunkSize;

 //! Get the maximum chunk size for MLU buddy allocator.
 size_t MLUMaxChunkSize();
8 changes: 0 additions & 8 deletions paddle/fluid/platform/device/npu/npu_info.cc
@@ -179,14 +179,6 @@ size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }

 size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }

-size_t NPUMinChunkSize() {
-  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
-  // though no document specify that explicitly.
-  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
-  // details.
-  return 1 << 9;
-}
-
 size_t NPUMaxChunkSize() {
   size_t max_chunk_size = NPUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
4 changes: 2 additions & 2 deletions paddle/fluid/platform/device/npu/npu_info.h
@@ -22,6 +22,7 @@ limitations under the License. */

 #include "acl/acl.h"
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/phi/backends/npu/npu_info.h"

 namespace paddle {
 namespace platform {
@@ -69,8 +70,7 @@ size_t NPUInitAllocSize();
 //! Get the re-allocation size of current NPU device.
 size_t NPUReallocSize();

-//! Get the minimum chunk size for NPU buddy allocator.
-size_t NPUMinChunkSize();
+using phi::backends::npu::NPUMinChunkSize;

 //! Get the maximum chunk size for NPU buddy allocator.
 size_t NPUMaxChunkSize();
34 changes: 0 additions & 34 deletions paddle/fluid/platform/device_memory_aligment.h

This file was deleted.

8 changes: 8 additions & 0 deletions paddle/phi/backends/cpu/cpu_info.h
@@ -39,6 +39,13 @@
 namespace phi {
 namespace backends {
 namespace cpu {
+
+//! Get the minimum chunk size for buddy allocator.
+inline size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
 typedef enum {
   isa_any,
   sse42,
@@ -51,6 +58,7 @@ typedef enum {
   avx512_mic_4ops,
   avx512_bf16,
 } cpu_isa_t;  // Instruction set architecture
+
 }  // namespace cpu
 }  // namespace backends
 }  // namespace phi
(The remaining changed files are not shown here.)
