move device_memory_aligment from fluid to phi
huangjiyi committed Dec 4, 2022
1 parent 2a8fc38 commit b92fcd3
Showing 22 changed files with 158 additions and 140 deletions.
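The commit retires paddle/fluid/platform/device_memory_aligment.h and routes all call sites to a new paddle/phi/backends/device_memory_aligment.h, renaming the helper from platform::Alignment to phi::Alignment. The new phi header itself is not among the hunks rendered below; as orientation, here is a hypothetical sketch of what it presumably provides, inferred only from the call sites in this diff and from the min-chunk-size helpers that move alongside it (a sketch under those assumptions, not the committed file):

// Sketch only; names and structure are inferred from the call sites below.
#pragma once

#include <cstddef>

#include "paddle/phi/backends/cpu/cpu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/backends/gpu/gpu_info.h"
#endif
#include "paddle/phi/common/place.h"

namespace phi {

// Round `size` bytes up to the device's minimum chunk size (or to an explicit
// `align_size` when the caller passes one), as used by coalesce_tensor and the
// fused all-reduce op handle in this commit.
inline size_t Alignment(size_t size, const phi::Place &place, int align_size = -1) {
  size_t alignment;
  if (align_size > 0) {
    alignment = static_cast<size_t>(align_size);
  } else if (place.GetType() == phi::AllocationType::CPU) {
    alignment = phi::backends::cpu::CpuMinChunkSize();  // 4 KB
  } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    alignment = phi::backends::gpu::GpuMinChunkSize();  // 256 bytes
#else
    alignment = phi::backends::cpu::CpuMinChunkSize();
#endif
  }
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

}  // namespace phi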
12 changes: 3 additions & 9 deletions paddle/fluid/framework/details/CMakeLists.txt
@@ -92,8 +92,7 @@ if(WITH_GPU)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   nv_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -105,7 +104,6 @@ if(WITH_GPU)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)

@@ -170,8 +168,7 @@ elseif(WITH_ROCM)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   hip_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -183,7 +180,6 @@ elseif(WITH_ROCM)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)

@@ -233,8 +229,7 @@ else()
       ddim
       memory
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   cc_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -245,7 +240,6 @@ else()
       memory
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
   if(WITH_DISTRIBUTE)
7 changes: 3 additions & 4 deletions paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -16,9 +16,9 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/device_memory_aligment.h"

 DEFINE_bool(skip_fused_all_reduce_check, false, "");
 DECLARE_bool(allreduce_record_one_event);
@@ -247,7 +247,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
     for (size_t k = 1; k < g_tensor.size(); ++k) {
       const void *cur_address = g_tensor.at(k - 1).second->data();
       int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
+      auto offset = phi::Alignment(len * size_of_dtype, places_[0]);
       void *infer_next_address = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(cur_address) + offset);
       const void *next_address = g_tensor.at(k).second->data();
@@ -400,8 +400,7 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
             "The size of grad tensors of fused_all_reduce_op_handle "
             "must be > 0, but got %d.",
             len));
-    *numel +=
-        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel += phi::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }

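To make the accumulation in this hunk concrete, here is a small standalone illustration with hypothetical numbers (not taken from the commit), assuming a CPU place whose minimum chunk size is 4 KB as defined in the phi cpu_info.h change further down:

// Hypothetical illustration only: how an aligned byte count feeds back into an
// element count, mirroring `*numel += phi::Alignment(...) / size_of_dtype`.
#include <cstddef>

constexpr size_t kCpuMinChunkSize = 1 << 12;  // 4 KB, see phi CpuMinChunkSize()

// Round a byte count up to the next multiple of `alignment`.
constexpr size_t AlignUp(size_t bytes, size_t alignment) {
  return (bytes + alignment - 1) / alignment * alignment;
}

// A float32 gradient with 1000 elements occupies 4000 bytes, which is padded
// to 4096 bytes; its contribution to *numel is therefore 1024 elements.
static_assert(AlignUp(1000 * sizeof(float), kCpuMinChunkSize) == 4096, "");
static_assert(AlignUp(1000 * sizeof(float), kCpuMinChunkSize) / sizeof(float) == 1024, "");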
3 changes: 1 addition & 2 deletions paddle/fluid/operators/CMakeLists.txt
@@ -153,7 +153,7 @@ cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_

 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
     lod_tensor maxouting unpooling pooling lod_rank_table context_project
-    sequence_pooling executor device_memory_aligment generator)
+    sequence_pooling executor generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve)
@@ -167,7 +167,6 @@ if(WITH_XPU)
   cc_test(beam_search_decode_op_xpu_test SRCS beam_search_decode_op_xpu_test.cc DEPS lod_tensor)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib)
 endif()
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry)
28 changes: 14 additions & 14 deletions paddle/fluid/operators/coalesce_tensor_op.cc
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
@@ -250,9 +250,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       framework::TensorCopy(
           *in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);

-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -274,9 +274,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(
             *out_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
       }
-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -296,7 +296,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
             static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
             .Resize(dim);
     len = use_align
-              ? platform::Alignment(
+              ? phi::Alignment(
                     len * size_of_dtype, context.GetPlace(), align_size) /
                     size_of_dtype
               : len;
@@ -342,12 +342,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         0,
         platform::errors::InvalidArgument(
             "The number of tensor `%s`'s elements is 0.", var_names[i]));
-    auto len = use_align ? platform::Alignment(
-                               static_cast<size_t>(size) * size_of_dtype,
-                               place,
-                               align_size) /
-                               size_of_dtype
-                         : static_cast<size_t>(size);
+    auto len = use_align
+                   ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place,
+                                    align_size) /
+                         size_of_dtype
+                   : static_cast<size_t>(size);
     const void *ptr =
         lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
     VLOG(4) << size << " " << len;
12 changes: 0 additions & 12 deletions paddle/fluid/platform/CMakeLists.txt
@@ -378,10 +378,6 @@ if(WITH_GPU)
       stats
       op_proto_maker
       shape_inference)
-  nv_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 elseif(WITH_ROCM)
   hip_library(
     profiler
@@ -394,10 +390,6 @@ elseif(WITH_ROCM)
       stats
       op_proto_maker
       shape_inference)
-  hip_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 else()
   cc_library(
     profiler
@@ -409,10 +401,6 @@ else()
       stats
       op_proto_maker
       shape_inference)
-  cc_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info place)
 endif()

 cc_test(
5 changes: 0 additions & 5 deletions paddle/fluid/platform/cpu_info.cc
@@ -79,11 +79,6 @@ size_t CpuMaxAllocSize() {
   return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }

-size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  return 1 << 12;
-}
-
 size_t CpuMaxChunkSize() {
   // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
   // or the initial_cpu_memory_in_mb.
3 changes: 1 addition & 2 deletions paddle/fluid/platform/cpu_info.h
@@ -63,8 +63,7 @@ size_t CpuMaxAllocSize();
 //! Get the maximum allocation size for a machine.
 size_t CUDAPinnedMaxAllocSize();

-//! Get the minimum chunk size for buddy allocator.
-size_t CpuMinChunkSize();
+using phi::backends::cpu::CpuMinChunkSize;

 //! Get the maximum chunk size for buddy allocator.
 size_t CpuMaxChunkSize();
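The `using` alias above (and the matching aliases added for the GPU, MLU, and NPU headers further down) keeps existing fluid call sites compiling against the phi implementation. A minimal sketch of a hypothetical caller, assuming the alias sits inside namespace paddle::platform like the surrounding declarations:

// Hypothetical fluid-side caller; nothing here changes with this commit because
// paddle/fluid/platform/cpu_info.h now re-exports the phi function via
// `using phi::backends::cpu::CpuMinChunkSize;`.
#include <cstddef>

#include "paddle/fluid/platform/cpu_info.h"

size_t MinBuddyChunkBytes() {
  // Resolves to phi::backends::cpu::CpuMinChunkSize(), i.e. 1 << 12 bytes.
  return paddle::platform::CpuMinChunkSize();
}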
9 changes: 2 additions & 7 deletions paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -124,11 +124,6 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }

 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }

-size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
@@ -410,8 +405,8 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id) {  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id) {
   return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
       handle, size, prop, flags);
 }
8 changes: 4 additions & 4 deletions paddle/fluid/platform/device/gpu/gpu_info.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <vector>

 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"

 namespace paddle {
 namespace platform {
@@ -81,8 +82,7 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();

-//! Get the minimum chunk size for GPU buddy allocator.
-size_t GpuMinChunkSize();
+using phi::backends::gpu::GpuMinChunkSize;

 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
@@ -140,8 +140,8 @@ gpuError_t GpuGetLastError();
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id);  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id);

 //! cuMemRelease with recorded info
 CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
5 changes: 0 additions & 5 deletions paddle/fluid/platform/device/mlu/mlu_info.cc
@@ -226,11 +226,6 @@ size_t MLUInitAllocSize() { return MLUAllocSize(/* realloc = */ false); }

 size_t MLUReallocSize() { return MLUAllocSize(/* realloc = */ true); }

-size_t MLUMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t MLUMaxChunkSize() {
   size_t max_chunk_size = MLUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
4 changes: 2 additions & 2 deletions paddle/fluid/platform/device/mlu/mlu_info.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <cncl.h>
 #endif
 #include <vector>
+#include "paddle/phi/backends/mlu/mlu_info.h"

 namespace paddle {

@@ -89,8 +90,7 @@ size_t MLUInitAllocSize();
 //! Get the re-allocation size of current MLU device.
 size_t MLUReallocSize();

-//! Get the minimum chunk size for MLU buddy allocator.
-size_t MLUMinChunkSize();
+using phi::backends::mlu::MLUMinChunkSize;

 //! Get the maximum chunk size for MLU buddy allocator.
 size_t MLUMaxChunkSize();
8 changes: 0 additions & 8 deletions paddle/fluid/platform/device/npu/npu_info.cc
@@ -179,14 +179,6 @@ size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }

 size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }

-size_t NPUMinChunkSize() {
-  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
-  // though no document specify that explicitly.
-  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
-  // details.
-  return 1 << 9;
-}
-
 size_t NPUMaxChunkSize() {
   size_t max_chunk_size = NPUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
4 changes: 2 additions & 2 deletions paddle/fluid/platform/device/npu/npu_info.h
@@ -22,6 +22,7 @@ limitations under the License. */

 #include "acl/acl.h"
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/phi/backends/npu/npu_info.h"

 namespace paddle {
 namespace platform {
@@ -69,8 +70,7 @@ size_t NPUInitAllocSize();
 //! Get the re-allocation size of current NPU device.
 size_t NPUReallocSize();

-//! Get the minimum chunk size for NPU buddy allocator.
-size_t NPUMinChunkSize();
+using phi::backends::npu::NPUMinChunkSize;

 //! Get the maximum chunk size for NPU buddy allocator.
 size_t NPUMaxChunkSize();
34 changes: 0 additions & 34 deletions paddle/fluid/platform/device_memory_aligment.h

This file was deleted.

8 changes: 8 additions & 0 deletions paddle/phi/backends/cpu/cpu_info.h
@@ -39,6 +39,13 @@
 namespace phi {
 namespace backends {
 namespace cpu {
+
+//! Get the minimum chunk size for buddy allocator.
+inline size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
 typedef enum {
   isa_any,
   sse42,
@@ -51,6 +58,7 @@ typedef enum {
   avx512_mic_4ops,
   avx512_bf16,
 } cpu_isa_t;  // Instruction set architecture
+
 }  // namespace cpu
 }  // namespace backends
 }  // namespace phi
(The remaining changed files are not shown here.)
