From 28133ae7d0e6e176a4730142e477f84b3d93c3c9 Mon Sep 17 00:00:00 2001 From: danleifeng Date: Fri, 1 Jul 2022 16:03:38 +0800 Subject: [PATCH 01/12] [gpups]refine adam aceessor;test=develop --- .../framework/fleet/heter_ps/feature_value.h | 208 ++++++++++++++++++ .../fleet/heter_ps/hashtable_kernel.cu | 26 +-- .../fleet/heter_ps/heter_comm_kernel.cu | 41 +--- .../fleet/heter_ps/heter_comm_kernel.h | 45 +--- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 61 +---- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 21 +- 6 files changed, 219 insertions(+), 183 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 221915fc713a82..2486f34cc0e256 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" namespace paddle { @@ -185,6 +186,35 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { } }; + struct CommonPullValue { + /* + float show; + float click; + float embed_w; + std::vector embedx_w; + */ + + __host__ __device__ static int Dim(int embedx_dim) { return 3 + embedx_dim; } + __host__ __device__ int DimSize(size_t dim) { return sizeof(float); } + __host__ __device__ int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + __host__ __device__ int ShowIndex() { return 0; } + __host__ __device__ int ClickIndex() { return 1; } + __host__ __device__ int EmbedWIndex() { return 2; } + __host__ __device__ int EmbedxWIndex() { return 3; } + __host__ __device__ float& Show(float* val) { + return val[CommonPullValue::ShowIndex()]; + } + __host__ __device__ float& Click(float* val) { + return val[CommonPullValue::ClickIndex()]; + } + __host__ __device__ float& EmbedW(float* val) { + return val[CommonPullValue::EmbedWIndex()]; + } + __host__ __device__ float* EmbedxW(float* val) { + return val + CommonPullValue::EmbedxWIndex(); + } + }; + __host__ __device__ CommonFeatureValueAccessor() {} __host__ __device__ ~CommonFeatureValueAccessor() {} @@ -212,6 +242,183 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { return 0; } + +// build阶段从cpu_val赋值给gpu_val +__host__ __device__ void BuildFill(float* gpu_val, + float* cpu_val, + paddle::distributed::CtrDymfAccessor* cpu_table_accessor, + int mf_dim, + size_t cpu_fv_dim) { + + gpu_val[common_feature_value.DeltaScoreIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.DeltaScoreIndex()]; + gpu_val[common_feature_value.ShowIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.ShowIndex()]; + gpu_val[common_feature_value.ClickIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.ClickIndex()]; + gpu_val[common_feature_value.SlotIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.SlotIndex()]; + gpu_val[common_feature_value.EmbedWIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + gpu_val[common_feature_value.EmbedG2SumIndex() + i] = + cpu_val[cpu_table_accessor->common_feature_value.EmbedG2SumIndex() + i]; + } + + cpu_val[cpu_table_accessor->common_feature_value.MfDimIndex()] = float(mf_dim); + gpu_val[common_feature_value.MfDimIndex()] = mf_dim; + if (cpu_fv_dim > cpu_table_accessor->GetAccessorInfo().dim - + cpu_table_accessor->GetAccessorInfo().mf_size / sizeof(float)) { + 
gpu_val[common_feature_value.MfSizeIndex()] = + common_feature_value.MFSize(mf_dim) / sizeof(float); + + for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); + x++) { + gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = + cpu_val[cpu_table_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + } + } else { + gpu_val[common_feature_value.MfSizeIndex()] = 0; + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++){ + gpu_val[x] = 0; + } + } +} + + +// dump_to_cpu阶段从gpu_val赋值给cpu_val +__host__ __device__ void DumpFill(float* cpu_val, + float* gpu_val, + paddle::distributed::CtrDymfAccessor* cpu_table_accessor, + int mf_dim) { + + cpu_val[cpu_table_accessor->common_feature_value.DeltaScoreIndex()] = + gpu_val[common_feature_value.DeltaScoreIndex()]; + cpu_val[cpu_table_accessor->common_feature_value.ShowIndex()] = + gpu_val[common_feature_value.ShowIndex()]; + cpu_val[cpu_table_accessor->common_feature_value.ClickIndex()] = + gpu_val[common_feature_value.ClickIndex()]; + cpu_val[cpu_table_accessor->common_feature_value.EmbedWIndex()] = + gpu_val[common_feature_value.EmbedWIndex()]; + cpu_val[cpu_table_accessor->common_feature_value.SlotIndex()] = + gpu_val[common_feature_value.SlotIndex()]; + + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + cpu_val[cpu_table_accessor->common_feature_value.EmbedG2SumIndex() + i] = + gpu_val[common_feature_value.EmbedG2SumIndex() + i]; + } + + if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { + + for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); + x++) { + cpu_val[cpu_table_accessor->common_feature_value.EmbedxG2SumIndex() + x] = + gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; + } + } +} + + +// dy_mf_fill_dvals_kernel, dy_mf_search_kernel 阶段 gpukernel 中从src_val赋值给dest_val +__host__ __device__ void FeatureValueFill(float* dest_val, + float* src_val, + int mf_dim) { + *(reinterpret_cast(dest_val + common_feature_value.CpuPtrIndex())) = + *(reinterpret_cast(src_val + common_feature_value.CpuPtrIndex())); + dest_val[common_feature_value.DeltaScoreIndex()] = src_val[common_feature_value.DeltaScoreIndex()]; + dest_val[common_feature_value.ShowIndex()] = src_val[common_feature_value.ShowIndex()]; + dest_val[common_feature_value.ClickIndex()] = src_val[common_feature_value.ClickIndex()]; + dest_val[common_feature_value.EmbedWIndex()] = src_val[common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + dest_val[common_feature_value.EmbedG2SumIndex() + i] = + src_val[common_feature_value.EmbedG2SumIndex() + i]; + } + dest_val[common_feature_value.SlotIndex()] = src_val[common_feature_value.SlotIndex()]; + dest_val[common_feature_value.MfDimIndex()] = mf_dim; + dest_val[common_feature_value.MfSizeIndex()] = src_val[common_feature_value.MfSizeIndex()]; + + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++){ + dest_val[x] = src_val[x]; + } +} + + +// dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel 中从src_val赋值给dest_val +__host__ __device__ void PushValueFill(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = 
src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = src_val[common_push_value.EmbedGIndex()]; + + for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) { + dest_val[common_push_value.EmbedxGIndex() + x] = src_val[common_push_value.EmbedxGIndex() + x]; + } +} + +// update_basic 阶段 gpukernel 中从src_val赋值给dest_val +__host__ __device__ void PushValueFillBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = src_val[common_push_value.EmbedGIndex()]; + +} + + +// merge_one 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val +__host__ __device__ void MergePushValue(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += src_val[common_push_value.EmbedGIndex()]; + for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) { + dest_val[common_push_value.EmbedxGIndex() + j] += src_val[common_push_value.EmbedxGIndex() + j]; + } +} + + +// merge_basic 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val +__host__ __device__ void MergePushValueBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += src_val[common_push_value.EmbedGIndex()]; +} + +// PullCopy 阶段 gpukernel 中 FeatureValue回填到PullValue +__host__ __device__ void Select(float* dest_val, + float* src_val, + uint64_t* key, + int mf_dim) { + if (*key == 0) { + *(dest_val + common_pull_value.ShowIndex()) = 0; + *(dest_val + common_pull_value.ClickIndex()) = 0; + *(dest_val + common_pull_value.EmbedWIndex()) = 0; + } else { + *(dest_val + common_pull_value.ShowIndex()) = src_val[common_feature_value.ShowIndex()]; + *(dest_val + common_pull_value.ClickIndex()) = src_val[common_feature_value.ClickIndex()]; + *(dest_val + common_pull_value.EmbedWIndex()) = src_val[common_feature_value.EmbedWIndex()]; + } + + if (src_val[common_feature_value.MfSizeIndex()] == 0 || *key == 0) { + for (int j = 0; j < mf_dim; j++) { + *(dest_val + common_pull_value.EmbedxWIndex() + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest_val + common_pull_value.EmbedxWIndex() + j) = + src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j]; + } + } +} + + __host__ __device__ std::string ParseToString(const float* v, int param_size) { /* uint64_t cpu_ptr; // 2float @@ -251,6 +458,7 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { public: CommonFeatureValue common_feature_value; CommonPushValue common_push_value; + CommonPullValue common_pull_value; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index c430dfa669c450..a1cdfa660eccff 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -97,31 +97,7 @@ __global__ void 
dy_mf_search_kernel(Table* table, float* input = it->second; int mf_dim = int(input[feature_value_accessor.common_feature_value.MfDimIndex()]); - *(reinterpret_cast(cur + feature_value_accessor.common_feature_value.CpuPtrIndex())) = - *(reinterpret_cast(input + feature_value_accessor.common_feature_value.CpuPtrIndex())); - cur[feature_value_accessor.common_feature_value.DeltaScoreIndex()] = - input[feature_value_accessor.common_feature_value.DeltaScoreIndex()]; - cur[feature_value_accessor.common_feature_value.ShowIndex()] = - input[feature_value_accessor.common_feature_value.ShowIndex()]; - cur[feature_value_accessor.common_feature_value.ClickIndex()] = - input[feature_value_accessor.common_feature_value.ClickIndex()]; - cur[feature_value_accessor.common_feature_value.EmbedWIndex()] = - input[feature_value_accessor.common_feature_value.EmbedWIndex()]; - for (int x = 0; x < feature_value_accessor.common_feature_value.EmbedDim(); x++) { - cur[feature_value_accessor.common_feature_value.EmbedG2SumIndex() + x] = - input[feature_value_accessor.common_feature_value.EmbedG2SumIndex() + x]; - } - cur[feature_value_accessor.common_feature_value.SlotIndex()] = - input[feature_value_accessor.common_feature_value.SlotIndex()]; - cur[feature_value_accessor.common_feature_value.MfDimIndex()] = - input[feature_value_accessor.common_feature_value.MfDimIndex()]; - cur[feature_value_accessor.common_feature_value.MfSizeIndex()] = - input[feature_value_accessor.common_feature_value.MfSizeIndex()]; - - for (int x = feature_value_accessor.common_feature_value.EmbedxG2SumIndex(); - x < int(feature_value_accessor.common_feature_value.Size(mf_dim) / sizeof(float)); x++){ - cur[x] = input[x]; - } + feature_value_accessor.FeatureValueFill(cur, input, mf_dim); } } } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 415865ebba8dde..84946860300abd 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -128,21 +128,7 @@ __global__ void dy_mf_fill_shard_grads_kernel( float* cur = (float*)((char*)d_shard_grads + i * grad_value_size); float* shard_val = (float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); - cur[feature_value_accessor.common_push_value.SlotIndex()] = - shard_val[feature_value_accessor.common_push_value.SlotIndex()]; - cur[feature_value_accessor.common_push_value.ShowIndex()] = - shard_val[feature_value_accessor.common_push_value.ShowIndex()]; - cur[feature_value_accessor.common_push_value.ClickIndex()] = - shard_val[feature_value_accessor.common_push_value.ClickIndex()]; - cur[feature_value_accessor.common_push_value.MfDimIndex()] = - shard_val[feature_value_accessor.common_push_value.MfDimIndex()]; - cur[feature_value_accessor.common_push_value.EmbedGIndex()] = - shard_val[feature_value_accessor.common_push_value.EmbedGIndex()]; - - for (int x = 0; x < int(shard_val[feature_value_accessor.common_push_value.MfDimIndex()]); x++) { - cur[feature_value_accessor.common_push_value.EmbedxGIndex() + x] = - shard_val[feature_value_accessor.common_push_value.EmbedxGIndex() + x]; - } + feature_value_accessor.PushValueFill(cur, shard_val); } } @@ -219,30 +205,7 @@ __global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals, float* d_vals, float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size); int mf_dim = int(shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]); - *(reinterpret_cast(cur + 
feature_value_accessor.common_feature_value.CpuPtrIndex())) = - *(reinterpret_cast(shard_val + feature_value_accessor.common_feature_value.CpuPtrIndex())); - cur[feature_value_accessor.common_feature_value.DeltaScoreIndex()] = - shard_val[feature_value_accessor.common_feature_value.DeltaScoreIndex()]; - cur[feature_value_accessor.common_feature_value.ShowIndex()] = - shard_val[feature_value_accessor.common_feature_value.ShowIndex()]; - cur[feature_value_accessor.common_feature_value.ClickIndex()] = - shard_val[feature_value_accessor.common_feature_value.ClickIndex()]; - cur[feature_value_accessor.common_feature_value.EmbedWIndex()] = - shard_val[feature_value_accessor.common_feature_value.EmbedWIndex()]; - for (int i = 0; i < feature_value_accessor.common_feature_value.EmbedDim(); i++) { - cur[feature_value_accessor.common_feature_value.EmbedG2SumIndex() + i] = - shard_val[feature_value_accessor.common_feature_value.EmbedG2SumIndex() + i]; - } - cur[feature_value_accessor.common_feature_value.SlotIndex()] = - shard_val[feature_value_accessor.common_feature_value.SlotIndex()]; - cur[feature_value_accessor.common_feature_value.MfDimIndex()] = mf_dim; - cur[feature_value_accessor.common_feature_value.MfSizeIndex()] = - shard_val[feature_value_accessor.common_feature_value.MfSizeIndex()]; - - for (int x = feature_value_accessor.common_feature_value.EmbedxG2SumIndex(); - x < int(feature_value_accessor.common_feature_value.Size(mf_dim) / sizeof(float)); x++){ - cur[x] = shard_val[x]; - } + feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim); } } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 930dafc944371e..473b16bbe48ecb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -43,58 +43,23 @@ struct DynamicGradMerger { __device__ __forceinline__ void update_one(float* output, const float* input, CommonFeatureValueAccessor& feature_value_accessor) { - output[feature_value_accessor.common_push_value.SlotIndex()] = - input[feature_value_accessor.common_push_value.SlotIndex()]; - output[feature_value_accessor.common_push_value.ShowIndex()] = - input[feature_value_accessor.common_push_value.ShowIndex()]; - output[feature_value_accessor.common_push_value.ClickIndex()] = - input[feature_value_accessor.common_push_value.ClickIndex()]; - output[feature_value_accessor.common_push_value.MfDimIndex()] = - input[feature_value_accessor.common_push_value.MfDimIndex()]; - output[feature_value_accessor.common_push_value.EmbedGIndex()] = - input[feature_value_accessor.common_push_value.EmbedGIndex()]; - for (int j = 0; j < int(output[feature_value_accessor.common_push_value.MfDimIndex()]); j++) { - output[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = - input[feature_value_accessor.common_push_value.EmbedxGIndex() + j]; - } + feature_value_accessor.PushValueFill(output, input); } __device__ __forceinline__ void merge_one(float* output, const float* input, CommonFeatureValueAccessor& feature_value_accessor) { - output[feature_value_accessor.common_push_value.ShowIndex()] += - input[feature_value_accessor.common_push_value.ShowIndex()]; - output[feature_value_accessor.common_push_value.ClickIndex()] += - input[feature_value_accessor.common_push_value.ClickIndex()]; - output[feature_value_accessor.common_push_value.EmbedGIndex()] += - input[feature_value_accessor.common_push_value.EmbedGIndex()]; - for (int j = 0; j < 
int(output[feature_value_accessor.common_push_value.MfDimIndex()]); j++) { - output[feature_value_accessor.common_push_value.EmbedxGIndex() + j] += - input[feature_value_accessor.common_push_value.EmbedxGIndex() + j]; - } + feature_value_accessor.MergePushValue(output, input); + } __device__ __forceinline__ void update_basic(float* output, const float* input, CommonFeatureValueAccessor& fv_accessor) { - output[fv_accessor.common_push_value.SlotIndex()] = - input[fv_accessor.common_push_value.SlotIndex()]; - output[fv_accessor.common_push_value.ShowIndex()] = - input[fv_accessor.common_push_value.ShowIndex()]; - output[fv_accessor.common_push_value.ClickIndex()] = - input[fv_accessor.common_push_value.ClickIndex()]; - output[fv_accessor.common_push_value.MfDimIndex()] = - input[fv_accessor.common_push_value.MfDimIndex()]; - output[fv_accessor.common_push_value.EmbedGIndex()] = - input[fv_accessor.common_push_value.EmbedGIndex()]; + fv_accessor.PushValueFillBasic(output, input); } __device__ __forceinline__ void merge_basic(float* output, const float* input, CommonFeatureValueAccessor& fv_accessor) { - output[fv_accessor.common_push_value.ShowIndex()] += - input[fv_accessor.common_push_value.ShowIndex()]; - output[fv_accessor.common_push_value.ClickIndex()] += - input[fv_accessor.common_push_value.ClickIndex()]; - output[fv_accessor.common_push_value.EmbedGIndex()] += - input[fv_accessor.common_push_value.EmbedGIndex()]; + fv_accessor.MergePushValueBasic(output, input); } __device__ __forceinline__ void update_embedx(float* output, const float* input, size_t embedx_idx, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index ea97bfe362f0c4..f7f89450158ef5 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -643,42 +643,8 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { #ifdef PADDLE_WITH_PSCORE VLOG(5) << "cpu build "<< k << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) << " |: "<< cpu_table_accessor_->ParseToString(ptr_val, dim); - val[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.DeltaScoreIndex()]; - val[feature_value_accessor_.common_feature_value.ShowIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.ShowIndex()]; - val[feature_value_accessor_.common_feature_value.ClickIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.ClickIndex()]; - val[feature_value_accessor_.common_feature_value.SlotIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.SlotIndex()]; - val[feature_value_accessor_.common_feature_value.EmbedWIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.EmbedWIndex()]; - for (int i = 0; i < feature_value_accessor_.common_feature_value.EmbedDim(); i++) { - val[feature_value_accessor_.common_feature_value.EmbedG2SumIndex() + i] = - ptr_val[cpu_table_accessor_->common_feature_value.EmbedG2SumIndex() + i]; - } - + feature_value_accessor_.BuildFill(val, ptr_val, cpu_table_accessor_, mf_dim, dim); *(reinterpret_cast(val + feature_value_accessor_.common_feature_value.CpuPtrIndex())) = (uint64_t)(device_dim_ptrs[k]); - - ptr_val[cpu_table_accessor_->common_feature_value.MfDimIndex()] = float(mf_dim); - val[feature_value_accessor_.common_feature_value.MfDimIndex()] = mf_dim; - if (dim > cpu_table_accessor_->GetAccessorInfo().dim - - cpu_table_accessor_->GetAccessorInfo().mf_size / sizeof(float)) { - 
val[feature_value_accessor_.common_feature_value.MfSizeIndex()] = - feature_value_accessor_.common_feature_value.MFSize(mf_dim) / sizeof(float); - - for (int x = 0; x < int(feature_value_accessor_.common_feature_value.MFSize(mf_dim) / sizeof(float)); - x++) { - val[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + x] = - ptr_val[cpu_table_accessor_->common_feature_value.EmbedxG2SumIndex() + x]; - } - } else { - val[feature_value_accessor_.common_feature_value.MfSizeIndex()] = 0; - for (int x = feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(); - x < int(feature_value_accessor_.common_feature_value.Size(mf_dim) / sizeof(float)); x++){ - val[x] = 0; - } - } VLOG(5) << "build "<< k << " : "<< feature_value_accessor_.ParseToString(val, feature_value_accessor_.common_feature_value.Dim(mf_dim)); } #endif @@ -883,30 +849,7 @@ void PSGPUWrapper::EndPass() { } float* cpu_val = downpour_value->data(); - cpu_val[cpu_table_accessor_->common_feature_value.DeltaScoreIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.DeltaScoreIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.ShowIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.ShowIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.ClickIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.ClickIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.EmbedWIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.EmbedWIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.SlotIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.SlotIndex()]; - - for (int i = 0; i < feature_value_accessor_.common_feature_value.EmbedDim(); i++) { - cpu_val[cpu_table_accessor_->common_feature_value.EmbedG2SumIndex() + i] = - gpu_val[feature_value_accessor_.common_feature_value.EmbedG2SumIndex() + i]; - } - - if (gpu_val[feature_value_accessor_.common_feature_value.MfSizeIndex()] > 0) { - - for (int x = 0; x < int(feature_value_accessor_.common_feature_value.MFSize(mf_dim) / sizeof(float)); - x++) { - cpu_val[cpu_table_accessor_->common_feature_value.EmbedxG2SumIndex() + x] = - gpu_val[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + x]; - } - } + feature_value_accessor_.DumpFill(cpu_val, gpu_val, cpu_table_accessor_, mf_dim); VLOG(5) << "dump to cpu "<< index << " : "<< feature_value_accessor_.ParseToString(gpu_val, feature_value_accessor_.common_feature_value.Dim(mf_dim)) << " ===== CPU:" << cpu_table_accessor_->ParseToString(cpu_val, downpour_value->size()); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 15d22ab57428d7..3cf204716249b0 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -80,26 +80,7 @@ __global__ void PullCopy(float** dest, const float* src, float* feature_value_ptr = (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); int mf_dim = gpu_dim[x] - 3; - if (*(keys[x] + y) == 0) { - *(dest[x] + y * (mf_dim + 3)) = 0; - *(dest[x] + y * (mf_dim + 3) + 1) = 0; - *(dest[x] + y * (mf_dim + 3) + 2) = 0; - } else { - *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr[feature_value_accessor.common_feature_value.ShowIndex()]; - *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr[feature_value_accessor.common_feature_value.ClickIndex()]; - *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr[feature_value_accessor.common_feature_value.EmbedWIndex()]; - } - - if 
(feature_value_ptr[feature_value_accessor.common_feature_value.MfSizeIndex()] == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; - } - } else { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = - feature_value_ptr[feature_value_accessor.common_feature_value.EmbedxWOffsetIndex(feature_value_ptr) + j]; - } - } + feature_value_accessor.Select(dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); } } From c925e3bfa949907ffb41859ab9f3cf47374c5e38 Mon Sep 17 00:00:00 2001 From: danleifeng Date: Fri, 1 Jul 2022 16:44:02 +0800 Subject: [PATCH 02/12] template;test=develop --- .../framework/fleet/heter_ps/hashtable_kernel.cu | 4 ++-- .../fleet/heter_ps/heter_comm_kernel.cu | 16 ++++++++-------- paddle/fluid/framework/fleet/ps_gpu_wrapper.cu | 6 ++++-- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index a1cdfa660eccff..ade2d69650d5d9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -80,12 +80,12 @@ __global__ void search_kernel(Table* table, } } -template +template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, char* vals, size_t len, size_t pull_feature_value_size, - CommonFeatureValueAccessor feature_value_accessor) { + FVAceessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; // return; if (i < len) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 84946860300abd..d7c6d65d4c4ef7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -117,11 +117,11 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } -template +template __global__ void dy_mf_fill_shard_grads_kernel( KeyType* d_shard_keys, KeyType* d_keys, float* d_shard_grads, float* d_grads, T* idx, size_t len, size_t grad_value_size, - CommonFeatureValueAccessor feature_value_accessor) { + FVAceessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_shard_keys[i] = d_keys[idx[i]]; @@ -132,7 +132,7 @@ __global__ void dy_mf_fill_shard_grads_kernel( } } -template +template __global__ void merge_gradients_basic_kernel(const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, @@ -140,7 +140,7 @@ __global__ void merge_gradients_basic_kernel(const KeyType* d_keys, char* output, int n, size_t grad_value_size, DynamicGradMerger& merger, - CommonFeatureValueAccessor& feature_value_accessor) { + FVAceessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -162,7 +162,7 @@ __global__ void merge_gradients_basic_kernel(const KeyType* d_keys, } } -template +template __global__ void merge_gradients_embedx_kernel(const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, @@ -171,7 +171,7 @@ __global__ void merge_gradients_embedx_kernel(const KeyType* d_keys, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, - CommonFeatureValueAccessor& feature_value_accessor) { + FVAceessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -194,10 +194,10 @@ __global__ void 
merge_gradients_embedx_kernel(const KeyType* d_keys, } } -template +template __global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals, float* d_vals, T* idx, size_t len, size_t val_size, - CommonFeatureValueAccessor feature_value_accessor) { + FVAceessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { uint64_t new_offset = uint64_t(idx[i]) * val_size; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 3cf204716249b0..2f9d5147fb0e4e 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -61,10 +61,11 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, } } +template __global__ void PullCopy(float** dest, const float* src, const int64_t* len, int slot_num, int total_len, uint64_t** keys, uint64_t max_val_size, int* gpu_dim, - CommonFeatureValueAccessor feature_value_accessor) { + FVAceessor feature_value_accessor) { CUDA_KERNEL_LOOP(i, total_len) { int low = 0; int high = slot_num - 1; @@ -128,11 +129,12 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, } } +template __global__ void PushCopyWithPool(float* dest, float** src, int64_t* len, int slot_num, uint64_t total_len, int bs, int* slot_vector, int* mf_dim_vector, size_t grad_value_size, - CommonFeatureValueAccessor feature_value_accessor) { + FVAceessor feature_value_accessor) { CUDA_KERNEL_LOOP(i, total_len) { int low = 0; int high = slot_num - 1; From b73bb24e2880530cba0f6a2aa8bd5eae13536455 Mon Sep 17 00:00:00 2001 From: danleifeng Date: Wed, 6 Jul 2022 12:21:13 +0000 Subject: [PATCH 03/12] fix adam accessor:template;test=develop --- .../framework/fleet/heter_ps/CMakeLists.txt | 14 +- .../framework/fleet/heter_ps/feature_value.h | 639 ++++++++++++------ .../fleet/heter_ps/graph_gpu_ps_table.h | 26 +- .../framework/fleet/heter_ps/hashtable.h | 17 +- .../fleet/heter_ps/hashtable_kernel.cu | 72 +- .../framework/fleet/heter_ps/heter_comm.h | 14 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 390 ++++++----- .../fleet/heter_ps/heter_comm_kernel.cu | 140 ++-- .../fleet/heter_ps/heter_comm_kernel.h | 69 +- .../framework/fleet/heter_ps/heter_ps.cc | 32 +- .../framework/fleet/heter_ps/heter_ps.cu | 124 ++-- .../fluid/framework/fleet/heter_ps/heter_ps.h | 16 +- .../framework/fleet/heter_ps/heter_ps_base.h | 14 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 401 ++++++----- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 371 +++++----- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 142 ++-- .../fluid/framework/fleet/ps_gpu_wrapper.kps | 129 ++-- 17 files changed, 1462 insertions(+), 1148 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index d62fc1c084962d..fbbb77a205b9a9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -7,10 +7,16 @@ IF(WITH_GPU) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) endif() - nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS}) - nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS}) - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) - nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) + 
nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS}) + # nv_library(feature_value SRCS feature_value.h DEPS ${HETERPS_DEPS}) + # nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu DEPS ${HETERPS_DEPS} feature_value) + nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS}) + # nv_library(hashtable_kernel SRCS hashtable_kernel.cu DEPS ${HETERPS_DEPS} feature_value) + + nv_library(heter_comm SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) + # nv_library(heter_comm SRCS heter_comm.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) + # nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) + nv_test(test_heter_comm SRCS DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) nv_library(graph_gpu_ps SRCS graph_gpu_ps_table_inl.cu DEPS heter_comm table hashtable_kernel) diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 2486f34cc0e256..7569c26c132576 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -19,26 +19,35 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { #define MF_DIM 8 typedef uint64_t FeatureKey; +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) class FeatureValueAccessor { public: - __host__ __device__ FeatureValueAccessor() {} + __host__ __device__ FeatureValueAccessor() {} __host__ __device__ ~FeatureValueAccessor() {} - __host__ __device__ virtual int Configure(std::unordered_map config) { + __host__ __device__ virtual int Configure( + std::unordered_map config) { _config = config; Initialize(); return 0; } - __host__ __device__ virtual int Initialize() = 0; + __host__ __device__ virtual int Initialize() = 0; protected: std::unordered_map _config; @@ -64,47 +73,58 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { std::vector embedx_w; */ - __host__ __device__ int Dim() { return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } // has cpu_ptr(2) - __host__ __device__ int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } - __host__ __device__ int Size() { return Dim() * sizeof(float); } // cpu_ptr:uint64=2float - __host__ __device__ int EmbedDim() { return embed_sgd_dim;} - __host__ __device__ int EmbedXDim() { return embedx_sgd_dim;} - __host__ __device__ int EmbedWDim() { return embedx_dim;} - __host__ __device__ int CpuPtrIndex() {return 0; } // cpuprt uint64 - __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; } + __host__ __device__ int Dim() { + return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; + } // has cpu_ptr(2) + __host__ __device__ int DimSize(size_t dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ size_t Size() { + return TYPEALIGN(8, Dim() * 
sizeof(float)); + } // cpu_ptr:uint64=2float + __host__ __device__ int EmbedDim() { return embed_sgd_dim; } + __host__ __device__ int EmbedXDim() { return embedx_sgd_dim; } + __host__ __device__ int EmbedWDim() { return embedx_dim; } + __host__ __device__ int CpuPtrIndex() { return 0; } // cpuprt uint64 + __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; } __host__ __device__ int ShowIndex() { return DeltaScoreIndex() + 1; } __host__ __device__ int ClickIndex() { return ShowIndex() + 1; } __host__ __device__ int EmbedWIndex() { return ClickIndex() + 1; } __host__ __device__ int EmbedG2SumIndex() { return EmbedWIndex() + 1; } - __host__ __device__ int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; } + __host__ __device__ int SlotIndex() { + return EmbedG2SumIndex() + embed_sgd_dim; + } __host__ __device__ int MfDimIndex() { return SlotIndex() + 1; } - __host__ __device__ int MfSizeIndex() { return MfDimIndex() + 1; } // actual mf size (ex. 0) + __host__ __device__ int MfSizeIndex() { + return MfDimIndex() + 1; + } // actual mf size (ex. 0) __host__ __device__ int EmbedxG2SumIndex() { return MfSizeIndex() + 1; } - __host__ __device__ int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; } - + __host__ __device__ int EmbedxWIndex() { + return EmbedxG2SumIndex() + embedx_sgd_dim; + } // 根据mf_dim计算的总长度 __host__ __device__ int Dim(int& mf_dim) { int tmp_embedx_sgd_dim = 1; - if (optimizer_type_ == 3) {//adam + if (optimizer_type_ == 3) { // adam tmp_embedx_sgd_dim = mf_dim * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam tmp_embedx_sgd_dim = 4; } return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim; } // 根据mf_dim 计算的总byte数 - __host__ __device__ int Size(int& mf_dim) { - return Dim(mf_dim) * sizeof(float); // cpu_ptr:2float + __host__ __device__ size_t Size(int& mf_dim) { + return TYPEALIGN(8, Dim(mf_dim) * sizeof(float)); // cpu_ptr:2float } // 根据mf_dim 计算的 mf_size byte数 - __host__ __device__ int MFSize(int& mf_dim) { + __host__ __device__ size_t MFSize(int& mf_dim) { int tmp_embedx_sgd_dim = 1; - if (optimizer_type_ == 3) { //adam + if (optimizer_type_ == 3) { // adam tmp_embedx_sgd_dim = mf_dim * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam tmp_embedx_sgd_dim = 4; } return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float); @@ -112,33 +132,42 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { __host__ __device__ int EmbedxG2SumOffsetIndex() { return 0; } __host__ __device__ int EmbedxWOffsetIndex(float* val) { - // has mf + // has mf int tmp_embedx_sgd_dim = 1; if (int(MfSize(val)) > 0) { - if (optimizer_type_ == 3) {//adam + if (optimizer_type_ == 3) { // adam tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam tmp_embedx_sgd_dim = 4; } - return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; + return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; } else { // no mf return 0; } } - - __host__ __device__ uint64_t CpuPtr(float* val) {return *(reinterpret_cast(val)); } - __host__ __device__ float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } + __host__ __device__ uint64_t CpuPtr(float* val) { + return *(reinterpret_cast(val)); + } + __host__ __device__ float& DeltaScore(float* val) { + return val[DeltaScoreIndex()]; + } __host__ __device__ float& Show(float* val) { return val[ShowIndex()]; } __host__ __device__ float& 
Click(float* val) { return val[ClickIndex()]; } __host__ __device__ float& Slot(float* val) { return val[SlotIndex()]; } __host__ __device__ float& MfDim(float* val) { return val[MfDimIndex()]; } __host__ __device__ float& MfSize(float* val) { return val[MfSizeIndex()]; } __host__ __device__ float& EmbedW(float* val) { return val[EmbedWIndex()]; } - __host__ __device__ float& EmbedG2Sum(float* val) { return val[EmbedG2SumIndex()]; } - __host__ __device__ float& EmbedxG2Sum(float* val) { return val[EmbedxG2SumIndex()]; } - __host__ __device__ float& EmbedxW(float* val) { return val[EmbedxWIndex()]; } + __host__ __device__ float& EmbedG2Sum(float* val) { + return val[EmbedG2SumIndex()]; + } + __host__ __device__ float& EmbedxG2Sum(float* val) { + return val[EmbedxG2SumIndex()]; + } + __host__ __device__ float& EmbedxW(float* val) { + return val[EmbedxWIndex()]; + } int embed_sgd_dim; int embedx_dim; @@ -158,14 +187,28 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { __host__ __device__ int Dim(int embedx_dim) { return 5 + embedx_dim; } - __host__ __device__ int DimSize(int dim, int embedx_dim) { return sizeof(float); } - __host__ __device__ int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + __host__ __device__ int DimSize(int dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } __host__ __device__ int SlotIndex() { return 0; } - __host__ __device__ int ShowIndex() { return CommonPushValue::SlotIndex() + 1; } - __host__ __device__ int ClickIndex() { return CommonPushValue::ShowIndex() + 1; } - __host__ __device__ int MfDimIndex() { return CommonPushValue::ClickIndex() + 1; } - __host__ __device__ int EmbedGIndex() { return CommonPushValue::MfDimIndex() + 1; } - __host__ __device__ int EmbedxGIndex() { return CommonPushValue::EmbedGIndex() + 1; } + __host__ __device__ int ShowIndex() { + return CommonPushValue::SlotIndex() + 1; + } + __host__ __device__ int ClickIndex() { + return CommonPushValue::ShowIndex() + 1; + } + __host__ __device__ int MfDimIndex() { + return CommonPushValue::ClickIndex() + 1; + } + __host__ __device__ int EmbedGIndex() { + return CommonPushValue::MfDimIndex() + 1; + } + __host__ __device__ int EmbedxGIndex() { + return CommonPushValue::EmbedGIndex() + 1; + } __host__ __device__ float& Slot(float* val) { return val[CommonPushValue::SlotIndex()]; } @@ -194,9 +237,13 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { std::vector embedx_w; */ - __host__ __device__ static int Dim(int embedx_dim) { return 3 + embedx_dim; } + __host__ __device__ static int Dim(int embedx_dim) { + return 3 + embedx_dim; + } __host__ __device__ int DimSize(size_t dim) { return sizeof(float); } - __host__ __device__ int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } __host__ __device__ int ShowIndex() { return 0; } __host__ __device__ int ClickIndex() { return 1; } __host__ __device__ int EmbedWIndex() { return 2; } @@ -215,21 +262,20 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { } }; - __host__ __device__ CommonFeatureValueAccessor() {} __host__ __device__ ~CommonFeatureValueAccessor() {} __host__ __device__ virtual int Initialize() { int optimizer_type = (_config.find("optimizer_type") == _config.end()) - ? 1 - : int(_config["optimizer_type"]); + ? 
1 + : int(_config["optimizer_type"]); int sparse_embedx_dim = (_config.find("embedx_dim") == _config.end()) ? 8 : int(_config["embedx_dim"]); - if (optimizer_type == 3) { //adam + if (optimizer_type == 3) { // adam common_feature_value.embed_sgd_dim = 4; common_feature_value.embedx_sgd_dim = sparse_embedx_dim * 2 + 2; - } else if (optimizer_type == 4) { //shared_adam + } else if (optimizer_type == 4) { // shared_adam common_feature_value.embed_sgd_dim = 4; common_feature_value.embedx_sgd_dim = 4; } else { @@ -242,168 +288,209 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { return 0; } - -// build阶段从cpu_val赋值给gpu_val -__host__ __device__ void BuildFill(float* gpu_val, - float* cpu_val, - paddle::distributed::CtrDymfAccessor* cpu_table_accessor, - int mf_dim, - size_t cpu_fv_dim) { - - gpu_val[common_feature_value.DeltaScoreIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.DeltaScoreIndex()]; - gpu_val[common_feature_value.ShowIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.ShowIndex()]; - gpu_val[common_feature_value.ClickIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.ClickIndex()]; - gpu_val[common_feature_value.SlotIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.SlotIndex()]; - gpu_val[common_feature_value.EmbedWIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.EmbedWIndex()]; - for (int i = 0; i < common_feature_value.EmbedDim(); i++) { - gpu_val[common_feature_value.EmbedG2SumIndex() + i] = - cpu_val[cpu_table_accessor->common_feature_value.EmbedG2SumIndex() + i]; + // // build阶段从cpu_val赋值给gpu_val + __host__ void BuildFill( + float* gpu_val, void* cpu, + paddle::distributed::CtrDymfAccessor* cpu_table_accessor, int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::FixedFeatureValue* cpu_ptr = + (paddle::distributed::FixedFeatureValue*)(cpu); + float* cpu_val = cpu_ptr->data(); + size_t cpu_dim = cpu_ptr->size(); + + gpu_val[common_feature_value.DeltaScoreIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.DeltaScoreIndex()]; + gpu_val[common_feature_value.ShowIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.ShowIndex()]; + gpu_val[common_feature_value.ClickIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.ClickIndex()]; + gpu_val[common_feature_value.SlotIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.SlotIndex()]; + gpu_val[common_feature_value.EmbedWIndex()] = + cpu_val[cpu_table_accessor->common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + gpu_val[common_feature_value.EmbedG2SumIndex() + i] = + cpu_val[cpu_table_accessor->common_feature_value.EmbedG2SumIndex() + + i]; + } + *(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex())) = (uint64_t)(cpu); + cpu_val[cpu_table_accessor->common_feature_value.MfDimIndex()] = + float(mf_dim); + gpu_val[common_feature_value.MfDimIndex()] = mf_dim; + if (cpu_dim > + cpu_table_accessor->GetAccessorInfo().dim - + cpu_table_accessor->GetAccessorInfo().mf_size / sizeof(float)) { + gpu_val[common_feature_value.MfSizeIndex()] = + common_feature_value.MFSize(mf_dim) / sizeof(float); + + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { + gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = cpu_val + [cpu_table_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + } + } else { + gpu_val[common_feature_value.MfSizeIndex()] = 0; + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < 
int(common_feature_value.Size(mf_dim) / sizeof(float)); x++) { + gpu_val[x] = 0; + } + } +#endif } - cpu_val[cpu_table_accessor->common_feature_value.MfDimIndex()] = float(mf_dim); - gpu_val[common_feature_value.MfDimIndex()] = mf_dim; - if (cpu_fv_dim > cpu_table_accessor->GetAccessorInfo().dim - - cpu_table_accessor->GetAccessorInfo().mf_size / sizeof(float)) { - gpu_val[common_feature_value.MfSizeIndex()] = - common_feature_value.MFSize(mf_dim) / sizeof(float); - - for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); - x++) { - gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = - cpu_val[cpu_table_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + // dump_to_cpu阶段从gpu_val赋值给cpu_val + __host__ __device__ void DumpFill( + float* gpu_val, paddle::distributed::CtrDymfAccessor* cpu_table_accessor, + int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex()))); + size_t downpour_value_size = downpour_value->size(); + if (gpu_val[common_feature_value.MfSizeIndex()] > 0 && + downpour_value_size == + (cpu_table_accessor->GetAccessorInfo().dim - + int(cpu_table_accessor->GetAccessorInfo().mf_size / + sizeof(float)))) { // cpu_accessor + downpour_value->resize( + cpu_table_accessor->common_feature_value.Dim(mf_dim)); } - } else { - gpu_val[common_feature_value.MfSizeIndex()] = 0; - for (int x = common_feature_value.EmbedxG2SumIndex(); - x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++){ - gpu_val[x] = 0; + float* cpu_val = downpour_value->data(); + cpu_val[cpu_table_accessor->common_feature_value.DeltaScoreIndex()] = + gpu_val[common_feature_value.DeltaScoreIndex()]; + cpu_val[cpu_table_accessor->common_feature_value.ShowIndex()] = + gpu_val[common_feature_value.ShowIndex()]; + cpu_val[cpu_table_accessor->common_feature_value.ClickIndex()] = + gpu_val[common_feature_value.ClickIndex()]; + cpu_val[cpu_table_accessor->common_feature_value.EmbedWIndex()] = + gpu_val[common_feature_value.EmbedWIndex()]; + cpu_val[cpu_table_accessor->common_feature_value.SlotIndex()] = + gpu_val[common_feature_value.SlotIndex()]; + + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + cpu_val[cpu_table_accessor->common_feature_value.EmbedG2SumIndex() + i] = + gpu_val[common_feature_value.EmbedG2SumIndex() + i]; } - } -} - - -// dump_to_cpu阶段从gpu_val赋值给cpu_val -__host__ __device__ void DumpFill(float* cpu_val, - float* gpu_val, - paddle::distributed::CtrDymfAccessor* cpu_table_accessor, - int mf_dim) { - - cpu_val[cpu_table_accessor->common_feature_value.DeltaScoreIndex()] = - gpu_val[common_feature_value.DeltaScoreIndex()]; - cpu_val[cpu_table_accessor->common_feature_value.ShowIndex()] = - gpu_val[common_feature_value.ShowIndex()]; - cpu_val[cpu_table_accessor->common_feature_value.ClickIndex()] = - gpu_val[common_feature_value.ClickIndex()]; - cpu_val[cpu_table_accessor->common_feature_value.EmbedWIndex()] = - gpu_val[common_feature_value.EmbedWIndex()]; - cpu_val[cpu_table_accessor->common_feature_value.SlotIndex()] = - gpu_val[common_feature_value.SlotIndex()]; - - for (int i = 0; i < common_feature_value.EmbedDim(); i++) { - cpu_val[cpu_table_accessor->common_feature_value.EmbedG2SumIndex() + i] = - gpu_val[common_feature_value.EmbedG2SumIndex() + i]; - } - if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { - - for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); - x++) { - 
cpu_val[cpu_table_accessor->common_feature_value.EmbedxG2SumIndex() + x] = - gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; + if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { + cpu_val[cpu_table_accessor->common_feature_value.EmbedxG2SumIndex() + + x] = gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; + } } +#endif } -} - - -// dy_mf_fill_dvals_kernel, dy_mf_search_kernel 阶段 gpukernel 中从src_val赋值给dest_val -__host__ __device__ void FeatureValueFill(float* dest_val, - float* src_val, - int mf_dim) { - *(reinterpret_cast(dest_val + common_feature_value.CpuPtrIndex())) = - *(reinterpret_cast(src_val + common_feature_value.CpuPtrIndex())); - dest_val[common_feature_value.DeltaScoreIndex()] = src_val[common_feature_value.DeltaScoreIndex()]; - dest_val[common_feature_value.ShowIndex()] = src_val[common_feature_value.ShowIndex()]; - dest_val[common_feature_value.ClickIndex()] = src_val[common_feature_value.ClickIndex()]; - dest_val[common_feature_value.EmbedWIndex()] = src_val[common_feature_value.EmbedWIndex()]; - for (int i = 0; i < common_feature_value.EmbedDim(); i++) { - dest_val[common_feature_value.EmbedG2SumIndex() + i] = - src_val[common_feature_value.EmbedG2SumIndex() + i]; - } - dest_val[common_feature_value.SlotIndex()] = src_val[common_feature_value.SlotIndex()]; - dest_val[common_feature_value.MfDimIndex()] = mf_dim; - dest_val[common_feature_value.MfSizeIndex()] = src_val[common_feature_value.MfSizeIndex()]; - for (int x = common_feature_value.EmbedxG2SumIndex(); - x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++){ - dest_val[x] = src_val[x]; + // dy_mf_fill_dvals_kernel, dy_mf_search_kernel 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void FeatureValueFill(float* dest_val, float* src_val, + int mf_dim) { + *(reinterpret_cast(dest_val + + common_feature_value.CpuPtrIndex())) = + *(reinterpret_cast(src_val + + common_feature_value.CpuPtrIndex())); + dest_val[common_feature_value.DeltaScoreIndex()] = + src_val[common_feature_value.DeltaScoreIndex()]; + dest_val[common_feature_value.ShowIndex()] = + src_val[common_feature_value.ShowIndex()]; + dest_val[common_feature_value.ClickIndex()] = + src_val[common_feature_value.ClickIndex()]; + dest_val[common_feature_value.EmbedWIndex()] = + src_val[common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + dest_val[common_feature_value.EmbedG2SumIndex() + i] = + src_val[common_feature_value.EmbedG2SumIndex() + i]; + } + dest_val[common_feature_value.SlotIndex()] = + src_val[common_feature_value.SlotIndex()]; + dest_val[common_feature_value.MfDimIndex()] = mf_dim; + dest_val[common_feature_value.MfSizeIndex()] = + src_val[common_feature_value.MfSizeIndex()]; + + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++) { + dest_val[x] = src_val[x]; + } } -} + // dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void PushValueFill(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + 
dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + + for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) { + dest_val[common_push_value.EmbedxGIndex() + x] = + src_val[common_push_value.EmbedxGIndex() + x]; + } + } -// dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel 中从src_val赋值给dest_val -__host__ __device__ void PushValueFill(float* dest_val, - const float* src_val) { - dest_val[common_push_value.SlotIndex()] = src_val[common_push_value.SlotIndex()]; - dest_val[common_push_value.ShowIndex()] = src_val[common_push_value.ShowIndex()]; - dest_val[common_push_value.ClickIndex()] = src_val[common_push_value.ClickIndex()]; - dest_val[common_push_value.MfDimIndex()] = src_val[common_push_value.MfDimIndex()]; - dest_val[common_push_value.EmbedGIndex()] = src_val[common_push_value.EmbedGIndex()]; + // update_basic 阶段 gpukernel 中从src_val赋值给dest_val + __host__ __device__ void PushValueFillBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + } - for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) { - dest_val[common_push_value.EmbedxGIndex() + x] = src_val[common_push_value.EmbedxGIndex() + x]; + // merge_one 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValue(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) { + dest_val[common_push_value.EmbedxGIndex() + j] += + src_val[common_push_value.EmbedxGIndex() + j]; + } } -} - -// update_basic 阶段 gpukernel 中从src_val赋值给dest_val -__host__ __device__ void PushValueFillBasic(float* dest_val, - const float* src_val) { - dest_val[common_push_value.SlotIndex()] = src_val[common_push_value.SlotIndex()]; - dest_val[common_push_value.ShowIndex()] = src_val[common_push_value.ShowIndex()]; - dest_val[common_push_value.ClickIndex()] = src_val[common_push_value.ClickIndex()]; - dest_val[common_push_value.MfDimIndex()] = src_val[common_push_value.MfDimIndex()]; - dest_val[common_push_value.EmbedGIndex()] = src_val[common_push_value.EmbedGIndex()]; - -} - - -// merge_one 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val -__host__ __device__ void MergePushValue(float* dest_val, - const float* src_val) { - dest_val[common_push_value.ShowIndex()] += src_val[common_push_value.ShowIndex()]; - dest_val[common_push_value.ClickIndex()] += src_val[common_push_value.ClickIndex()]; - dest_val[common_push_value.EmbedGIndex()] += src_val[common_push_value.EmbedGIndex()]; - for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) { - dest_val[common_push_value.EmbedxGIndex() + j] += src_val[common_push_value.EmbedxGIndex() + j]; + + // merge_basic 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValueBasic(float* dest_val, + const float* src_val) { + 
dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; } -} - - -// merge_basic 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val -__host__ __device__ void MergePushValueBasic(float* dest_val, - const float* src_val) { - dest_val[common_push_value.ShowIndex()] += src_val[common_push_value.ShowIndex()]; - dest_val[common_push_value.ClickIndex()] += src_val[common_push_value.ClickIndex()]; - dest_val[common_push_value.EmbedGIndex()] += src_val[common_push_value.EmbedGIndex()]; -} - -// PullCopy 阶段 gpukernel 中 FeatureValue回填到PullValue -__host__ __device__ void Select(float* dest_val, - float* src_val, - uint64_t* key, - int mf_dim) { - if (*key == 0) { + + // PullCopy 阶段 gpukernel 中 FeatureValue回填到PullValue + __host__ __device__ void Select(float* dest_val, float* src_val, + uint64_t* key, int mf_dim) { + if (*key == 0) { *(dest_val + common_pull_value.ShowIndex()) = 0; *(dest_val + common_pull_value.ClickIndex()) = 0; *(dest_val + common_pull_value.EmbedWIndex()) = 0; } else { - *(dest_val + common_pull_value.ShowIndex()) = src_val[common_feature_value.ShowIndex()]; - *(dest_val + common_pull_value.ClickIndex()) = src_val[common_feature_value.ClickIndex()]; - *(dest_val + common_pull_value.EmbedWIndex()) = src_val[common_feature_value.EmbedWIndex()]; + *(dest_val + common_pull_value.ShowIndex()) = + src_val[common_feature_value.ShowIndex()]; + *(dest_val + common_pull_value.ClickIndex()) = + src_val[common_feature_value.ClickIndex()]; + *(dest_val + common_pull_value.EmbedWIndex()) = + src_val[common_feature_value.EmbedWIndex()]; } if (src_val[common_feature_value.MfSizeIndex()] == 0 || *key == 0) { @@ -412,14 +499,14 @@ __host__ __device__ void Select(float* dest_val, } } else { for (int j = 0; j < mf_dim; j++) { - *(dest_val + common_pull_value.EmbedxWIndex() + j) = + *(dest_val + common_pull_value.EmbedxWIndex() + j) = src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j]; } } -} - + } - __host__ __device__ std::string ParseToString(const float* v, int param_size) { + __host__ __device__ std::string ParseToString(const float* v, + int param_size) { /* uint64_t cpu_ptr; // 2float float delta_score; @@ -434,21 +521,21 @@ __host__ __device__ void Select(float* dest_val, std::vector embedx_w; */ std::stringstream os; - os << "cpuptr: " << common_feature_value.CpuPtr(const_cast(v)) << " delta_score: " << v[2] - << " show: " << v[3] << " click: " << v[4] - << " embed_w:" << v[5] << " embed_g2sum:"; + os << "cpuptr: " << common_feature_value.CpuPtr(const_cast(v)) + << " delta_score: " << v[2] << " show: " << v[3] << " click: " << v[4] + << " embed_w:" << v[5] << " embed_g2sum:"; for (int i = common_feature_value.EmbedG2SumIndex(); - i < common_feature_value.SlotIndex(); i++) { + i < common_feature_value.SlotIndex(); i++) { os << " " << v[i]; } int mf_dim = int(common_feature_value.MfDim(const_cast(v))); - os << " slot: " << common_feature_value.Slot(const_cast(v)) - << " mf_dim: " << mf_dim - << " mf_size: " << common_feature_value.MfSize(const_cast(v)) - << " mf: "; + os << " slot: " << common_feature_value.Slot(const_cast(v)) + << " mf_dim: " << mf_dim + << " mf_size: " << common_feature_value.MfSize(const_cast(v)) + << " mf: "; if (param_size > common_feature_value.EmbedxG2SumIndex()) { for (auto i = common_feature_value.EmbedxG2SumIndex(); - i < common_feature_value.Dim(mf_dim); 
++i) { + i < common_feature_value.Dim(mf_dim); ++i) { os << " " << v[i]; } } @@ -461,7 +548,6 @@ __host__ __device__ void Select(float* dest_val, CommonPullValue common_pull_value; }; - struct FeatureValue { float delta_score; float show; @@ -533,6 +619,149 @@ struct FeaturePushValue { } }; +class VirtualAccessor { + public: + virtual int Configure(std::unordered_map config) = 0; + + virtual size_t GetFeatureValueSize(int& mf_dim) = 0; + + virtual size_t GetPushValueSize(int& mf_dim) = 0; + + // TODO: 在基类里调用cpu_table_accessor类型 + virtual void BuildFill( + void* gpu_val, void* cpu_val, + paddle::distributed::CtrDymfAccessor* cpu_table_accessor, int mf_dim) = 0; + + // TODO: 在基类里调用cpu_table_accessor类型 + virtual void DumpFill( + float* gpu_val, paddle::distributed::CtrDymfAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, const int64_t total_length, + int* gpu_dim, int feature_value_size) = 0; + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) = 0; + + virtual std::string ParseToString(const float* v, int param_size) = 0; +}; + +template +class AccessorWrapper : public VirtualAccessor { + public: + explicit AccessorWrapper() {} + virtual ~AccessorWrapper() {} + AccessorWrapper(const AccessorWrapper&) = delete; + AccessorWrapper& operator=(const AccessorWrapper&) = delete; + + virtual int Configure(std::unordered_map config) { + return gpu_accessor_.Configure(config); + } + + virtual size_t GetFeatureValueSize(int& mf_dim) { + return gpu_accessor_.common_feature_value.Size(mf_dim); + } + + virtual size_t GetPushValueSize(int& mf_dim) { + return gpu_accessor_.common_push_value.Size(mf_dim); + } + + virtual void BuildFill( + void* gpu_val, void* cpu_val, + paddle::distributed::CtrDymfAccessor* cpu_table_accessor, int mf_dim) { + gpu_accessor_.BuildFill((float*)(gpu_val), cpu_val, cpu_table_accessor, + mf_dim); + } + + virtual void DumpFill( + float* gpu_val, paddle::distributed::CtrDymfAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim); + } + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, const int64_t total_length, + int* gpu_dim, int feature_value_size) { + CopyForPullImpl(place, gpu_keys, values, total_values_gpu, gpu_len, + slot_num, hidden_size, total_length, gpu_dim, + feature_value_size); + } + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + CopyForPushImpl(place, grad_values, total_grad_values_gpu, slot_lengths, + total_length, batch_size, grad_value_size, slot_vector, + slot_mf_dim_vector); + } + + void CopyForPullImpl(const paddle::platform::Place& place, + uint64_t** gpu_keys, const std::vector& values, + const float* 
total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length, int* gpu_dim, + int feature_value_size); + + void CopyForPushImpl(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size, std::vector& slot_vector, + std::vector& slot_mf_dim_vector); + + virtual std::string ParseToString(const float* v, int param_size) { + return gpu_accessor_.ParseToString(v, param_size); + } + + GPUAccessor gpu_accessor_; +}; + +class GlobalAccessorTransfor { + public: + static GlobalAccessorTransfor& GetInstance() { + static GlobalAccessorTransfor ins; + return ins; + } + void Init(std::string accessor_type) { + if (accessor_wrapper_ptr_ != nullptr) { + return; + } + if (accessor_type == "CtrDymfAccessor") { + accessor_wrapper_ptr_ = new AccessorWrapper(); + } else { + VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:" + << accessor_type; + accessor_wrapper_ptr_ = new AccessorWrapper(); + } + } + VirtualAccessor* GetAccessorWrapper() { return accessor_wrapper_ptr_; } + + private: + VirtualAccessor* accessor_wrapper_ptr_ = nullptr; +}; + } // end namespace framework } // end namespace paddle + #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 11a52d631729cd..a0acad9563ef09 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -27,7 +27,8 @@ DECLARE_double(gpugraph_hbm_table_load_factor); namespace paddle { namespace framework { enum GraphTableType { EDGE_TABLE, FEATURE_TABLE }; -class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable + : public HeterComm { public: int get_table_offset(int gpu_id, GraphTableType type, int idx) const { int type_id = type; @@ -36,7 +37,8 @@ class GpuPsGraphTable : public HeterComm { } GpuPsGraphTable(std::shared_ptr resource, int topo_aware, int graph_table_num) - : HeterComm(1, resource) { + : HeterComm( + 1, resource) { load_factor_ = FLAGS_gpugraph_hbm_table_load_factor; VLOG(0) << "load_factor = " << load_factor_; @@ -108,8 +110,7 @@ class GpuPsGraphTable : public HeterComm { } } } - ~GpuPsGraphTable() { - } + ~GpuPsGraphTable() {} void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx); void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id); void clear_graph_info(int gpu_id, int index); @@ -118,8 +119,8 @@ class GpuPsGraphTable : public HeterComm { void clear_feature_info(int index); void build_graph_from_cpu(const std::vector &cpu_node_list, int idx); - void build_graph_fea_from_cpu(const std::vector &cpu_node_list, - int idx); + void build_graph_fea_from_cpu( + const std::vector &cpu_node_list, int idx); NodeQueryResult graph_node_sample(int gpu_id, int sample_size); NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, bool cpu_switch); @@ -129,17 +130,16 @@ class GpuPsGraphTable : public HeterComm { uint64_t *key, int sample_size, int len, bool cpu_query_switch); - int get_feature_of_nodes(int gpu_id, uint64_t* d_walk, - uint64_t* d_offset, int size, int slot_num); + int get_feature_of_nodes(int gpu_id, uint64_t *d_walk, uint64_t *d_offset, + int size, int slot_num); NodeQueryResult query_node_list(int gpu_id, int idx, int start, int query_size); void display_sample_res(void *key, void *val, int 
len, int sample_len); - void move_result_to_source_gpu(int gpu_id, int gpu_num, - int sample_size, int *h_left, - int *h_right, - uint64_t *src_sample_res, - int *actual_sample_size); + void move_result_to_source_gpu(int gpu_id, int gpu_num, int sample_size, + int *h_left, int *h_right, + uint64_t *src_sample_res, + int *actual_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); int gpu_num; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index d63060cc5e3910..0d54f87aef0b5a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -126,8 +126,9 @@ class HashTable { void get(const KeyType* d_keys, ValType* d_vals, size_t len, StreamType stream); - template - void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream); + template + void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream, + FVAccessor& fv_accessor); void show(); @@ -140,8 +141,8 @@ class HashTable { #if defined(PADDLE_WITH_CUDA) template - void update(const KeyType* d_keys, const float* d_grads, size_t len, - Sgd sgd, StreamType stream); + void update(const KeyType* d_keys, const float* d_grads, size_t len, Sgd sgd, + StreamType stream); template void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd, @@ -168,14 +169,14 @@ class HashTable { << " push value size: " << push_grad_value_size_; } - void set_accessor(CommonFeatureValueAccessor& accessor) { - feature_value_accessor_ = accessor; - } + // void set_accessor(FVAccessor& accessor) { + // feature_value_accessor_ = accessor; + // } void show_collision(int id) { return container_->print_collision(id); } std::unique_ptr rwlock_{nullptr}; - CommonFeatureValueAccessor feature_value_accessor_; + // FVAccessor feature_value_accessor_; private: #if defined(PADDLE_WITH_CUDA) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index ade2d69650d5d9..e66948b012aa0c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -80,12 +80,12 @@ __global__ void search_kernel(Table* table, } } -template +template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, char* vals, size_t len, size_t pull_feature_value_size, - FVAceessor feature_value_accessor) { + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; // return; if (i < len) { @@ -95,7 +95,8 @@ __global__ void dy_mf_search_kernel(Table* table, uint64_t offset = i * pull_feature_value_size; float* cur = (float*)(vals + offset); float* input = it->second; - int mf_dim = int(input[feature_value_accessor.common_feature_value.MfDimIndex()]); + int mf_dim = + int(input[feature_value_accessor.common_feature_value.MfDimIndex()]); feature_value_accessor.FeatureValueFill(cur, input, mf_dim); } @@ -183,15 +184,16 @@ void HashTable::get(const KeyType* d_keys, ValType* d_vals, } template -template +template void HashTable::get(const KeyType* d_keys, char* d_vals, - size_t len, StreamType stream) { + size_t len, StreamType stream, + FVAccessor& fv_accessor) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_search_kernel<<>>( - container_, d_keys, d_vals, len, pull_feature_value_size_, feature_value_accessor_); + container_, d_keys, d_vals, len, 
pull_feature_value_size_, fv_accessor); } template @@ -319,14 +321,14 @@ template class HashTable; template class HashTable; template class HashTable; -template void HashTable::get< - cudaStream_t>(const unsigned long* d_keys, - float* d_vals, size_t len, - cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, float* d_vals, size_t len, + cudaStream_t stream); template void -HashTable::get( - const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); +HashTable::get( + const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream, + CommonFeatureValueAccessor& fv_accessor); template void HashTable::get(const long* d_keys, int* d_vals, size_t len, @@ -335,7 +337,8 @@ template void HashTable::get(const long* d_keys, template void HashTable::get( const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( - const unsigned long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); + const unsigned long* d_keys, unsigned long* d_vals, size_t len, + cudaStream_t stream); template void HashTable::get( const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( @@ -350,15 +353,13 @@ template void HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // stream); -template void HashTable::insert< - cudaStream_t>(const unsigned long* d_keys, - const float* d_vals, size_t len, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, const float* d_vals, size_t len, + cudaStream_t stream); -template void HashTable:: - insert(const unsigned long* d_keys, size_t len, char* pool, - size_t feature_value_size, size_t start_index, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, size_t len, char* pool, + size_t feature_value_size, size_t start_index, cudaStream_t stream); template void HashTable::insert(const long* d_keys, const int* d_vals, @@ -384,26 +385,27 @@ template void HashTable::insert( template void HashTable::insert( const long* d_keys, const unsigned int* d_vals, size_t len, cudaStream_t stream); - + template void HashTable::insert( const unsigned long* d_keys, const unsigned long* d_vals, size_t len, - cudaStream_t stream); + cudaStream_t stream); -template void HashTable:: - dump_to_cpu(int devid, cudaStream_t stream); +template void HashTable::dump_to_cpu( + int devid, cudaStream_t stream); template void -HashTable::update(const unsigned long* d_keys, const char* d_grads, size_t len, - SparseAdagradOptimizer sgd, - cudaStream_t stream); -template void -HashTable::update(const unsigned long* d_keys, const char* d_grads, size_t len, - SparseAdamOptimizer sgd, - cudaStream_t stream); +HashTable::update( + const unsigned long* d_keys, const char* d_grads, size_t len, + SparseAdagradOptimizer sgd, cudaStream_t stream); template void -HashTable::update(const unsigned long* d_keys, const char* d_grads, size_t len, - SparseAdamSharedOptimizer sgd, - cudaStream_t stream); +HashTable::update( + const unsigned long* d_keys, const char* d_grads, size_t len, + SparseAdamOptimizer sgd, cudaStream_t stream); +template void HashTable::update< + SparseAdamSharedOptimizer, cudaStream_t>(const unsigned long* d_keys, + const char* d_grads, size_t len, + SparseAdamSharedOptimizer sgd, + cudaStream_t stream); // template void HashTable::update< diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h 
b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 45519d37165d22..956885334fa5f8 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -44,11 +44,12 @@ namespace framework { #define TYPEALIGN(ALIGNVAL, LEN) \ (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) -template +template class HeterComm { public: HeterComm(size_t capacity, std::shared_ptr resource); - HeterComm(size_t capacity, std::shared_ptr resource, + HeterComm(size_t capacity, std::shared_ptr resource, CommonFeatureValueAccessor& accessor); virtual ~HeterComm(); HeterComm(const HeterComm&) = delete; @@ -115,8 +116,11 @@ class HeterComm { max_mf_dim_ = max_mf_dim; } - void set_accessor(CommonFeatureValueAccessor& accessor) { - feature_value_accessor_ = accessor; + void set_accessor(FVAccessor& accessor) { + feature_value_accessor_ = accessor; + // for (auto& ptr_table: ptr_tables_) { + // ptr_table->set_accessor(feature_value_accessor_); + // } } #endif @@ -236,8 +240,8 @@ class HeterComm { void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, size_t val_size); + FVAccessor feature_value_accessor_; - CommonFeatureValueAccessor feature_value_accessor_; protected: using Table = HashTable; using PtrTable = HashTable; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 21b85acef9e14d..2108c0e23eae62 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -15,9 +15,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #ifdef PADDLE_WITH_XPU_KP #include "paddle/fluid/platform/device/xpu/xpu_info.h" #endif @@ -27,8 +27,40 @@ DECLARE_bool(gpugraph_enable_gpu_direct_access); namespace paddle { namespace framework { -template -HeterComm::HeterComm( +// template +// HeterComm::HeterComm( +// size_t capacity, std::shared_ptr resource) { +// VLOG(1) << "Construct new HeterComm"; +// resource_ = resource; +// storage_.resize(resource_->total_device()); +// multi_mf_dim_ = resource->multi_mf(); +// load_factor_ = FLAGS_gpugraph_hbm_table_load_factor; +// VLOG(0) << "load_factor = " << load_factor_; +// for (int i = 0; i < resource_->total_device(); ++i) { +// #if defined(PADDLE_WITH_CUDA) +// platform::CUDADeviceGuard guard(resource_->dev_id(i)); +// allocators_.push_back(std::make_shared( +// 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT +// #endif +// if (!multi_mf_dim_) { +// auto table = new Table(capacity / load_factor_); +// tables_.push_back(table); +// } else { +// VLOG(0) << "Error:use HeterComm Construct with accessor"; +// return; +// } +// if (multi_node_) { +// storage_[i].init(feanum_, resource_->dev_id(i)); +// } +// } +// heter_comm_kernel_ = std::make_unique(block_size_); +// init_path(); +// } + +template +HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { VLOG(1) << "Construct new HeterComm"; resource_ = resource; @@ -41,48 +73,22 @@ HeterComm::HeterComm( platform::CUDADeviceGuard guard(resource_->dev_id(i)); allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, 
(size_t)-1, false, false)); // NOLINT -#endif - if (!multi_mf_dim_) { - auto table = new Table(capacity / load_factor_); - tables_.push_back(table); - } else { - VLOG(0) << "Error:use HeterComm Construct with accessor"; - return; - } - if (multi_node_) { - storage_[i].init(feanum_, resource_->dev_id(i)); - } - } - heter_comm_kernel_ = std::make_unique(block_size_); - init_path(); -} - -template -HeterComm::HeterComm( - size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor& feature_value_accessor) { - VLOG(1) << "Construct new HeterComm"; - resource_ = resource; - storage_.resize(resource_->total_device()); - multi_mf_dim_ = resource->multi_mf(); - for (int i = 0; i < resource_->total_device(); ++i) { -#if defined(PADDLE_WITH_CUDA) - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - allocators_.push_back(std::make_shared( - 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT #endif if (!multi_mf_dim_) { auto table = new Table(capacity / load_factor_); tables_.push_back(table); } else { max_mf_dim_ = resource_->max_mf_dim(); - feature_value_accessor_ = feature_value_accessor; - size_t val_type_size = TYPEALIGN(8, feature_value_accessor_.common_feature_value.Size(max_mf_dim_)); - size_t grad_type_size = TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); - VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = + accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + size_t grad_type_size = + accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size << ", feature_value_push_size:" << grad_type_size; auto ptr_table = new PtrTable(capacity / load_factor_); - ptr_table->set_accessor(feature_value_accessor_); + // ptr_table->set_accessor(feature_value_accessor_); ptr_table->set_feature_value_size(val_type_size, grad_type_size); ptr_tables_.push_back(ptr_table); } @@ -90,13 +96,15 @@ HeterComm::HeterComm( storage_[i].init(feanum_, resource_->dev_id(i)); } } - heter_comm_kernel_ = std::make_unique(block_size_, feature_value_accessor_); + // heter_comm_kernel_ = std::make_unique(block_size_, + // feature_value_accessor_); + heter_comm_kernel_ = std::make_unique(block_size_); init_path(); } - -template -void HeterComm::init_path() { +template +void HeterComm::init_path() { int total_device = resource_->total_device(); path_.resize(total_device); if (!topo_aware_) { @@ -148,9 +156,10 @@ void HeterComm::init_path() { } } -template +template template -void HeterComm::memory_copy( +void HeterComm::memory_copy( DstPlace dst_place, void* dst, SrcPlace src_place, const void* src, size_t count, StreamType stream) { #if defined(PADDLE_WITH_CUDA) @@ -163,11 +172,10 @@ void HeterComm::memory_copy( #endif } -template -void HeterComm::create_storage(int start_index, - int end_index, - size_t keylen, - size_t vallen) { +template +void HeterComm::create_storage( + int start_index, int end_index, size_t keylen, size_t vallen) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -199,9 +207,10 @@ void HeterComm::create_storage(int start_index, #endif } -template -void HeterComm::destroy_storage(int start_index, - int end_index) { +template +void HeterComm::destroy_storage( + int start_index, int end_index) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; 
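  // Editor's note (hedged sketch, not part of the commit): the per-key byte
  // sizes used by the HeterComm constructor above when sizing its tables,
  // and by create_storage() when sizing the staging buffers, now come from
  // the process-wide accessor wrapper rather than a CommonFeatureValueAccessor
  // member. Assuming the wrapper was initialised once, e.g. with
  // GlobalAccessorTransfor::GetInstance().Init("CtrDymfAccessor"), a caller
  // queries the sizes like this:
  //
  //   auto* wrapper =
  //       GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
  //   int mf_dim = 8;  // hypothetical embedx dimension
  //   size_t val_bytes = wrapper->GetFeatureValueSize(mf_dim);   // feature value bytes
  //   size_t grad_bytes = wrapper->GetPushValueSize(mf_dim);     // push gradient bytes
  //
  // The old call sites wrapped these sizes in TYPEALIGN(8, ...); the wrapper
  // returns the accessor's Size() directly, so any alignment still required
  // is assumed to be handled inside the accessor itself.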
auto& nodes = path_[start_index][end_index].nodes_; @@ -216,12 +225,11 @@ void HeterComm::destroy_storage(int start_index, #endif } -template -void HeterComm::walk_to_dest(int start_index, - int num, int* h_left, - int* h_right, - KeyType* src_key, - GradType* src_val) { +template +void HeterComm::walk_to_dest( + int start_index, int num, int* h_left, int* h_right, KeyType* src_key, + GradType* src_val) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -288,8 +296,9 @@ void HeterComm::walk_to_dest(int start_index, } } -template -void HeterComm::walk_to_dest( +template +void HeterComm::walk_to_dest( int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, char* src_val, size_t val_size) { int need_copy_val = 0; @@ -305,43 +314,45 @@ void HeterComm::walk_to_dest( auto& node = path_[start_index][i].nodes_[0]; CopyTask t(&path_[start_index][i], 0); que.push(t); - CUDA_CHECK(cudaMemcpyAsync(node.key_storage, - reinterpret_cast(src_key + h_left[i]), - node.key_bytes_len, cudaMemcpyDefault, node.in_stream)); + CUDA_CHECK(cudaMemcpyAsync( + node.key_storage, reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, cudaMemcpyDefault, node.in_stream)); if (need_copy_val) { - CUDA_CHECK(cudaMemcpyAsync(node.val_storage, - src_val + uint64_t(h_left[i]) * uint64_t(val_size), - node.val_bytes_len, cudaMemcpyDefault, node.in_stream)); + CUDA_CHECK(cudaMemcpyAsync( + node.val_storage, src_val + uint64_t(h_left[i]) * uint64_t(val_size), + node.val_bytes_len, cudaMemcpyDefault, node.in_stream)); } } while (!que.empty()) { CopyTask& cur_task = que.front(); que.pop(); if (cur_task.path->nodes_[cur_task.step].sync) { - CUDA_CHECK(cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream)); + CUDA_CHECK(cudaStreamSynchronize( + cur_task.path->nodes_[cur_task.step].in_stream)); } if (cur_task.step != cur_task.path->nodes_.size() - 1) { int cur_step = cur_task.step; CopyTask c(cur_task.path, cur_step + 1); que.push(c); - CUDA_CHECK(cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, - cur_task.path->nodes_[cur_step].key_storage, - cur_task.path->nodes_[cur_step + 1].key_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step + 1].in_stream)); + CUDA_CHECK(cudaMemcpyAsync( + cur_task.path->nodes_[cur_step + 1].key_storage, + cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream)); if (need_copy_val) { - CUDA_CHECK(cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step + 1].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step + 1].in_stream)); + CUDA_CHECK(cudaMemcpyAsync( + cur_task.path->nodes_[cur_step + 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cudaMemcpyDefault, cur_task.path->nodes_[cur_step + 1].in_stream)); } } } } -template -void HeterComm::walk_to_src( +template +void HeterComm::walk_to_src( int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, size_t val_size) { std::queue que; @@ -353,16 +364,17 @@ void HeterComm::walk_to_src( auto& node = path_[start_index][i].nodes_[cur_step]; if (cur_step == 0) { CUDA_CHECK(cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, - node.val_storage, node.val_bytes_len, cudaMemcpyDefault, - node.out_stream)); + node.val_storage, node.val_bytes_len, + cudaMemcpyDefault, node.out_stream)); } else { 
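      // Editor's note (descriptive sketch of the existing copy pipeline, no
      // new logic): when the path back to the source GPU has more than one
      // hop, the values fetched on the remote device do not go straight into
      // src_val. They walk backwards through the staging nodes recorded in
      // path_[start_index][i].nodes_, one asynchronous cudaMemcpyAsync per
      // hop, each queued as a CopyTask so copies on independent paths can
      // overlap:
      //
      //   nodes_[last].val_storage
      //     --> nodes_[last - 1].val_storage --> ... --> nodes_[0].val_storage
      //     --> src_val + h_left[i] * val_size
      //
      // The queue loop below drains the CopyTask entries and issues the
      // final hop once a task reaches step 0.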
CopyTask t(&path_[start_index][i], cur_step - 1); que.push(t); - CUDA_CHECK(cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage, - node.val_storage, - path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, - cudaMemcpyDefault, - path_[start_index][i].nodes_[cur_step - 1].out_stream)); + CUDA_CHECK(cudaMemcpyAsync( + path_[start_index][i].nodes_[cur_step - 1].val_storage, + node.val_storage, + path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + path_[start_index][i].nodes_[cur_step - 1].out_stream)); } } while (!que.empty()) { @@ -375,24 +387,25 @@ void HeterComm::walk_to_src( if (cur_step > 0) { CopyTask c(cur_task.path, cur_step - 1); que.push(c); - CUDA_CHECK(cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step - 1].out_stream)); + CUDA_CHECK(cudaMemcpyAsync( + cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream)); } else if (cur_step == 0) { int end_index = cur_task.path->nodes_.back().dev_num; - CUDA_CHECK(cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream)); + CUDA_CHECK(cudaMemcpyAsync( + src_val + uint64_t(h_left[end_index]) * val_size, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream)); } } } -template -HeterComm::~HeterComm() { +template +HeterComm::~HeterComm() { if (!multi_mf_dim_) { for (auto& table : tables_) { delete table; @@ -410,15 +423,19 @@ HeterComm::~HeterComm() { } } -template -void HeterComm::show_one_table(int gpu_num) { +template +void HeterComm::show_one_table( + int gpu_num) { if (!multi_mf_dim_) { tables_[gpu_num]->show(); } } -template -void HeterComm::show_table_collisions() { +template +void HeterComm::show_table_collisions() { size_t idx = 0; for (auto& table : tables_) { if (table != nullptr) { @@ -433,8 +450,9 @@ void HeterComm::show_table_collisions() { } } -template -int HeterComm::log2i(int x) { +template +int HeterComm::log2i(int x) { unsigned res = 0; while (x >>= 1) { ++res; @@ -442,13 +460,16 @@ int HeterComm::log2i(int x) { return res; } -template -int HeterComm::get_index_by_devid(int devid) { +template +int HeterComm::get_index_by_devid( + int devid) { return resource_->get_index_by_devid(devid); } -template -void HeterComm::set_sparse_sgd( +template +void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -456,8 +477,9 @@ void HeterComm::set_sparse_sgd( } } -template -void HeterComm::set_embedx_sgd( +template +void HeterComm::set_embedx_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -465,8 +487,9 @@ void HeterComm::set_embedx_sgd( } } -template -void HeterComm::build_ps( +template +void HeterComm::build_ps( int dev_num, KeyType* h_keys, ValType* h_vals, size_t len, size_t chunk_size, int stream_num, int offset) { if (len <= 0) { @@ -512,8 +535,8 @@ void 
HeterComm::build_ps( if (offset == -1) offset = dev_num; tables_[offset]->insert( reinterpret_cast(d_key_bufs[cur_stream]->ptr()), - reinterpret_cast(d_val_bufs[cur_stream]->ptr()), (size_t)tmp_len, - cur_use_stream); + reinterpret_cast(d_val_bufs[cur_stream]->ptr()), + (size_t)tmp_len, cur_use_stream); cur_stream += 1; cur_len += tmp_len; @@ -524,12 +547,11 @@ void HeterComm::build_ps( } } -template -void HeterComm::build_ps(int num, KeyType* h_keys, - char* pool, size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { +template +void HeterComm::build_ps( + int num, KeyType* h_keys, char* pool, size_t len, size_t feature_value_size, + size_t chunk_size, int stream_num) { if (len <= 0) { return; } @@ -577,8 +599,9 @@ void HeterComm::build_ps(int num, KeyType* h_keys, } } -template -void HeterComm::merge_grad( +template +void HeterComm::merge_grad( int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len) { // NOLINT int dev_id = resource_->dev_id(dev_num); @@ -618,10 +641,10 @@ void HeterComm::merge_grad( sync_stream(stream); } -template -void HeterComm::dynamic_merge_grad( - int gpu_num, KeyType* d_keys, float* d_grads, size_t len, - int& uniq_len) { +template +void HeterComm::dynamic_merge_grad( + int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len) { int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -630,14 +653,15 @@ void HeterComm::dynamic_merge_grad( size_t temp_storage_bytes; size_t grad_dim = max_mf_dim_; - size_t grad_value_size = TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); auto d_merge_grads = memory::Alloc(place, len * grad_value_size); - float* d_merge_grads_ptr = - reinterpret_cast(d_merge_grads->ptr()); + float* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); uint32_t* d_fea_num_info_ptr = @@ -688,7 +712,8 @@ void HeterComm::dynamic_merge_grad( PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); heter_comm_kernel_->merge_gradient( d_keys, d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, - (char*)d_merge_grads_ptr, uniq_len, grad_dim, grad_value_size, merger_, stream); + (char*)d_merge_grads_ptr, uniq_len, grad_dim, grad_value_size, merger_, + stream, feature_value_accessor_); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len, @@ -696,8 +721,9 @@ void HeterComm::dynamic_merge_grad( PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template -void HeterComm::split_input_to_shard( +template +void HeterComm::split_input_to_shard( KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, int dev_num) { int total_device = resource_->total_device(); @@ -739,11 +765,10 @@ void HeterComm::split_input_to_shard( sync_stream(stream); } -template -void HeterComm::pull_sparse(int num, - KeyType* d_keys, - float* d_vals, - size_t len) { +template +void HeterComm::pull_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { if (len == 0) { return; } @@ -786,7 +811,9 @@ void 
HeterComm::pull_sparse(int num, auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - size_t val_type_size = TYPEALIGN(8, feature_value_accessor_.common_feature_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); @@ -832,12 +859,14 @@ void HeterComm::pull_sparse(int num, if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->get(reinterpret_cast(node.key_storage), node.val_storage, h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + resource_->remote_stream(i, num), + feature_value_accessor_); } else { ptr_tables_[i]->get( d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_vals_ptr) + h_left[i] * val_type_size, - h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num)); + h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num), + feature_value_accessor_); } } @@ -857,7 +886,8 @@ void HeterComm::pull_sparse(int num, } } heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, - val_type_size, stream); + val_type_size, stream, + feature_value_accessor_); sync_stream(stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -870,13 +900,12 @@ void HeterComm::pull_sparse(int num, } #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - float* d_grads, - size_t len, - Sgd& sgd) { // NOLINT +void HeterComm::push_sparse( + int dev_num, KeyType* d_keys, float* d_grads, size_t len, + Sgd& sgd) { // NOLINT if (len == 0) { return; } @@ -884,8 +913,9 @@ void HeterComm::push_sparse(int dev_num, int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); - size_t grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); @@ -936,10 +966,10 @@ void HeterComm::push_sparse(int dev_num, split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - + heter_comm_kernel_->dy_mf_fill_shard_grads( - d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, - uniq_len, grad_value_size, stream); + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, uniq_len, + grad_value_size, stream, feature_value_accessor_); sync_stream(stream); @@ -957,11 +987,11 @@ void HeterComm::push_sparse(int dev_num, continue; } create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * grad_value_size); + shard_len * grad_value_size); } walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, - reinterpret_cast(d_shard_grads_ptr), grad_value_size); + reinterpret_cast(d_shard_grads_ptr), grad_value_size); } for (int i = 0; i < total_device; ++i) { @@ -977,14 +1007,14 @@ void HeterComm::push_sparse(int dev_num, ptr_tables_[i]->rwlock_->WRLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->update(reinterpret_cast(node.key_storage), - 
node.val_storage, h_right[i] - h_left[i] + 1, - sgd, resource_->remote_stream(i, dev_num)); + node.val_storage, h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, dev_num)); } else { ptr_tables_[i]->update(d_shard_keys_ptr + h_left[i], - reinterpret_cast(d_shard_grads_ptr) + - grad_value_size * h_left[i], - h_right[i] - h_left[i] + 1, sgd, - resource_->remote_stream(i, dev_num)); + reinterpret_cast(d_shard_grads_ptr) + + grad_value_size * h_left[i], + h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, dev_num)); } } @@ -998,7 +1028,7 @@ void HeterComm::push_sparse(int dev_num, } } } - + if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -1010,11 +1040,10 @@ void HeterComm::push_sparse(int dev_num, } #elif defined(PADDLE_WITH_XPU_KP) -template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - GradType* d_grads, - size_t len) { +template +void HeterComm::push_sparse( + int dev_num, KeyType* d_keys, GradType* d_grads, size_t len) { if (len == 0) { return; } @@ -1130,9 +1159,10 @@ void HeterComm::push_sparse(int dev_num, #endif #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::update_one_table( +void HeterComm::update_one_table( int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd) { // NOLINT if (len == 0) { @@ -1148,9 +1178,10 @@ void HeterComm::update_one_table( cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); } -template +template template -void HeterComm::push_sparse_multi_node( +void HeterComm::push_sparse_multi_node( int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd) { // NOLINT if (len == 0) { @@ -1169,8 +1200,9 @@ void HeterComm::push_sparse_multi_node( storage_[gpu_num].local_grads, uniq_len, sgd); } -template -int HeterComm::gather_one_node_grad( +template +int HeterComm::gather_one_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int total_gpu = resource_->total_device(); int dev_id = resource_->dev_id(gpu_num); @@ -1256,8 +1288,9 @@ int HeterComm::gather_one_node_grad( return ret; } -template -int HeterComm::gather_multi_node_grad( +template +int HeterComm::gather_multi_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int dev_id = resource_->dev_id(gpu_num); auto& storage = storage_[gpu_num]; @@ -1317,8 +1350,9 @@ int HeterComm::gather_multi_node_grad( } #endif -template -void HeterComm::end_pass() { +template +void HeterComm::end_pass() { int total_device = resource_->total_device(); std::vector threads; @@ -1339,8 +1373,10 @@ void HeterComm::end_pass() { } } -// template -// void HeterComm::dump_to_cpu(int index) { +// template +// void HeterComm::dump_to_cpu(int +// index) { // auto stream = resource_->local_stream(index, 0); // int dev_id = resource_->dev_id(index); // platform::CUDADeviceGuard guard(dev_id); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index d7c6d65d4c4ef7..22c997b32fcaf1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -117,30 +117,28 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } -template +template __global__ void dy_mf_fill_shard_grads_kernel( KeyType* d_shard_keys, KeyType* d_keys, float* d_shard_grads, float* d_grads, T* idx, size_t len, size_t grad_value_size, - FVAceessor feature_value_accessor) { + 
FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_shard_keys[i] = d_keys[idx[i]]; float* cur = (float*)((char*)d_shard_grads + i * grad_value_size); - float* shard_val = (float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + float* shard_val = + (float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); feature_value_accessor.PushValueFill(cur, shard_val); } } -template -__global__ void merge_gradients_basic_kernel(const KeyType* d_keys, - const uint32_t* offset, - const uint32_t* fea_num, - const uint32_t* index, const char* input, - char* output, int n, - size_t grad_value_size, - DynamicGradMerger& merger, - FVAceessor& feature_value_accessor) { +template +__global__ void merge_gradients_basic_kernel( + const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, int n, + size_t grad_value_size, DynamicGradMerger& merger, + FVAccessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -148,8 +146,7 @@ __global__ void merge_gradients_basic_kernel(const KeyType* d_keys, uint32_t num = fea_num[i]; int ori_index = index[start]; float* out = (float*)(output + i * grad_value_size); - float* in = - (float*)(input + size_t(ori_index) * grad_value_size); + float* in = (float*)(input + size_t(ori_index) * grad_value_size); merger.update_basic(out, in, feature_value_accessor); KeyType key = d_keys[i]; if (key != 0) { @@ -162,16 +159,12 @@ __global__ void merge_gradients_basic_kernel(const KeyType* d_keys, } } -template -__global__ void merge_gradients_embedx_kernel(const KeyType* d_keys, - const uint32_t* offset, - const uint32_t* fea_num, - const uint32_t* index, const char* input, - char* output, int n, - size_t grad_dim, - size_t grad_value_size, - DynamicGradMerger& merger, - FVAceessor& feature_value_accessor) { +template +__global__ void merge_gradients_embedx_kernel( + const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, int n, + size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, + FVAccessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -194,16 +187,17 @@ __global__ void merge_gradients_embedx_kernel(const KeyType* d_keys, } } -template +template __global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals, float* d_vals, T* idx, size_t len, size_t val_size, - FVAceessor feature_value_accessor) { + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { uint64_t new_offset = uint64_t(idx[i]) * val_size; float* cur = (float*)((char*)d_vals + new_offset); float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size); - int mf_dim = int(shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]); + int mf_dim = int( + shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]); feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim); } @@ -299,44 +293,46 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, debug_synchronous)); } -template +template void HeterCommKernel::dy_mf_fill_shard_grads( KeyType* d_shard_keys, KeyType* d_keys, float* d_shard_grads, float* d_grads, T* idx, long long len, size_t grad_value_size, - const StreamType& stream) { + const StreamType& stream, FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t 
c_len = (size_t)len; dy_mf_fill_shard_grads_kernel<<>>( - d_shard_keys, d_keys, d_shard_grads, d_grads, idx, c_len, - grad_value_size, feature_value_accessor_); + d_shard_keys, d_keys, d_shard_grads, d_grads, idx, c_len, grad_value_size, + feature_value_accessor); } -template +template void HeterCommKernel::merge_gradient( - const KeyType* d_keys, - const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, - const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, - DynamicGradMerger& merger, const StreamType& stream) { + const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, int n, + size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, + const StreamType& stream, FVAccessor& feature_value_accessor) { int grid_size1 = (n - 1) / block_size_ + 1; merge_gradients_basic_kernel<<>>( - d_keys, - offset, fea_num, index, input, output, n, grad_value_size, merger, feature_value_accessor_); + d_keys, offset, fea_num, index, input, output, n, grad_value_size, merger, + feature_value_accessor); if (grad_dim > 0) { int grid_size2 = (n * grad_dim - 1) / block_size_ + 1; merge_gradients_embedx_kernel<<>>( - d_keys, - offset, fea_num, index, input, output, n * grad_dim, grad_dim, grad_value_size, merger, feature_value_accessor_); + d_keys, offset, fea_num, index, input, output, n * grad_dim, grad_dim, + grad_value_size, merger, feature_value_accessor); } } -template +template void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals, float* d_vals, T* idx, long long len, size_t val_size, - const StreamType& stream) { + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; dy_mf_fill_dvals_kernel<<>>( - d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor_); + d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor); } template void HeterCommKernel::fill_idx( @@ -364,12 +360,10 @@ template void HeterCommKernel::fill_shard_key( unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len, const cudaStream_t& stream); -template void HeterCommKernel::fill_shard_grads< - unsigned long, float, int, cudaStream_t>( - unsigned long* d_shard_keys, unsigned long* d_keys, - float* d_shard_grads, - float* d_grads, int* idx, long long len, - const cudaStream_t& stream); +template void +HeterCommKernel::fill_shard_grads( + unsigned long* d_shard_keys, unsigned long* d_keys, float* d_shard_grads, + float* d_grads, int* idx, long long len, const cudaStream_t& stream); template void HeterCommKernel::fill_dvals( @@ -405,27 +399,33 @@ template void HeterCommKernel::reduce_by_key< int num_items, cudaStream_t stream, bool debug_synchronous); template void HeterCommKernel::dy_mf_fill_shard_grads< - unsigned long, int, cudaStream_t>( - unsigned long* d_shard_keys, unsigned long* d_keys, - float* d_shard_grads, float* d_grads, int* idx, long long len, - size_t grad_value_size, const cudaStream_t& stream); - -template void HeterCommKernel::merge_gradient( - const uint32_t* d_keys, - const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, - const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, - DynamicGradMerger& merger_, const cudaStream_t& stream); - -template void HeterCommKernel::merge_gradient( - const uint64_t* d_keys, - const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, - const char* input, char* output, int n, 
size_t grad_dim, size_t grad_value_size, - DynamicGradMerger& merger_, const cudaStream_t& stream); - -template void HeterCommKernel::dy_mf_fill_dvals( - float* d_shard_vals, - float* d_vals, int* idx, long long len, - size_t val_size, const cudaStream_t& stream); + unsigned long, int, cudaStream_t, CommonFeatureValueAccessor>( + unsigned long* d_shard_keys, unsigned long* d_keys, float* d_shard_grads, + float* d_grads, int* idx, long long len, size_t grad_value_size, + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); + +template void HeterCommKernel::merge_gradient( + const uint32_t* d_keys, const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, int n, + size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); + +template void HeterCommKernel::merge_gradient( + const uint64_t* d_keys, const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, int n, + size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); + +template void HeterCommKernel::dy_mf_fill_dvals( + float* d_shard_vals, float* d_vals, int* idx, long long len, + size_t val_size, const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 473b16bbe48ecb..5dc11b86ab63d2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -41,37 +41,42 @@ struct DynamicGradMerger { return out; } - __device__ __forceinline__ void update_one(float* output, const float* input, - CommonFeatureValueAccessor& feature_value_accessor) { + __device__ __forceinline__ void update_one( + float* output, const float* input, + CommonFeatureValueAccessor& feature_value_accessor) { feature_value_accessor.PushValueFill(output, input); } - __device__ __forceinline__ void merge_one(float* output, const float* input, - CommonFeatureValueAccessor& feature_value_accessor) { + __device__ __forceinline__ void merge_one( + float* output, const float* input, + CommonFeatureValueAccessor& feature_value_accessor) { feature_value_accessor.MergePushValue(output, input); - } - __device__ __forceinline__ void update_basic(float* output, const float* input, - CommonFeatureValueAccessor& fv_accessor) { + __device__ __forceinline__ void update_basic( + float* output, const float* input, + CommonFeatureValueAccessor& fv_accessor) { fv_accessor.PushValueFillBasic(output, input); } - __device__ __forceinline__ void merge_basic(float* output, const float* input, - CommonFeatureValueAccessor& fv_accessor) { + __device__ __forceinline__ void merge_basic( + float* output, const float* input, + CommonFeatureValueAccessor& fv_accessor) { fv_accessor.MergePushValueBasic(output, input); } - __device__ __forceinline__ void update_embedx(float* output, const float* input, size_t embedx_idx, - CommonFeatureValueAccessor& fv_accessor) { + __device__ __forceinline__ void update_embedx( + float* output, const float* input, size_t embedx_idx, + CommonFeatureValueAccessor& fv_accessor) { if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) { output[fv_accessor.common_push_value.EmbedxGIndex() 
+ embedx_idx] = input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx]; } } - __device__ __forceinline__ void merge_embedx(float* output, const float* input, size_t embedx_idx, - CommonFeatureValueAccessor& fv_accessor) { + __device__ __forceinline__ void merge_embedx( + float* output, const float* input, size_t embedx_idx, + CommonFeatureValueAccessor& fv_accessor) { if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) { output[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx] += input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx]; @@ -84,7 +89,10 @@ class HeterCommKernel { HeterCommKernel() {} explicit HeterCommKernel(const int block_size) : block_size_(block_size) {} - explicit HeterCommKernel(const int block_size, CommonFeatureValueAccessor& feature_value_accessor) : block_size_(block_size), feature_value_accessor_(feature_value_accessor) {} + // explicit HeterCommKernel(const int block_size, CommonFeatureValueAccessor& + // feature_value_accessor) : block_size_(block_size), + // feature_value_accessor_(feature_value_accessor) {} + // explicit HeterCommKernel(const int block_size) : block_size_(block_size) {} template void fill_idx(T* idx, long long len, const StreamType& stream); @@ -134,24 +142,29 @@ class HeterCommKernel { StreamType stream = NULL, bool debug_synchronous = false); - template + template void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, - float* d_shard_grads, float* d_grads, - T* idx, long long len, size_t grad_value_size, - const StreamType& stream); - - template - void merge_gradient(const KeyType* d_shard_keys, const uint32_t* offset, const uint32_t* fea_num, - const uint32_t* index, const char* input, char* output, - int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, - const StreamType& stream); - - template + float* d_shard_grads, float* d_grads, T* idx, + long long len, size_t grad_value_size, + const StreamType& stream, + FVAccessor& feature_value_accessor); + + template + void merge_gradient(const KeyType* d_shard_keys, const uint32_t* offset, + const uint32_t* fea_num, const uint32_t* index, + const char* input, char* output, int n, size_t grad_dim, + size_t grad_value_size, DynamicGradMerger& merger, + const StreamType& stream, + FVAccessor& feature_value_accessor); + + template void dy_mf_fill_dvals(float* d_shard_vals, float* d_vals, T* idx, long long len, size_t val_size, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); - CommonFeatureValueAccessor feature_value_accessor_; + // CommonFeatureValueAccessor feature_value_accessor_; private: int block_size_{256}; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc index 1ec006f580c96c..855842644abf72 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -22,18 +22,26 @@ namespace framework { HeterPsBase* HeterPsBase::get_instance( size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type) { - return new HeterPs(capacity, resource, feature_value_accessor, optimizer_type); + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } else { + VLOG(0) << " 
HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type_; + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type) { - comm_ = - std::make_shared>( - capacity, resource); - feature_value_accessor_ = feature_value_accessor; +HeterPs::HeterPs(size_t capacity, std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); optimizer_type_ = optimizer_type; } @@ -60,8 +68,8 @@ void HeterPs::end_pass() { comm_->end_pass(); } void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } -void HeterPs::push_sparse(int num, FeatureKey* d_keys, - float* d_grads, size_t len) { +void HeterPs::push_sparse(int num, FeatureKey* d_keys, float* d_grads, + size_t len) { comm_->push_sparse(num, d_keys, d_grads, len); // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 037ec1415c7bde..1f06343d40087b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -22,83 +22,127 @@ namespace framework { HeterPsBase* HeterPsBase::get_instance( size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type) { - return new HeterPs(capacity, resource, feature_value_accessor, optimizer_type); + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type; + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type) { - comm_ = - std::make_shared>( - capacity, resource, feature_value_accessor); - feature_value_accessor_ = feature_value_accessor; +template +HeterPs::HeterPs( + size_t capacity, std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); + feature_value_accessor_.Configure(fleet_config); + set_accessor(feature_value_accessor_); + accessor_type_ = accessor_type; optimizer_type_ = optimizer_type; } -HeterPs::~HeterPs() {} +template +HeterPs::~HeterPs() {} -void HeterPs::pull_sparse(int num, FeatureKey* d_keys, float* d_vals, - size_t len) { +template +void HeterPs::pull_sparse(int num, FeatureKey* d_keys, + float* d_vals, size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } -void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, - size_t feature_value_size, size_t chunk_size, - int stream_num) { +template +void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, + size_t len, size_t feature_value_size, + size_t chunk_size, int stream_num) { comm_->build_ps(num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); } -int HeterPs::get_index_by_devid(int devid) { 
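// Editor's note (hedged usage sketch, not part of the commit): callers no
// longer construct a CommonFeatureValueAccessor themselves. They hand the
// factory the fleet config map plus an accessor class name and an optimizer
// id (1 = adagrad, 3 = adam, 4 = shared_adam), mirroring the call made from
// PSGPUWrapper::BuildGPUTask later in this patch; capacity and resource are
// whatever the caller already holds. The map's template arguments are
// stripped in this patch text, so a string-to-float map and the "embedx_dim"
// key are assumed here purely for illustration:
//
//   std::unordered_map<std::string, float> fleet_config;   // assumed type
//   fleet_config["embedx_dim"] = 8.0f;                      // hypothetical key
//   HeterPsBase* ps = HeterPsBase::get_instance(
//       capacity, resource, fleet_config, "CtrDymfAccessor",
//       /*optimizer_type=*/3);  // 3 selects SparseAdamOptimizer
//
// get_instance then instantiates HeterPs with CommonFeatureValueAccessor as
// the template argument (also stripped in this text); its constructor calls
// feature_value_accessor_.Configure(fleet_config) and set_accessor(), so the
// underlying HeterComm sees the configured layout before any
// build_ps/pull_sparse/push_sparse call.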
+template +int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } -void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_sparse_sgd(optimizer_config); } -void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_embedx_sgd(optimizer_config); } -void HeterPs::end_pass() { comm_->end_pass(); } +template +void HeterPs::end_pass() { + comm_->end_pass(); +} -void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } +template +void HeterPs::show_one_table(int gpu_num) { + comm_->show_one_table(gpu_num); +} -void HeterPs::push_sparse(int num, FeatureKey* d_keys, - float* d_grads, size_t len) { - if (optimizer_type_ == 3) { //adam - auto optimizer = SparseAdamOptimizer(feature_value_accessor_); - VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():" << optimizer.EmbedDim(); - comm_->push_sparse(num, d_keys, d_grads, len, optimizer); - } else if (optimizer_type_ == 4) { //shared_adam - auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_); - VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():" << optimizer.EmbedDim(); - comm_->push_sparse(num, d_keys, d_grads, len, optimizer); +template +void HeterPs::push_sparse(int num, FeatureKey* d_keys, + float* d_grads, size_t len) { + if (accessor_type_ == "CtrDymfAccessor") { + if (optimizer_type_ == 3) { // adam + auto optimizer = SparseAdamOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 4) { // shared_adam + auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 1) { // adagrad { + auto optimizer = SparseAdagradOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else { + VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1)," + "adam(3) or shared_adam(4), bug get optimizer type:" + << optimizer_type_; + } } else { - auto optimizer = SparseAdagradOptimizer(feature_value_accessor_); - VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():" << optimizer.EmbedDim(); - comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get " + << accessor_type_; } } -void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size) { +template +void HeterPs::set_nccl_comm_and_size( + const std::vector& inner_comms, + const std::vector& inter_comms, int comm_size) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } -void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { +template +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); } -void HeterPs::set_accessor(CommonFeatureValueAccessor& accessor) { +template +void HeterPs::set_accessor(FVAccessor& accessor) { comm_->set_accessor(accessor); } -void 
HeterPs::show_table_collisions() { +template +void HeterPs::show_table_collisions() { comm_->show_table_collisions(); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 89ec93f63db1cf..db0253500ffb6a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -25,12 +25,13 @@ limitations under the License. */ namespace paddle { namespace framework { +template class HeterPs : public HeterPsBase { public: HeterPs() {} HeterPs(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type); + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type); virtual ~HeterPs(); HeterPs(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete; @@ -47,7 +48,8 @@ class HeterPs : public HeterPsBase { const std::vector& inter_comms, int comm_size) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; - void set_accessor(CommonFeatureValueAccessor& accessor) override; + + void set_accessor(FVAccessor& accessor); #endif void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; @@ -56,14 +58,14 @@ class HeterPs : public HeterPsBase { void end_pass() override; int get_index_by_devid(int devid) override; void show_one_table(int gpu_num) override; - void push_sparse(int num, FeatureKey* d_keys, float* d_grads, - size_t len); + void push_sparse(int num, FeatureKey* d_keys, float* d_grads, size_t len); void show_table_collisions() override; private: - std::shared_ptr> comm_; + std::shared_ptr> comm_; #if defined(PADDLE_WITH_CUDA) - CommonFeatureValueAccessor feature_value_accessor_; + FVAccessor feature_value_accessor_; + std::string accessor_type_; int optimizer_type_; #endif }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index aa74335b1a5e4c..e285921274af95 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -42,22 +42,22 @@ class HeterPsBase { const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; - virtual void set_accessor(CommonFeatureValueAccessor& accessor) = 0; #endif virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; virtual void show_table_collisions() = 0; - virtual void push_sparse(int num, FeatureKey* d_keys, - float* d_grads, size_t len) = 0; + virtual void push_sparse(int num, FeatureKey* d_keys, float* d_grads, + size_t len) = 0; virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; - static HeterPsBase* get_instance(size_t capacity, - std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type); + static HeterPsBase* get_instance( + size_t capacity, std::shared_ptr resource, + // CommonFeatureValueAccessor feature_value_accessor, + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type); }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index f7f89450158ef5..1283be92e19e51 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -33,8 +33,8 @@ limitations under the 
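// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the shape of the templated
// HeterPs<FVAccessor> declaration above. The accessor becomes a by-value
// member of the concrete class, so set_accessor turns into an ordinary member
// function and drops out of the virtual HeterPsBase interface. Names here are
// illustrative, not the real Paddle classes.
#include <string>

struct BaseStub {
  virtual ~BaseStub() = default;
  virtual void end_pass() = 0;  // accessor-agnostic API stays virtual
};

template <typename FVAccessor>
class TemplatedPsStub : public BaseStub {
 public:
  // Non-virtual: the accessor type is fixed at template-instantiation time.
  void set_accessor(FVAccessor& accessor) { accessor_ = accessor; }
  void end_pass() override {}

 private:
  FVAccessor accessor_;        // like feature_value_accessor_
  std::string accessor_type_;  // e.g. "CtrDymfAccessor"
  int optimizer_type_ = 1;
};

struct DummyAccessor {};
int main() {
  TemplatedPsStub<DummyAccessor> ps;
  DummyAccessor acc;
  ps.set_accessor(acc);
}
// ---------------------------------------------------------------------------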
License. */ #include #include -#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif @@ -582,102 +582,112 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { return; } std::vector threads(device_num); - HeterPs_ = HeterPsBase::get_instance(size_max, resource_, feature_value_accessor_, optimizer_type_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + HeterPs_ = HeterPsBase::get_instance(size_max, resource_, fleet_config_, + accessor_class_, optimizer_type_); #ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); HeterPs_->set_sparse_sgd(optimizer_config_); HeterPs_->set_embedx_sgd(optimizer_config_); #endif - auto build_dynamic_mf_func = [this, &gpu_task](int i, int j) { - this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - // this->HeterPs_->set_accessor(feature_value_accessor_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim - << " feature_value_dim:" << feature_value_accessor_.common_feature_value.Dim(mf_dim) - << " feature_value_size:" << feature_value_accessor_.common_feature_value.Size(mf_dim); - size_t feature_value_size = - TYPEALIGN(8, feature_value_accessor_.common_feature_value.Size(mf_dim)); - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - this->mem_pools_[i * this->multi_mf_dim_ + j] = - new MemoryPool(len, feature_value_size); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - for (size_t k = 0; k < len; k++) { - float* val = (float*)(mem_pool->mem_address(k)); - float* ptr_val = device_dim_ptrs[k]->data(); - size_t dim = device_dim_ptrs[k]->size(); + auto build_dynamic_mf_func = + [this, &gpu_task, &accessor_wrapper_ptr](int i, int j) { + this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim + << " feature_value_size:" + << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + this->mem_pools_[i * this->multi_mf_dim_ + j] = + new MemoryPool(len, feature_value_size); + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + for (size_t k = 0; k < len; k++) { + // float* val = (float*)(mem_pool->mem_address(k)); + void* val = mem_pool->mem_address(k); + float* ptr_val = device_dim_ptrs[k]->data(); + size_t dim = device_dim_ptrs[k]->size(); #ifdef PADDLE_WITH_PSLIB - val->delta_score = + val->delta_score = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::delta_score_index()]; + val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()]; + val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()]; + val->slot = + int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()]); + val->lr = 
ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()]; + val->lr_g2sum = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()]; + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::delta_score_index()]; - val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::show_index()]; - val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::click_index()]; - val->slot = int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::slot_index()]); - val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_w_index()]; - val->lr_g2sum = - ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_g2sum_index()]; - // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor - ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: - mf_dim_index()] = float(mf_dim); - val->mf_dim = mf_dim; - if (dim > 8) { // CpuPS alreay expand as mf_dim - val->mf_size = mf_dim + 1; - for (int x = 0; x < val->mf_dim + 1; x++) { - val->mf[x] = ptr_val[x + 8]; - } - } else { - val->mf_size = 0; - for (int x = 0; x < val->mf_dim + 1; x++) { - val->mf[x] = 0; + DownpourCtrDymfFeatureValue::mf_dim_index()] = + float(mf_dim); + val->mf_dim = mf_dim; + if (dim > 8) { // CpuPS alreay expand as mf_dim + val->mf_size = mf_dim + 1; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = ptr_val[x + 8]; + } + } else { + val->mf_size = 0; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = 0; + } + } } - } - } #endif #ifdef PADDLE_WITH_PSCORE - VLOG(5) << "cpu build "<< k << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) - << " |: "<< cpu_table_accessor_->ParseToString(ptr_val, dim); - feature_value_accessor_.BuildFill(val, ptr_val, cpu_table_accessor_, mf_dim, dim); - *(reinterpret_cast(val + feature_value_accessor_.common_feature_value.CpuPtrIndex())) = (uint64_t)(device_dim_ptrs[k]); - VLOG(5) << "build "<< k << " : "<< feature_value_accessor_.ParseToString(val, feature_value_accessor_.common_feature_value.Dim(mf_dim)); - } + VLOG(5) << "cpu build " << k + << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) + << " |: " << cpu_table_accessor_->ParseToString(ptr_val, dim); + accessor_wrapper_ptr->BuildFill(val, device_dim_ptrs[k], + cpu_table_accessor_, mf_dim); + VLOG(5) << "build " << k << " : " + << accessor_wrapper_ptr->ParseToString( + (float*)(val), + int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / + sizeof(float))); + } #endif - platform::CUDADeviceGuard guard(resource_->dev_id(i)); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); - this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); - auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); + auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len, - feature_value_size, 500000, 2); + this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len, + feature_value_size, 500000, 2); - if (device_dim_keys.size() > 0) { - VLOG(0) << "show ptr table: " << i - << " table kv size: " << device_dim_keys.size() - << "dim: " << mf_dim << " len: " << len; - this->HeterPs_->show_one_table(i); - } - delete mem_pool; - }; - 
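// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the addressing convention the
// build_dynamic_mf_func above relies on — one flat byte buffer whose k-th
// feature value lives at offset k * feature_value_size. MemoryPoolStub is an
// illustrative stand-in for MemoryPool/HBMMemoryPool, not the real classes.
#include <cstddef>
#include <vector>

class MemoryPoolStub {
 public:
  MemoryPoolStub(size_t capacity, size_t value_size)
      : value_size_(value_size), bytes_(capacity * value_size, 0) {}
  // Equivalent of mem_pool->mem_address(k): fixed-stride indexing.
  void* mem_address(size_t k) { return bytes_.data() + k * value_size_; }
  char* mem() { return bytes_.data(); }
  size_t capacity() const { return bytes_.size() / value_size_; }

 private:
  size_t value_size_;
  std::vector<char> bytes_;
};

int main() {
  // Example stride only; the real size comes from
  // accessor_wrapper_ptr->GetFeatureValueSize(mf_dim).
  const size_t feature_value_size = 8 * sizeof(float);
  MemoryPoolStub pool(/*capacity=*/1000, feature_value_size);
  float* val = reinterpret_cast<float*>(pool.mem_address(42));
  val[0] = 1.0f;  // BuildFill then populates the fields one by one
  return 0;
}
// ---------------------------------------------------------------------------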
threads.resize(device_num * multi_mf_dim_); - for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); - } + if (device_dim_keys.size() > 0) { + VLOG(0) << "show ptr table: " << i + << " table kv size: " << device_dim_keys.size() << "dim: " << mf_dim + << " len: " << len; + this->HeterPs_->show_one_table(i); } - - for (std::thread& t : threads) { - t.join(); + delete mem_pool; +}; +threads.resize(device_num* multi_mf_dim_); +for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); } - timeline.Pause(); - VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() - << " s."; +} + +for (std::thread& t : threads) { + t.join(); +} +timeline.Pause(); +VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; } void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { @@ -787,101 +797,109 @@ void PSGPUWrapper::EndPass() { } } - auto dump_pool_to_cpu_func = [this](int i, int j) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); - auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = this->current_task_->device_dim_keys_[i][j]; - size_t len = device_keys.size(); - int mf_dim = this->index_dim_vec_[j]; - size_t feature_value_size = - TYPEALIGN(8, feature_value_accessor_.common_feature_value.Size(mf_dim)); - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim - << " key_len :" << len << " feature_value_size:" << feature_value_size; - - char* test_build_values = (char*)malloc(feature_value_size * len); - cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, - cudaMemcpyDeviceToHost); - - CHECK(len == hbm_pool->capacity()); - uint64_t unuse_key = std::numeric_limits::max(); - for (size_t index = 0; index < len; ++index) { - if (device_keys[index] == unuse_key) { - continue; - } - size_t offset = index * feature_value_size; - float* gpu_val = (float*)(test_build_values + offset); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + auto dump_pool_to_cpu_func = + [this, &accessor_wrapper_ptr](int i, int j) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); + auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim + << " key_len :" << len + << " feature_value_size:" << feature_value_size; + + char* test_build_values = (char*)malloc(feature_value_size * len); + cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, + cudaMemcpyDeviceToHost); + + CHECK(len == hbm_pool->capacity()); + uint64_t unuse_key = std::numeric_limits::max(); + for (size_t index = 0; index < len; ++index) { + if (device_keys[index] == unuse_key) { + continue; + } + size_t offset = index * feature_value_size; + float* gpu_val = (float*)(test_build_values + offset); #ifdef PADDLE_WITH_PSLIB - auto* downpour_value = - (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val->mf_size > 0 && downpour_value_size == 8) { - downpour_value->resize(gpu_val->mf_dim + 1 + 
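// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the per-key walk that
// dump_pool_to_cpu_func performs after copying the HBM pool back to host
// memory — fixed-stride offsets plus a sentinel key (uint64_t max) marking
// unused slots. The buffer here is host-only; the real code fills it with
// cudaMemcpy before the loop and writes fields back through DumpFill.
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
  const size_t feature_value_size = 8 * sizeof(float);  // illustrative stride
  const uint64_t unuse_key = std::numeric_limits<uint64_t>::max();

  std::vector<uint64_t> device_keys = {11, unuse_key, 42};
  std::vector<char> host_copy(device_keys.size() * feature_value_size, 0);

  for (size_t index = 0; index < device_keys.size(); ++index) {
    if (device_keys[index] == unuse_key) continue;  // nothing stored here
    size_t offset = index * feature_value_size;
    float* gpu_val = reinterpret_cast<float*>(host_copy.data() + offset);
    std::printf("key %llu -> first field %f\n",
                (unsigned long long)device_keys[index], gpu_val[0]);
  }
}
// ---------------------------------------------------------------------------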
downpour_value_size); - } - float* cpu_val = downpour_value->data(); - cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: - delta_score_index()] = gpu_val->delta_score; - cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: - show_index()] = gpu_val->show; - cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: - click_index()] = gpu_val->clk; - cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: - embed_w_index()] = gpu_val->lr; - cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: - embed_g2sum_index()] = gpu_val->lr_g2sum; - cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: - slot_index()] = gpu_val->slot; - - if (gpu_val->mf_size > 0) { - for (int x = 0; x < gpu_val->mf_dim + 1; x++) { - cpu_val[x + 8] = gpu_val->mf[x]; + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::delta_score_index()] = + gpu_val->delta_score; + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()] = + gpu_val->show; + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()] = + gpu_val->clk; + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()] = + gpu_val->lr; + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()] = + gpu_val->lr_g2sum; + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] = + gpu_val->slot; + + if (gpu_val->mf_size > 0) { + for (int x = 0; x < gpu_val->mf_dim + 1; x++) { + cpu_val[x + 8] = gpu_val->mf[x]; + } + } } - } - } #endif #ifdef PADDLE_WITH_PSCORE - auto* downpour_value = - (paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast(gpu_val+ feature_value_accessor_.common_feature_value.CpuPtrIndex()))); - size_t downpour_value_size = downpour_value->size(); - if (gpu_val[feature_value_accessor_.common_feature_value.MfSizeIndex()] > 0 && - downpour_value_size == (cpu_table_accessor_->GetAccessorInfo().dim - - int(cpu_table_accessor_->GetAccessorInfo().mf_size / sizeof(float)))) { // cpu_accessor - downpour_value->resize(cpu_table_accessor_->common_feature_value.Dim(mf_dim)); + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); + auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*( + reinterpret_cast(gpu_val))); + float* cpu_val = downpour_value->data(); + VLOG(5) << "dump to cpu " << index << " gpu_value: " + << accessor_wrapper_ptr->ParseToString( + gpu_val, + int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / + sizeof(float))) + << " \t cpu_value:" + << cpu_table_accessor_->ParseToString(cpu_val, + downpour_value->size()); } - float* cpu_val = downpour_value->data(); - - feature_value_accessor_.DumpFill(cpu_val, gpu_val, cpu_table_accessor_, mf_dim); - VLOG(5) << "dump to cpu "<< index << " : "<< feature_value_accessor_.ParseToString(gpu_val, feature_value_accessor_.common_feature_value.Dim(mf_dim)) - << " ===== CPU:" << cpu_table_accessor_->ParseToString(cpu_val, downpour_value->size()); - - } #endif - free(test_build_values); - }; - if (multi_mf_dim_) { - VLOG(0) << "psgpu 
wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_); - for (size_t i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); - } - } - for (std::thread& t : threads) { - t.join(); + free(test_build_values); +}; +if (multi_mf_dim_) { + VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); } } - if (keysize_max != 0) { - HeterPs_->end_pass(); - } - VLOG(0) << "HeterPs_->end_pass end"; - for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; + for (std::thread& t : threads) { + t.join(); } - gpu_task_pool_.Push(current_task_); - current_task_ = nullptr; - gpu_free_channel_->Put(current_task_); - timer.Pause(); - VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; +} +if (keysize_max != 0) { + HeterPs_->end_pass(); +} +VLOG(0) << "HeterPs_->end_pass end"; +for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; +} +gpu_task_pool_.Push(current_task_); +current_task_ = nullptr; +gpu_free_channel_->Put(current_task_); +timer.Pause(); +VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, @@ -890,7 +908,8 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - VLOG(0) << "Warning:: recommand use pull_gpups_sparse op instead. This PullSparse is not used."; + VLOG(0) << "Warning:: recommand use pull_gpups_sparse op instead. 
This " + "PullSparse is not used."; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, @@ -908,9 +927,12 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); size_t feature_value_size = 0; - feature_value_size = TYPEALIGN(8, feature_value_accessor_.common_feature_value.Size(max_mf_dim_)); - VLOG(3) << "PullSparse max_dim:" << max_mf_dim_ << " feature_value_size:" << feature_value_size; - + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + VLOG(3) << "PullSparse max_dim:" << max_mf_dim_ + << " feature_value_size:" << feature_value_size; + #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begine Gpu Ps PullSparse"; auto buf = memory::Alloc(place, total_length * feature_value_size); @@ -967,9 +989,10 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, - static_cast(slot_lengths.size()), hidden_size, - total_length, gpu_dim); + accessor_wrapper_ptr->CopyForPull( + place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, total_length, + gpu_dim, val_type_size_); pull_gpups_timer.Pause(); @@ -1013,9 +1036,10 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, xpu_keys, values, total_values_gpu, xpu_len, - static_cast(slot_lengths.size()), hidden_size, - total_length); + accessor_wrapper_ptr->CopyForPull( + place, xpu_keys, values, total_values_gpu, xpu_len, + static_cast(slot_lengths.size()), hidden_size, total_length, + val_type_size_); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1041,12 +1065,13 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); // #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin GPUPS PushSparseGrad"; - size_t grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto buf = memory::Alloc(place, total_length * grad_value_size); - VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_ << "grad_value_size:" << grad_value_size; - float* total_grad_values_gpu = - reinterpret_cast(buf->ptr()); + VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_ + << "grad_value_size:" << grad_value_size; + float* total_grad_values_gpu = reinterpret_cast(buf->ptr()); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); @@ -1058,10 +1083,11 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to gpups struct"; - - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - total_length, batch_size, grad_value_size); - + + accessor_wrapper_ptr->CopyForPush( + place, grad_values, total_grad_values_gpu, slot_lengths, total_length, + batch_size, grad_value_size, slot_vector_, slot_mf_dim_vector_); + VLOG(3) << 
"Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; push_gpups_timer.Start(); @@ -1077,8 +1103,9 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to xpups struct"; - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - hidden_size, total_length, batch_size); + accessor_wrapper_ptr->CopyForPush(place, grad_values, total_grad_values_gpu, + slot_lengths, hidden_size, total_length, + batch_size, slot_vector_); VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index << " len: " << total_length; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 2f9d5147fb0e4e..cf37737716f4c2 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -25,65 +25,67 @@ limitations under the License. */ namespace paddle { namespace framework { -__global__ void PullCopy(float** dest, const FeatureValue* src, - const int64_t* len, int hidden, int slot_num, - int total_len, uint64_t** keys) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[x - 1] : 0); - if (*(keys[x] + y) == 0) { - *(dest[x] + y * hidden) = 0; - *(dest[x] + y * hidden + 1) = 0; - *(dest[x] + y * hidden + 2) = 0; - } else { - *(dest[x] + y * hidden) = (src + i)->show; - *(dest[x] + y * hidden + 1) = (src + i)->clk; - *(dest[x] + y * hidden + 2) = (src + i)->lr; - } - if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = 0; - } - } else { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; - } - } - } -} - -template -__global__ void PullCopy(float** dest, const float* src, - const int64_t* len, int slot_num, int total_len, - uint64_t** keys, uint64_t max_val_size, int* gpu_dim, - FVAceessor feature_value_accessor) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[x - 1] : 0); - float* feature_value_ptr = - (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); - int mf_dim = gpu_dim[x] - 3; - feature_value_accessor.Select(dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); - } -} +// __global__ void PullCopy(float** dest, const FeatureValue* src, +// const int64_t* len, int hidden, int slot_num, +// int total_len, uint64_t** keys) { +// CUDA_KERNEL_LOOP(i, total_len) { +// int low = 0; +// int high = slot_num - 1; +// while (low < high) { +// int mid = (low + high) / 2; +// if (i < len[mid]) +// high = mid; +// else +// low = mid + 1; +// } +// int x = low; +// int y = i - (x ? 
len[x - 1] : 0); +// if (*(keys[x] + y) == 0) { +// *(dest[x] + y * hidden) = 0; +// *(dest[x] + y * hidden + 1) = 0; +// *(dest[x] + y * hidden + 2) = 0; +// } else { +// *(dest[x] + y * hidden) = (src + i)->show; +// *(dest[x] + y * hidden + 1) = (src + i)->clk; +// *(dest[x] + y * hidden + 2) = (src + i)->lr; +// } +// if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { +// for (int j = 0; j < hidden - 3; j++) { +// *(dest[x] + y * hidden + 3 + j) = 0; +// } +// } else { +// for (int j = 0; j < hidden - 3; j++) { +// *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; +// } +// } +// } +// } + +// template +// __global__ void PullCopy(float** dest, const float* src, +// const int64_t* len, int slot_num, int total_len, +// uint64_t** keys, uint64_t max_val_size, int* +// gpu_dim, +// FVAccessor feature_value_accessor) { +// CUDA_KERNEL_LOOP(i, total_len) { +// int low = 0; +// int high = slot_num - 1; +// while (low < high) { +// int mid = (low + high) / 2; +// if (i < len[mid]) +// high = mid; +// else +// low = mid + 1; +// } +// int x = low; +// int y = i - (x ? len[x - 1] : 0); +// float* feature_value_ptr = +// (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); +// int mf_dim = gpu_dim[x] - 3; +// feature_value_accessor.Select(dest[x] + y * (mf_dim + 3), +// feature_value_ptr, keys[x] + y, mf_dim); +// } +// } __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, int slot_num, @@ -129,86 +131,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, } } -template -__global__ void PushCopyWithPool(float* dest, float** src, - int64_t* len, int slot_num, uint64_t total_len, - int bs, int* slot_vector, int* mf_dim_vector, - size_t grad_value_size, - FVAceessor feature_value_accessor) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[low - 1] : 0); - float* cur = - (float*)((char*)dest + i * grad_value_size); - - cur[feature_value_accessor.common_push_value.SlotIndex()] = - (float)slot_vector[x]; - int mf_dim = mf_dim_vector[x]; - cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; - - cur[feature_value_accessor.common_push_value.ShowIndex()] = - *(src[x] + y * (mf_dim + 3)); - cur[feature_value_accessor.common_push_value.ClickIndex()] = - *(src[x] + y * (mf_dim + 3) + 1); - cur[feature_value_accessor.common_push_value.EmbedGIndex()] = - *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; - for (int j = 0; j < mf_dim; j++) { - cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
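// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the index math shared by the
// (now commented-out) PullCopy/PushCopyWithPool kernels above. len[] holds an
// inclusive prefix sum of per-slot lengths, so a flat element index i maps to
// slot x (the first slot whose cumulative length exceeds i) and in-slot
// offset y. Written as host C++ for clarity; the kernels do the same per
// CUDA thread.
#include <cassert>
#include <cstdint>

void LocateSlot(int64_t i, const int64_t* len, int slot_num,
                int* slot, int64_t* offset) {
  int low = 0, high = slot_num - 1;
  while (low < high) {
    int mid = (low + high) / 2;
    if (i < len[mid]) high = mid; else low = mid + 1;
  }
  *slot = low;
  *offset = i - (low ? len[low - 1] : 0);
}

int main() {
  // slot lengths {3, 5, 2} -> inclusive prefix sums {3, 8, 10}
  const int64_t len[] = {3, 8, 10};
  int x; int64_t y;
  LocateSlot(4, len, 3, &x, &y);
  assert(x == 1 && y == 1);  // 5th element overall is the 2nd of slot 1
  LocateSlot(9, len, 3, &x, &y);
  assert(x == 2 && y == 1);
  return 0;
}
// ---------------------------------------------------------------------------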
* bs; - } - } -} PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num, - total_length, gpu_keys); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const float* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length, int* gpu_dim) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, - val_type_size_, gpu_dim, feature_value_accessor_); - cudaStreamSynchronize(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -221,101 +145,108 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - - cudaMemcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - - PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, gpu_values, gpu_len, hidden_size, - slot_lengths.size(), total_length, batch_size, d_slot_vector); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& 
place, - const std::vector& grad_values, - float* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - auto buf_mf_dim_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); - cudaMemcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector_.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), - total_length, batch_size, d_slot_vector, d_mf_dim_vector, - grad_value_size, feature_value_accessor_); - cudaStreamSynchronize(stream); -} +// void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, +// const std::vector& grad_values, +// FeaturePushValue* total_grad_values_gpu, +// const std::vector& slot_lengths, +// const int hidden_size, +// const int64_t total_length, +// const int batch_size) { +// auto stream = dynamic_cast( +// platform::DeviceContextPool::Instance().Get(place)) +// ->stream(); +// auto slot_lengths_lod = slot_lengths; +// for (int i = 1; i < slot_lengths_lod.size(); i++) { +// slot_lengths_lod[i] += slot_lengths_lod[i - 1]; +// } +// auto buf_grad_value = +// memory::Alloc(place, grad_values.size() * sizeof(float*)); +// auto buf_length = memory::Alloc(place, slot_lengths.size() * +// sizeof(int64_t)); +// auto buf_slot_vector = +// memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + +// float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); +// int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); +// int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + +// cudaMemcpy(gpu_values, grad_values.data(), +// grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); +// cudaMemcpy(gpu_len, slot_lengths_lod.data(), +// slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); +// cudaMemcpy(d_slot_vector, slot_vector_.data(), +// slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + +// PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( +// total_grad_values_gpu, gpu_values, gpu_len, hidden_size, +// slot_lengths.size(), total_length, batch_size, d_slot_vector); +// cudaStreamSynchronize(stream); +// } + +// void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, +// const std::vector& grad_values, +// float* total_grad_values_gpu, +// const std::vector& slot_lengths, 
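// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the host-side staging that both
// CopyForPush variants above perform before launching their kernel — turning
// the per-slot lengths into an inclusive prefix sum ("lod"), which is exactly
// the len[] array the kernels binary-search. Device allocation and cudaMemcpy
// are omitted; only the arithmetic is shown.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> slot_lengths = {3, 5, 2};  // one entry per slot
  std::vector<int64_t> slot_lengths_lod = slot_lengths;
  for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
    slot_lengths_lod[i] += slot_lengths_lod[i - 1];
  }
  int64_t total_length = slot_lengths_lod.back();  // total kernel work items
  std::printf("lod: %lld %lld %lld, total: %lld\n",
              (long long)slot_lengths_lod[0], (long long)slot_lengths_lod[1],
              (long long)slot_lengths_lod[2], (long long)total_length);
  // In the real code these vectors are then copied host-to-device and the
  // PushCopy / PushCopyWithPool kernel is launched over total_length items.
}
// ---------------------------------------------------------------------------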
+// const uint64_t total_length, +// const int batch_size, size_t grad_value_size) +// { +// auto stream = dynamic_cast( +// platform::DeviceContextPool::Instance().Get(place)) +// ->stream(); +// auto slot_lengths_lod = slot_lengths; +// for (int i = 1; i < slot_lengths_lod.size(); i++) { +// slot_lengths_lod[i] += slot_lengths_lod[i - 1]; +// } +// auto buf_grad_value = +// memory::Alloc(place, grad_values.size() * sizeof(float*)); +// auto buf_length = memory::Alloc(place, slot_lengths.size() * +// sizeof(int64_t)); +// auto buf_slot_vector = +// memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); +// auto buf_mf_dim_vector = +// memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); +// float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); +// int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); +// int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); +// int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); +// cudaMemcpy(gpu_values, grad_values.data(), +// grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); +// cudaMemcpy(gpu_len, slot_lengths_lod.data(), +// slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); +// cudaMemcpy(d_slot_vector, slot_vector_.data(), +// slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); +// cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector_.data(), +// slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); +// auto accessor_wrapper_ptr = +// GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); +// PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( +// total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), +// total_length, batch_size, d_slot_vector, d_mf_dim_vector, +// grad_value_size, accessor_wrapper_ptr->GetGPUAccessor()); +// cudaStreamSynchronize(stream); +// } void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, float initial_range, float beta1_decay_rate, float beta2_decay_rate, float ada_epsilon) { - optimizer_config_.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, - learning_rate, initial_g2sum, initial_range, - beta1_decay_rate, beta2_decay_rate, ada_epsilon); + optimizer_config_.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, + max_bound, learning_rate, initial_g2sum, + initial_range, beta1_decay_rate, + beta2_decay_rate, ada_epsilon); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_learning_rate, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, float mf_max_bound, float mf_beta1_decay_rate, - float mf_beta2_decay_rate, float mf_ada_epsilon) { - optimizer_config_.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, - mf_initial_g2sum, mf_initial_range, - mf_min_bound, mf_max_bound, mf_beta1_decay_rate, - mf_beta2_decay_rate, mf_ada_epsilon); + float mf_beta2_decay_rate, + float mf_ada_epsilon) { + optimizer_config_.set_embedx_sgd( + mf_create_thresholds, mf_learning_rate, mf_initial_g2sum, + mf_initial_range, mf_min_bound, mf_max_bound, mf_beta1_decay_rate, + mf_beta2_decay_rate, mf_ada_epsilon); } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 6da628db724555..73c756f8bf73f1 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -27,6 +27,7 @@ limitations under the License. 
*/ #include #ifdef PADDLE_WITH_GLOO #include + #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif @@ -50,10 +51,10 @@ limitations under the License. */ #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/ps/wrapper/fleet.h" +#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" -#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #endif #ifdef PADDLE_WITH_PSLIB #include "afs_api.h" @@ -65,9 +66,6 @@ limitations under the License. */ namespace paddle { namespace framework { -#define TYPEALIGN(ALIGNVAL, LEN) \ - (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) - class Dataset; #ifdef PADDLE_WITH_PSLIB @@ -130,28 +128,28 @@ class PSGPUWrapper { void CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, int total_len); - void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, const int64_t* gpu_len, - const int slot_num, const int hidden_size, - const int64_t total_length); - void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, - const std::vector& values, - const float* total_values_gpu, const int64_t* gpu_len, - const int slot_num, const int hidden_size, - const int64_t total_length, int* gpu_dim); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, const int64_t total_length, - const int batch_size); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - float* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, const int batch_size, - size_t grad_value_size); + // void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, + // const std::vector& values, + // const FeatureValue* total_values_gpu, const int64_t* + // gpu_len, const int slot_num, const int hidden_size, const + // int64_t total_length); + // void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, + // const std::vector& values, + // const float* total_values_gpu, const int64_t* gpu_len, + // const int slot_num, const int hidden_size, + // const int64_t total_length, int* gpu_dim); + // void CopyForPush(const paddle::platform::Place& place, + // const std::vector& grad_values, + // FeaturePushValue* total_grad_values_gpu, + // const std::vector& slot_lengths, + // const int hidden_size, const int64_t total_length, + // const int batch_size); + // void CopyForPush(const paddle::platform::Place& place, + // const std::vector& grad_values, + // float* total_grad_values_gpu, + // const std::vector& slot_lengths, + // const uint64_t total_length, const int batch_size, + // size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); @@ -259,7 +257,7 @@ class PSGPUWrapper { float mf_min_bound, float mf_max_bound, float mf_beta1_decay_rate, float mf_beta2_decay_rate, float mf_ada_epsilon); - + #ifdef PADDLE_WITH_PSCORE void add_sparse_optimizer( std::unordered_map& config, // NOLINT @@ -309,11 
+307,11 @@ class PSGPUWrapper { void InitializeGPUServer(paddle::distributed::PSParameter ps_param) { auto sparse_table = - ps_param.server_param().downpour_server_param().downpour_table_param(0); + ps_param.server_param().downpour_server_param().downpour_table_param(0); auto sparse_table_accessor = sparse_table.accessor(); auto sparse_table_accessor_parameter = sparse_table_accessor.ctr_accessor_param(); - auto accessor_class = sparse_table_accessor.accessor_class(); + accessor_class_ = sparse_table_accessor.accessor_class(); std::unordered_map config; config["embedx_dim"] = sparse_table_accessor.embedx_dim(); @@ -321,17 +319,20 @@ class PSGPUWrapper { config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); - if (accessor_class == "CtrDymfAccessor") { + if (accessor_class_ == "CtrDymfAccessor") { // optimizer config for embed_w and embedx add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); add_sparse_optimizer(config, sparse_table_accessor.embedx_sgd_param(), - "mf_"); + "mf_"); } - feature_value_accessor_.Configure(config); + fleet_config_ = config; + GlobalAccessorTransfor::GetInstance().Init(accessor_class_); + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure( + config); InitializeGPUServer(config); } - #endif +#endif void InitializeGPUServer(std::unordered_map config) { float nonclk_coeff = (config.find("nonclk_coeff") == config.end()) @@ -342,9 +343,8 @@ class PSGPUWrapper { float min_bound = (config.find("min_bound") == config.end()) ? -10.0 : config["min_bound"]; - float max_bound = (config.find("max_bound") == config.end()) - ? 10.0 - : config["max_bound"]; + float max_bound = + (config.find("max_bound") == config.end()) ? 10.0 : config["max_bound"]; float learning_rate = (config.find("learning_rate") == config.end()) ? 0.05 : config["learning_rate"]; @@ -361,8 +361,8 @@ class PSGPUWrapper { ? 0.999 : config["beta2_decay_rate"]; float ada_epsilon = (config.find("ada_epsilon") == config.end()) - ? 1e-8 - : config["ada_epsilon"]; + ? 1e-8 + : config["ada_epsilon"]; // mf config settings float mf_create_thresholds = (config.find("mf_create_thresholds") == config.end()) @@ -383,35 +383,37 @@ class PSGPUWrapper { float mf_max_bound = (config.find("mf_max_bound") == config.end()) ? 10.0 : config["mf_max_bound"]; - float mf_beta1_decay_rate = (config.find("mf_beta1_decay_rate") == config.end()) - ? 0.9 - : config["mf_beta1_decay_rate"]; - float mf_beta2_decay_rate = (config.find("mf_beta2_decay_rate") == config.end()) - ? 0.999 - : config["mf_beta2_decay_rate"]; + float mf_beta1_decay_rate = + (config.find("mf_beta1_decay_rate") == config.end()) + ? 0.9 + : config["mf_beta1_decay_rate"]; + float mf_beta2_decay_rate = + (config.find("mf_beta2_decay_rate") == config.end()) + ? 0.999 + : config["mf_beta2_decay_rate"]; float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end()) - ? 1e-8 - : config["mf_ada_epsilon"]; + ? 
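// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the "default if missing" lookup
// pattern that InitializeGPUServer repeats for every hyper-parameter above,
// factored into one helper for illustration.
#include <cstdio>
#include <string>
#include <unordered_map>

static float GetWithDefault(
    const std::unordered_map<std::string, float>& config,
    const std::string& key, float default_value) {
  auto it = config.find(key);
  return it == config.end() ? default_value : it->second;
}

int main() {
  std::unordered_map<std::string, float> config = {{"learning_rate", 0.05f}};
  float learning_rate = GetWithDefault(config, "learning_rate", 0.05f);
  float mf_ada_epsilon = GetWithDefault(config, "mf_ada_epsilon", 1e-8f);
  std::printf("lr=%f eps=%g\n", learning_rate, mf_ada_epsilon);
}
// ---------------------------------------------------------------------------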
1e-8 + : config["mf_ada_epsilon"]; this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound, - learning_rate, initial_g2sum, initial_range, - beta1_decay_rate, beta2_decay_rate, ada_epsilon); - this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, - mf_initial_g2sum, mf_initial_range, mf_min_bound, - mf_max_bound, mf_beta1_decay_rate, mf_beta2_decay_rate, - mf_ada_epsilon); + learning_rate, initial_g2sum, initial_range, + beta1_decay_rate, beta2_decay_rate, ada_epsilon); + this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, mf_initial_g2sum, + mf_initial_range, mf_min_bound, mf_max_bound, + mf_beta1_decay_rate, mf_beta2_decay_rate, + mf_ada_epsilon); // set optimizer type(naive,adagrad,std_adagrad,adam,share_adam) optimizer_type_ = (config.find("optimizer_type") == config.end()) - ? 1 - : int(config["optimizer_type"]); + ? 1 + : int(config["optimizer_type"]); embedx_dim_ = (config.find("embedx_dim") == config.end()) - ? 8 - : int(config["embedx_dim"]); - if (optimizer_type_ == 3) { //adam + ? 8 + : int(config["embedx_dim"]); + if (optimizer_type_ == 3) { // adam embed_sgd_dim_ = 4; embedx_sgd_dim_ = embedx_dim_ * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam embed_sgd_dim_ = 4; embedx_sgd_dim_ = 4; } else { @@ -419,8 +421,9 @@ class PSGPUWrapper { embedx_sgd_dim_ = 1; } - VLOG(0) << "InitializeGPUServer embed_sgd_dim_:" << embed_sgd_dim_ << " embedx_sgd_dim_:" - << embedx_sgd_dim_ << " embedx_dim_:" << embedx_dim_ + VLOG(0) << "InitializeGPUServer embed_sgd_dim_:" << embed_sgd_dim_ + << " embedx_sgd_dim_:" << embedx_sgd_dim_ + << " embedx_dim_:" << embedx_dim_ << " optimizer_type_:" << optimizer_type_; } @@ -507,9 +510,13 @@ class PSGPUWrapper { for (size_t i = 0; i < slot_index_vec_.size(); i++) { slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; } - val_type_size_ = TYPEALIGN(8, feature_value_accessor_.common_feature_value.Size(max_mf_dim_)); - grad_type_size_ = TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); - VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ << " grad_type_size_:" << grad_type_size_; + + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ + << " grad_type_size_:" << grad_type_size_; slot_info_initialized_ = true; } #endif @@ -530,11 +537,11 @@ class PSGPUWrapper { #ifdef PADDLE_WITH_PSCORE void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) { - cpu_table_accessor_ = dynamic_cast(accessor); + cpu_table_accessor_ = + dynamic_cast(accessor); } #endif - CommonFeatureValueAccessor feature_value_accessor_; private: static std::shared_ptr s_instance_; Dataset* dataset_; @@ -591,6 +598,8 @@ class PSGPUWrapper { int embed_sgd_dim_ = 1; int embedx_sgd_dim_ = 1; int embedx_dim_ = 8; + std::string accessor_class_; + std::unordered_map fleet_config_; #ifdef PADDLE_WITH_PSCORE paddle::distributed::CtrDymfAccessor* cpu_table_accessor_; #endif @@ -619,6 +628,7 @@ class PSGPUWrapper { std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; OptimizerConfig optimizer_config_; + protected: static bool is_initialized_; }; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index f1084dc4d758bc..b9a9b961ecf859 100644 --- 
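// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the per-optimizer state sizing set
// in InitializeGPUServer above — how many extra floats each optimizer keeps
// next to embed_w and the embedx table (ids: 1 adagrad, 3 adam, 4 shared_adam).
#include <cstdio>

void SgdDims(int optimizer_type, int embedx_dim,
             int* embed_sgd_dim, int* embedx_sgd_dim) {
  if (optimizer_type == 3) {         // adam
    *embed_sgd_dim = 4;
    *embedx_sgd_dim = embedx_dim * 2 + 2;
  } else if (optimizer_type == 4) {  // shared_adam
    *embed_sgd_dim = 4;
    *embedx_sgd_dim = 4;
  } else {                           // adagrad / naive
    *embed_sgd_dim = 1;
    *embedx_sgd_dim = 1;
  }
}

int main() {
  int e, ex;
  SgdDims(/*optimizer_type=*/3, /*embedx_dim=*/8, &e, &ex);
  std::printf("adam: embed_sgd_dim=%d embedx_sgd_dim=%d\n", e, ex);  // 4, 18
}
// ---------------------------------------------------------------------------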
a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -169,32 +169,33 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - float* buf_value = nullptr; - xpu_malloc(reinterpret_cast(&buf_value), - values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(&buf_value); - xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), - XPU_HOST_TO_DEVICE); +// void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, +// uint64_t** gpu_keys, +// const std::vector& values, +// const FeatureValue* total_values_gpu, +// const int64_t* gpu_len, const int slot_num, +// const int hidden_size, +// const int64_t total_length) { +// XPUStream stream = nullptr; +// auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); +// stream = static_cast(dev_ctx) +// ->x_context() +// ->xpu_stream; +// float* buf_value = nullptr; +// xpu_malloc(reinterpret_cast(&buf_value), +// values.size() * sizeof(float*)); +// float** gpu_values = reinterpret_cast(&buf_value); +// xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), +// XPU_HOST_TO_DEVICE); - unsigned long long** c_keys = (unsigned long long**)gpu_keys; - const long long* c_len = (const long long*)gpu_len; - PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, - slot_num, total_length, c_keys); +// unsigned long long** c_keys = (unsigned long long**)gpu_keys; +// const long long* c_len = (const long long*)gpu_len; +// PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, +// hidden_size, +// slot_num, total_length, c_keys); - xpu_wait(stream); -} +// xpu_wait(stream); +// } void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, @@ -213,50 +214,50 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, xpu_wait(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } +// void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, +// const std::vector& grad_values, +// FeaturePushValue* total_grad_values_gpu, +// const std::vector& slot_lengths, +// const int hidden_size, +// const int64_t total_length, +// const int batch_size) { +// XPUStream stream = nullptr; +// auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); +// stream = static_cast(dev_ctx) +// ->x_context() +// ->xpu_stream; +// auto slot_lengths_lod = slot_lengths; +// for (size_t i = 1; i < slot_lengths_lod.size(); i++) { +// 
slot_lengths_lod[i] += slot_lengths_lod[i - 1]; +// } - float* buf_grad_value = nullptr; - int64_t* buf_length = nullptr; - int* buf_slot_vector = nullptr; +// float* buf_grad_value = nullptr; +// int64_t* buf_length = nullptr; +// int* buf_slot_vector = nullptr; - xpu_malloc(reinterpret_cast(&buf_grad_value), - grad_values.size() * sizeof(float*)); - xpu_malloc(reinterpret_cast(&buf_length), - slot_lengths.size() * sizeof(int64_t)); - xpu_malloc(reinterpret_cast(&buf_slot_vector), - slot_lengths_lod.size() * sizeof(int)); +// xpu_malloc(reinterpret_cast(&buf_grad_value), +// grad_values.size() * sizeof(float*)); +// xpu_malloc(reinterpret_cast(&buf_length), +// slot_lengths.size() * sizeof(int64_t)); +// xpu_malloc(reinterpret_cast(&buf_slot_vector), +// slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(&buf_grad_value); - int64_t* gpu_len = reinterpret_cast(buf_length); - int* d_slot_vector = reinterpret_cast(buf_slot_vector); - xpu_memcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); - xpu_memcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); - xpu_memcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); +// float** gpu_values = reinterpret_cast(&buf_grad_value); +// int64_t* gpu_len = reinterpret_cast(buf_length); +// int* d_slot_vector = reinterpret_cast(buf_slot_vector); +// xpu_memcpy(gpu_values, grad_values.data(), +// grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); +// xpu_memcpy(gpu_len, slot_lengths_lod.data(), +// slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); +// xpu_memcpy(d_slot_vector, slot_vector_.data(), +// slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); - long long* c_len = (long long*)gpu_len; - PushCopy<<<2, 64, stream>>>(total_grad_values_gpu, gpu_values, c_len, - hidden_size, slot_lengths.size(), total_length, - batch_size, d_slot_vector); - xpu_wait(stream); -} +// long long* c_len = (long long*)gpu_len; +// PushCopy<<<2, 64, stream>>>(total_grad_values_gpu, gpu_values, c_len, +// hidden_size, slot_lengths.size(), total_length, +// batch_size, d_slot_vector); +// xpu_wait(stream); +// } } // end namespace framework } // end namespace paddle From 9133850b665c53200411a9e562227eb08413ed8b Mon Sep 17 00:00:00 2001 From: danleifeng Date: Wed, 6 Jul 2022 14:40:10 +0000 Subject: [PATCH 04/12] add feature_value.cu;test=develop --- cmake/cuda.cmake | 3 +- .../framework/fleet/heter_ps/feature_value.cu | 155 ++++++++++++++++++ .../framework/fleet/heter_ps/feature_value.h | 59 +++---- .../framework/fleet/heter_ps/hashtable.h | 4 - .../fluid/framework/fleet/ps_gpu_wrapper.cc | 31 ++-- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 27 +-- 6 files changed, 204 insertions(+), 75 deletions(-) create mode 100644 paddle/fluid/framework/fleet/heter_ps/feature_value.cu diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 4894d615c2a353..5c6bf86811e64c 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -219,7 +219,8 @@ add_definitions("-DCUDA_VERSION_MINOR=\"${CUDA_VERSION_MINOR}\"") add_definitions("-DCUDA_TOOLKIT_ROOT_DIR=\"${CUDA_TOOLKIT_ROOT_DIR}\"") # setting nvcc arch flags -select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +#select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +set(NVCC_FLAGS_EXTRA "-gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}") message(STATUS "NVCC_FLAGS_EXTRA: 
${NVCC_FLAGS_EXTRA}") diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu new file mode 100644 index 00000000000000..eff345fe44caa8 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -0,0 +1,155 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" + + +namespace paddle { +namespace framework { + + +template +__global__ void PullCopy(float** dest, const float* src, + const int64_t* len, int slot_num, int total_len, + uint64_t** keys, uint64_t max_val_size, int* gpu_dim, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[x - 1] : 0); + float* feature_value_ptr = + (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + int mf_dim = gpu_dim[x] - 3; + feature_value_accessor.Select(dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); + } +} + +template +__global__ void PushCopyWithPool(float* dest, float** src, + int64_t* len, int slot_num, uint64_t total_len, + int bs, int* slot_vector, int* mf_dim_vector, + size_t grad_value_size, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + float* cur = + (float*)((char*)dest + i * grad_value_size); + + cur[feature_value_accessor.common_push_value.SlotIndex()] = + (float)slot_vector[x]; + int mf_dim = mf_dim_vector[x]; + cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; + + cur[feature_value_accessor.common_push_value.ShowIndex()] = + *(src[x] + y * (mf_dim + 3)); + cur[feature_value_accessor.common_push_value.ClickIndex()] = + *(src[x] + y * (mf_dim + 3) + 1); + cur[feature_value_accessor.common_push_value.EmbedGIndex()] = + *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < mf_dim; j++) { + cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; + } + } +} + +template +void AccessorWrapper::CopyForPullImpl(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, + feature_value_size, gpu_dim, gpu_accessor_); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushImpl(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + cudaMemcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, slot_vector.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), + total_length, batch_size, d_slot_vector, d_mf_dim_vector, + grad_value_size, gpu_accessor_); + cudaStreamSynchronize(stream); +} + +#ifdef PADDLE_WITH_PSCORE +template class AccessorWrapper; +#endif + +} +} +#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 7569c26c132576..f5c19fc87b835b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -291,43 +291,44 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { // // build阶段从cpu_val赋值给gpu_val __host__ void BuildFill( float* gpu_val, void* cpu, - paddle::distributed::CtrDymfAccessor* cpu_table_accessor, int mf_dim) { + paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) { #ifdef 
PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = dynamic_cast(cpu_table_accessor); paddle::distributed::FixedFeatureValue* cpu_ptr = (paddle::distributed::FixedFeatureValue*)(cpu); float* cpu_val = cpu_ptr->data(); size_t cpu_dim = cpu_ptr->size(); gpu_val[common_feature_value.DeltaScoreIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.DeltaScoreIndex()]; + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()]; gpu_val[common_feature_value.ShowIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.ShowIndex()]; + cpu_val[cpu_accessor->common_feature_value.ShowIndex()]; gpu_val[common_feature_value.ClickIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.ClickIndex()]; + cpu_val[cpu_accessor->common_feature_value.ClickIndex()]; gpu_val[common_feature_value.SlotIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.SlotIndex()]; + cpu_val[cpu_accessor->common_feature_value.SlotIndex()]; gpu_val[common_feature_value.EmbedWIndex()] = - cpu_val[cpu_table_accessor->common_feature_value.EmbedWIndex()]; + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()]; for (int i = 0; i < common_feature_value.EmbedDim(); i++) { gpu_val[common_feature_value.EmbedG2SumIndex() + i] = - cpu_val[cpu_table_accessor->common_feature_value.EmbedG2SumIndex() + + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i]; } *(reinterpret_cast( gpu_val + common_feature_value.CpuPtrIndex())) = (uint64_t)(cpu); - cpu_val[cpu_table_accessor->common_feature_value.MfDimIndex()] = + cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = float(mf_dim); gpu_val[common_feature_value.MfDimIndex()] = mf_dim; if (cpu_dim > - cpu_table_accessor->GetAccessorInfo().dim - - cpu_table_accessor->GetAccessorInfo().mf_size / sizeof(float)) { + cpu_accessor->GetAccessorInfo().dim - + cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) { gpu_val[common_feature_value.MfSizeIndex()] = common_feature_value.MFSize(mf_dim) / sizeof(float); for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = cpu_val - [cpu_table_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + [cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x]; } } else { gpu_val[common_feature_value.MfSizeIndex()] = 0; @@ -340,43 +341,45 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { } // dump_to_cpu阶段从gpu_val赋值给cpu_val - __host__ __device__ void DumpFill( - float* gpu_val, paddle::distributed::CtrDymfAccessor* cpu_table_accessor, + __host__ void DumpFill( + float* gpu_val, paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) { #ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = dynamic_cast(cpu_table_accessor); + auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast( gpu_val + common_feature_value.CpuPtrIndex()))); size_t downpour_value_size = downpour_value->size(); if (gpu_val[common_feature_value.MfSizeIndex()] > 0 && downpour_value_size == - (cpu_table_accessor->GetAccessorInfo().dim - - int(cpu_table_accessor->GetAccessorInfo().mf_size / + (cpu_accessor->GetAccessorInfo().dim - + int(cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)))) { // cpu_accessor downpour_value->resize( - cpu_table_accessor->common_feature_value.Dim(mf_dim)); + cpu_accessor->common_feature_value.Dim(mf_dim)); } float* cpu_val = downpour_value->data(); - cpu_val[cpu_table_accessor->common_feature_value.DeltaScoreIndex()] = + 
cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()] = gpu_val[common_feature_value.DeltaScoreIndex()]; - cpu_val[cpu_table_accessor->common_feature_value.ShowIndex()] = + cpu_val[cpu_accessor->common_feature_value.ShowIndex()] = gpu_val[common_feature_value.ShowIndex()]; - cpu_val[cpu_table_accessor->common_feature_value.ClickIndex()] = + cpu_val[cpu_accessor->common_feature_value.ClickIndex()] = gpu_val[common_feature_value.ClickIndex()]; - cpu_val[cpu_table_accessor->common_feature_value.EmbedWIndex()] = + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()] = gpu_val[common_feature_value.EmbedWIndex()]; - cpu_val[cpu_table_accessor->common_feature_value.SlotIndex()] = + cpu_val[cpu_accessor->common_feature_value.SlotIndex()] = gpu_val[common_feature_value.SlotIndex()]; for (int i = 0; i < common_feature_value.EmbedDim(); i++) { - cpu_val[cpu_table_accessor->common_feature_value.EmbedG2SumIndex() + i] = + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i] = gpu_val[common_feature_value.EmbedG2SumIndex() + i]; } if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { - cpu_val[cpu_table_accessor->common_feature_value.EmbedxG2SumIndex() + + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x] = gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; } } @@ -627,14 +630,12 @@ class VirtualAccessor { virtual size_t GetPushValueSize(int& mf_dim) = 0; - // TODO: 在基类里调用cpu_table_accessor类型 virtual void BuildFill( void* gpu_val, void* cpu_val, - paddle::distributed::CtrDymfAccessor* cpu_table_accessor, int mf_dim) = 0; + paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) = 0; - // TODO: 在基类里调用cpu_table_accessor类型 virtual void DumpFill( - float* gpu_val, paddle::distributed::CtrDymfAccessor* cpu_table_accessor, + float* gpu_val, paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) = 0; virtual void CopyForPull(const paddle::platform::Place& place, @@ -679,13 +680,13 @@ class AccessorWrapper : public VirtualAccessor { virtual void BuildFill( void* gpu_val, void* cpu_val, - paddle::distributed::CtrDymfAccessor* cpu_table_accessor, int mf_dim) { + paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) { gpu_accessor_.BuildFill((float*)(gpu_val), cpu_val, cpu_table_accessor, mf_dim); } virtual void DumpFill( - float* gpu_val, paddle::distributed::CtrDymfAccessor* cpu_table_accessor, + float* gpu_val, paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) { gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim); } diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 0d54f87aef0b5a..2e4fd943b728ee 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -169,14 +169,10 @@ class HashTable { << " push value size: " << push_grad_value_size_; } - // void set_accessor(FVAccessor& accessor) { - // feature_value_accessor_ = accessor; - // } void show_collision(int id) { return container_->print_collision(id); } std::unique_ptr rwlock_{nullptr}; - // FVAccessor feature_value_accessor_; private: #if defined(PADDLE_WITH_CUDA) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 1283be92e19e51..dc9619e39c60dd 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -608,10 
+608,9 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { new MemoryPool(len, feature_value_size); auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; for (size_t k = 0; k < len; k++) { - // float* val = (float*)(mem_pool->mem_address(k)); void* val = mem_pool->mem_address(k); - float* ptr_val = device_dim_ptrs[k]->data(); - size_t dim = device_dim_ptrs[k]->size(); + // float* ptr_val = device_dim_ptrs[k]->data(); + // size_t dim = device_dim_ptrs[k]->size(); #ifdef PADDLE_WITH_PSLIB val->delta_score = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: @@ -647,9 +646,9 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } #endif #ifdef PADDLE_WITH_PSCORE - VLOG(5) << "cpu build " << k - << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) - << " |: " << cpu_table_accessor_->ParseToString(ptr_val, dim); + // VLOG(5) << "cpu build " << k + // << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) + // << " |: " << cpu_table_accessor_->ParseToString(ptr_val, dim); accessor_wrapper_ptr->BuildFill(val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); VLOG(5) << "build " << k << " : " @@ -860,17 +859,15 @@ void PSGPUWrapper::EndPass() { #endif #ifdef PADDLE_WITH_PSCORE accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); - auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*( - reinterpret_cast(gpu_val))); - float* cpu_val = downpour_value->data(); - VLOG(5) << "dump to cpu " << index << " gpu_value: " - << accessor_wrapper_ptr->ParseToString( - gpu_val, - int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / - sizeof(float))) - << " \t cpu_value:" - << cpu_table_accessor_->ParseToString(cpu_val, - downpour_value->size()); + // auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*( + // reinterpret_cast(gpu_val))); + // float* cpu_val = downpour_value->data(); + // VLOG(5) << "dump to cpu " << index << " gpu_value: " + // << accessor_wrapper_ptr->ParseToString(gpu_val, + // int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / sizeof(float))) + // << " \t cpu_value:" + // << cpu_table_accessor_->ParseToString(cpu_val, + // downpour_value->size()); } #endif free(test_build_values); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 73c756f8bf73f1..6369a68f67d61b 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -407,24 +407,7 @@ class PSGPUWrapper { optimizer_type_ = (config.find("optimizer_type") == config.end()) ? 1 : int(config["optimizer_type"]); - embedx_dim_ = (config.find("embedx_dim") == config.end()) - ? 
8 - : int(config["embedx_dim"]); - if (optimizer_type_ == 3) { // adam - embed_sgd_dim_ = 4; - embedx_sgd_dim_ = embedx_dim_ * 2 + 2; - } else if (optimizer_type_ == 4) { // shared_adam - embed_sgd_dim_ = 4; - embedx_sgd_dim_ = 4; - } else { - embed_sgd_dim_ = 1; - embedx_sgd_dim_ = 1; - } - - VLOG(0) << "InitializeGPUServer embed_sgd_dim_:" << embed_sgd_dim_ - << " embedx_sgd_dim_:" << embedx_sgd_dim_ - << " embedx_dim_:" << embedx_dim_ - << " optimizer_type_:" << optimizer_type_; + } void SetDate(int year, int month, int day) { @@ -537,8 +520,7 @@ class PSGPUWrapper { #ifdef PADDLE_WITH_PSCORE void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) { - cpu_table_accessor_ = - dynamic_cast(accessor); + cpu_table_accessor_ = accessor; } #endif @@ -595,13 +577,10 @@ class PSGPUWrapper { bool slot_info_initialized_ = false; int use_afs_api_ = 0; int optimizer_type_ = 1; - int embed_sgd_dim_ = 1; - int embedx_sgd_dim_ = 1; - int embedx_dim_ = 8; std::string accessor_class_; std::unordered_map fleet_config_; #ifdef PADDLE_WITH_PSCORE - paddle::distributed::CtrDymfAccessor* cpu_table_accessor_; + paddle::distributed::ValueAccessor* cpu_table_accessor_; #endif #ifdef PADDLE_WITH_CUDA From b03be497c4236c0357895c33ae799f727144940e Mon Sep 17 00:00:00 2001 From: danleifeng Date: Thu, 7 Jul 2022 13:30:41 +0800 Subject: [PATCH 05/12] Merge pull request --- .../ps/table/common_graph_table.cc | 524 ++++++++++-------- .../distributed/ps/table/common_graph_table.h | 112 ++-- .../distributed/ps/table/graph/graph_node.h | 95 +++- paddle/fluid/framework/data_feed.cu | 6 +- paddle/fluid/framework/device_worker.cc | 31 +- .../framework/fleet/heter_ps/heter_comm.h | 17 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 170 +++++- .../fleet/heter_ps/heter_comm_kernel.cu | 121 +++- .../fleet/heter_ps/heter_comm_kernel.h | 17 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 4 +- paddle/fluid/framework/hogwild_worker.cc | 6 +- paddle/fluid/platform/flags.cc | 6 + paddle/utils/string/string_helper.h | 112 ++++ 13 files changed, 879 insertions(+), 342 deletions(-) diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index dd26f7ec41d92c..c24f44c45e3b7d 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -13,19 +13,21 @@ // limitations under the License. 
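For reference, the loaders touched in this file all route keys with the same modulo convention: a key's global shard is `id % shard_num`, and only keys whose shard falls inside this instance's [shard_start, shard_end) range are kept. A minimal stand-alone sketch of that routing follows; the helper name and signature are illustrative only and are not part of the patch.

    #include <cstdint>
    #include <cstddef>

    // Hypothetical helper mirroring the checks in parse_node_file /
    // parse_edge_file: returns the local bucket index for `id`, or -1 when
    // the key belongs to another instance and should be skipped
    // ("will not load ... please check id distribution").
    inline int64_t route_to_local_shard(uint64_t id, size_t shard_num,
                                        size_t shard_start, size_t shard_end) {
      size_t shard_id = id % shard_num;          // global shard of this key
      if (shard_id >= shard_end || shard_id < shard_start) {
        return -1;                               // owned by another instance
      }
      return static_cast<int64_t>(shard_id - shard_start);  // local index
    }

The reworked get_all_id / get_all_neighbor_id paths in this commit apply the same `k % slice_num` rule when slicing keys into per-shard output buckets.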
#include "paddle/fluid/distributed/ps/table/common_graph_table.h" + #include + #include +#include #include #include #include -#include #include "gflags/gflags.h" -#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" @@ -55,8 +57,8 @@ paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( std::vector &node_ids, int slot_num) { std::vector> bags(task_pool_size_); for (int i = 0; i < task_pool_size_; i++) { - auto predsize = node_ids.size() / task_pool_size_; - bags[i].reserve(predsize * 1.2); + auto predsize = node_ids.size() / task_pool_size_; + bags[i].reserve(predsize * 1.2); } for (auto x : node_ids) { @@ -145,32 +147,31 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( } std::vector> tasks; - std::vector node_array[task_pool_size_]; // node id list + std::vector node_array[task_pool_size_]; // node id list std::vector info_array[task_pool_size_]; - std::vector edge_array[task_pool_size_]; // edge id list + std::vector edge_array[task_pool_size_]; // edge id list for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { - node_array[i].resize(bags[i].size()); - info_array[i].resize(bags[i].size()); - edge_array[i].reserve(bags[i].size()); + node_array[i].resize(bags[i].size()); + info_array[i].resize(bags[i].size()); + edge_array[i].reserve(bags[i].size()); for (size_t j = 0; j < bags[i].size(); j++) { - auto node_id = bags[i][j]; - node_array[i][j] = node_id; - Node *v = find_node(0, idx, node_id); - if (v != nullptr) { - info_array[i][j].neighbor_offset = edge_array[i].size(); - info_array[i][j].neighbor_size = v->get_neighbor_size(); - for (size_t k = 0; k < v->get_neighbor_size(); k++) { - edge_array[i].push_back(v->get_neighbor_id(k)); - } - } - else { - info_array[i][j].neighbor_offset = 0; - info_array[i][j].neighbor_size = 0; + auto node_id = bags[i][j]; + node_array[i][j] = node_id; + Node *v = find_node(0, idx, node_id); + if (v != nullptr) { + info_array[i][j].neighbor_offset = edge_array[i].size(); + info_array[i][j].neighbor_size = v->get_neighbor_size(); + for (size_t k = 0; k < v->get_neighbor_size(); k++) { + edge_array[i].push_back(v->get_neighbor_id(k)); } + } else { + info_array[i][j].neighbor_offset = 0; + info_array[i][j].neighbor_size = 0; + } } return 0; })); @@ -288,7 +289,6 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int { - char ch[sizeof(int) * 2 + sizeof(uint64_t)]; memset(ch, 0, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); @@ -429,7 +429,6 @@ void GraphTable::export_partition_files(int idx, std::string file_path) { for (int i = 0; i < part_len; i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&, i, idx, this]() -> int { - std::string output_path = file_path + "partition_" + std::to_string(i); @@ -1060,59 +1059,66 @@ std::string GraphTable::get_inverse_etype(std::string &etype) { return res; } -int32_t GraphTable::load_node_and_edge_file(std::string etype, std::string ntype, std::string epath, - 
std::string npath, int part_num, bool reverse) { +int32_t GraphTable::load_node_and_edge_file(std::string etype, + std::string ntype, + std::string epath, + std::string npath, int part_num, + bool reverse) { auto etypes = paddle::string::split_string(etype, ","); auto ntypes = paddle::string::split_string(ntype, ","); VLOG(0) << "etypes size: " << etypes.size(); VLOG(0) << "whether reverse: " << reverse; std::string delim = ";"; - size_t total_len = etypes.size() + 1; // 1 is for node + size_t total_len = etypes.size() + 1; // 1 is for node std::vector> tasks; for (size_t i = 0; i < total_len; i++) { - tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [&, i, this]() ->int { - if (i < etypes.size()) { - std::string etype_path = epath + "/" + etypes[i]; - auto etype_path_list = paddle::framework::localfs_list(etype_path); - std::string etype_path_str; - if (part_num > 0 && part_num < (int)etype_path_list.size()) { - std::vector sub_etype_path_list(etype_path_list.begin(), etype_path_list.begin() + part_num); - etype_path_str = boost::algorithm::join(sub_etype_path_list, delim); - } else { - etype_path_str = boost::algorithm::join(etype_path_list, delim); - } - this->load_edges(etype_path_str, false, etypes[i]); - if (reverse) { - std::string r_etype = get_inverse_etype(etypes[i]); - this->load_edges(etype_path_str, true, r_etype); - } - } else { - auto npath_list = paddle::framework::localfs_list(npath); - std::string npath_str; - if (part_num > 0 && part_num < (int)npath_list.size()) { - std::vector sub_npath_list(npath_list.begin(), npath_list.begin() + part_num); - npath_str = boost::algorithm::join(sub_npath_list, delim); - } else { - npath_str = boost::algorithm::join(npath_list, delim); - } - - if (ntypes.size() == 0) { - VLOG(0) << "node_type not specified, nothing will be loaded "; - return 0; - } else { - for (size_t i = 0; i < ntypes.size(); i++) { - if (feature_to_id.find(ntypes[i]) == feature_to_id.end()) { - VLOG(0) << "node_type " << ntypes[i] << "is not defined, will not load"; + tasks.push_back( + _shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int { + if (i < etypes.size()) { + std::string etype_path = epath + "/" + etypes[i]; + auto etype_path_list = paddle::framework::localfs_list(etype_path); + std::string etype_path_str; + if (part_num > 0 && part_num < (int)etype_path_list.size()) { + std::vector sub_etype_path_list( + etype_path_list.begin(), etype_path_list.begin() + part_num); + etype_path_str = + boost::algorithm::join(sub_etype_path_list, delim); + } else { + etype_path_str = boost::algorithm::join(etype_path_list, delim); + } + this->load_edges(etype_path_str, false, etypes[i]); + if (reverse) { + std::string r_etype = get_inverse_etype(etypes[i]); + this->load_edges(etype_path_str, true, r_etype); + } + } else { + auto npath_list = paddle::framework::localfs_list(npath); + std::string npath_str; + if (part_num > 0 && part_num < (int)npath_list.size()) { + std::vector sub_npath_list( + npath_list.begin(), npath_list.begin() + part_num); + npath_str = boost::algorithm::join(sub_npath_list, delim); + } else { + npath_str = boost::algorithm::join(npath_list, delim); + } + + if (ntypes.size() == 0) { + VLOG(0) << "node_type not specified, nothing will be loaded "; return 0; + } else { + for (size_t i = 0; i < ntypes.size(); i++) { + if (feature_to_id.find(ntypes[i]) == feature_to_id.end()) { + VLOG(0) << "node_type " << ntypes[i] + << "is not defined, will not load"; + return 0; + } + } } + this->load_nodes(npath_str, ""); } - } - 
this->load_nodes(npath_str, ""); - } - return 0; - })); + return 0; + })); } for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); return 0; @@ -1121,10 +1127,11 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype, std::string ntype int32_t GraphTable::get_nodes_ids_by_ranges( int type_id, int idx, std::vector> ranges, std::vector &res) { + std::mutex mutex; int start = 0, end, index = 0, total_size = 0; res.clear(); auto &shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; - std::vector>> tasks; + std::vector> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { end = total_size + shards[i]->get_size(); start = total_size; @@ -1140,41 +1147,56 @@ int32_t GraphTable::get_nodes_ids_by_ranges( first -= total_size; second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [&shards, this, first, second, i]() -> std::vector { - return shards[i]->get_ids_by_range(first, second); + [&shards, this, first, second, i, &res, &mutex]() -> size_t { + std::vector keys; + shards[i]->get_ids_by_range(first, second, &keys); + + size_t num = keys.size(); + mutex.lock(); + res.reserve(res.size() + num); + for (auto &id : keys) { + res.push_back(id); + std::swap(res[rand() % res.size()], res[(int)res.size() - 1]); + } + mutex.unlock(); + + return num; })); } } total_size += shards[i]->get_size(); } for (size_t i = 0; i < tasks.size(); i++) { - auto vec = tasks[i].get(); - for (auto &id : vec) { - res.push_back(id); - std::swap(res[rand() % res.size()], res[(int)res.size() - 1]); - } + tasks[i].get(); } return 0; } -std::pair GraphTable::parse_node_file(const std::string &path, const std::string &node_type, int idx) { +std::pair GraphTable::parse_node_file( + const std::string &path, const std::string &node_type, int idx) { std::ifstream file(path); std::string line; uint64_t local_count = 0; uint64_t local_valid_count = 0; + + int num = 0; + std::vector vals; + size_t n = node_type.length(); while (std::getline(file, line)) { - size_t start = line.find_first_of('\t'); - if (start == std::string::npos) continue; - std::string parse_node_type = line.substr(0, start); - if (parse_node_type != node_type) { + if (strncmp(line.c_str(), node_type.c_str(), n) != 0) { + continue; + } + vals.clear(); + num = paddle::string::split_string_ptr(line.c_str() + n + 1, + line.length() - n - 1, '\t', &vals); + if (num == 0) { continue; } - size_t end = line.find_first_of('\t', start + 1); - uint64_t id = std::stoull(line.substr(start +1, end - start - 1)); + uint64_t id = std::strtoul(vals[0].ptr, NULL, 10); size_t shard_id = id % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { VLOG(4) << "will not load " << id << " from " << path - << ", please check id distribution"; + << ", please check id distribution"; continue; } local_count++; @@ -1183,20 +1205,20 @@ std::pair GraphTable::parse_node_file(const std::string &pat auto node = feature_shards[idx][index]->add_feature_node(id, false); if (node != NULL) { node->set_feature_size(feat_name[idx].size()); - while (end != std::string::npos) { - start = end; - end = line.find_first_of('\t', start + 1); - std::string tmp_str = line.substr(start + 1, end - start - 1); - parse_feature(idx, tmp_str, node); + for (int i = 1; i < n; ++i) { + auto &v = vals[i]; + parse_feature(idx, v.ptr, v.len, node); } } local_valid_count++; } - VLOG(0) << "node_type[" << node_type << "] loads " << local_count << " nodes from filepath->" << path; + VLOG(2) << "node_type[" << node_type << "] loads " 
<< local_count + << " nodes from filepath->" << path; return {local_count, local_valid_count}; } -std::pair GraphTable::parse_node_file(const std::string &path) { +std::pair GraphTable::parse_node_file( + const std::string &path) { std::ifstream file(path); std::string line; uint64_t local_count = 0; @@ -1206,40 +1228,44 @@ std::pair GraphTable::parse_node_file(const std::string &pat auto path_split = paddle::string::split_string(path, "/"); auto path_name = path_split[path_split.size() - 1]; + int num = 0; + std::vector vals; + while (std::getline(file, line)) { - size_t start = line.find_first_of('\t'); - if (start == std::string::npos) continue; - std::string parse_node_type = line.substr(0, start); + vals.clear(); + num = paddle::string::split_string_ptr(line.c_str(), line.length(), '\t', + &vals); + if (vals.empty()) { + continue; + } + std::string parse_node_type = vals[0].to_string(); auto it = feature_to_id.find(parse_node_type); if (it == feature_to_id.end()) { VLOG(0) << parse_node_type << "type error, please check"; continue; } idx = it->second; - size_t end = line.find_first_of('\t', start + 1); - uint64_t id = std::stoull(line.substr(start +1, end - start - 1)); + uint64_t id = std::strtoul(vals[1].ptr, NULL, 10); size_t shard_id = id % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { VLOG(4) << "will not load " << id << " from " << path - << ", please check id distribution"; - continue; - } + << ", please check id distribution"; + continue; + } local_count++; size_t index = shard_id - shard_start; auto node = feature_shards[idx][index]->add_feature_node(id, false); if (node != NULL) { - while (end != std::string::npos) { - start = end; - end = line.find_first_of('\t', start + 1); - std::string tmp_str = line.substr(start + 1, end - start - 1); - parse_feature(idx, tmp_str, node); + for (int i = 2; i < num; ++i) { + auto &v = vals[i]; + parse_feature(idx, v.ptr, v.len, node); } } - local_valid_count++; } - VLOG(0) << local_valid_count << "/" << local_count << " nodes from filepath->" << path; + VLOG(2) << local_valid_count << "/" << local_count << " nodes from filepath->" + << path; return {local_count, local_valid_count}; } @@ -1256,9 +1282,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { std::vector>> tasks; for (size_t i = 0; i < paths.size(); i++) { tasks.push_back(load_node_edge_task_pool->enqueue( - [&, i, this]() -> std::pair { - return parse_node_file(paths[i]); - })); + [&, i, this]() -> std::pair { + return parse_node_file(paths[i]); + })); } for (int i = 0; i < (int)tasks.size(); i++) { auto res = tasks[i].get(); @@ -1268,8 +1294,8 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { } else { VLOG(0) << "Begin GraphTable::load_nodes() node_type[" << node_type << "]"; if (node_type == "") { - VLOG(0) << "node_type not specified, loading edges to " << id_to_feature[0] - << " part"; + VLOG(0) << "node_type not specified, loading edges to " + << id_to_feature[0] << " part"; } else { if (feature_to_id.find(node_type) == feature_to_id.end()) { VLOG(0) << "node_type " << node_type @@ -1285,9 +1311,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { valid_count += res.second; } } - + VLOG(0) << valid_count << "/" << count << " nodes in node_type[ " << node_type - << "] are loaded successfully!"; + << "] are loaded successfully!"; return 0; } @@ -1301,9 +1327,10 @@ int32_t GraphTable::build_sampler(int idx, std::string sample_type) { return 0; } -std::pair 
GraphTable::parse_edge_file(const std::string &path, int idx, bool reverse) { +std::pair GraphTable::parse_edge_file( + const std::string &path, int idx, bool reverse) { std::string sample_type = "random"; - bool is_weighted = false; + bool is_weighted = false; std::ifstream file(path); std::string line; uint64_t local_count = 0; @@ -1311,16 +1338,17 @@ std::pair GraphTable::parse_edge_file(const std::string &pat uint64_t part_num = 0; if (FLAGS_graph_load_in_parallel) { auto path_split = paddle::string::split_string(path, "/"); - auto part_name_split = paddle::string::split_string(path_split[path_split.size() - 1], "-"); + auto part_name_split = paddle::string::split_string( + path_split[path_split.size() - 1], "-"); part_num = std::stoull(part_name_split[part_name_split.size() - 1]); } - + while (std::getline(file, line)) { size_t start = line.find_first_of('\t'); if (start == std::string::npos) continue; local_count++; - uint64_t src_id = std::stoull(line.substr(0, start)); - uint64_t dst_id = std::stoull(line.substr(start + 1)); + uint64_t src_id = std::stoull(&line[0]); + uint64_t dst_id = std::stoull(&line[start + 1]); if (reverse) { std::swap(src_id, dst_id); } @@ -1330,18 +1358,18 @@ std::pair GraphTable::parse_edge_file(const std::string &pat continue; } } - + float weight = 1; size_t last = line.find_last_of('\t'); if (start != last) { - weight = std::stof(line.substr(last + 1)); + weight = std::stof(&line[last + 1]); sample_type = "weighted"; is_weighted = true; } if (src_shard_id >= shard_end || src_shard_id < shard_start) { VLOG(4) << "will not load " << src_id << " from " << path - << ", please check id distribution"; + << ", please check id distribution"; continue; } size_t index = src_shard_id - shard_start; @@ -1350,13 +1378,11 @@ std::pair GraphTable::parse_edge_file(const std::string &pat node->build_edges(is_weighted); node->add_edge(dst_id, weight); } - - local_valid_count++; + local_valid_count++; } - VLOG(0) << local_count << " edges are loaded from filepath->" << path; + VLOG(2) << local_count << " edges are loaded from filepath->" << path; return {local_count, local_valid_count}; - } int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, @@ -1381,15 +1407,15 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, auto paths = paddle::string::split_string(path, ";"); uint64_t count = 0; uint64_t valid_count = 0; - + VLOG(0) << "Begin GraphTable::load_edges() edge_type[" << edge_type << "]"; if (FLAGS_graph_load_in_parallel) { std::vector>> tasks; for (int i = 0; i < paths.size(); i++) { - tasks.push_back(load_node_edge_task_pool->enqueue( + tasks.push_back(load_node_edge_task_pool->enqueue( [&, i, idx, this]() -> std::pair { - return parse_edge_file(paths[i], idx, reverse_edge); - })); + return parse_edge_file(paths[i], idx, reverse_edge); + })); } for (int j = 0; j < (int)tasks.size(); j++) { auto res = tasks[j].get(); @@ -1403,7 +1429,8 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, valid_count += res.second; } } - VLOG(0) << valid_count << "/" << count << " edge_type[" << edge_type << "] edges are loaded successfully"; + VLOG(0) << valid_count << "/" << count << " edge_type[" << edge_type + << "] edges are loaded successfully"; #ifdef PADDLE_WITH_HETERPS if (search_level == 2) { @@ -1433,7 +1460,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, #endif return 0; } - + Node *GraphTable::find_node(int type_id, uint64_t id) { size_t shard_id = id % shard_num; if 
(shard_id >= shard_end || shard_id < shard_start) { @@ -1442,12 +1469,12 @@ Node *GraphTable::find_node(int type_id, uint64_t id) { Node *node = nullptr; size_t index = shard_id - shard_start; auto &search_shards = type_id == 0 ? edge_shards : feature_shards; - for (auto& search_shard: search_shards) { - PADDLE_ENFORCE_NOT_NULL(search_shard[index]); - node = search_shard[index]->find_node(id); - if (node != nullptr) { - break; - } + for (auto &search_shard : search_shards) { + PADDLE_ENFORCE_NOT_NULL(search_shard[index]); + node = search_shard[index]->find_node(id); + if (node != nullptr) { + break; + } } return node; } @@ -1711,9 +1738,11 @@ int32_t GraphTable::set_node_feat( } void string_vector_2_string(std::vector::iterator strs_begin, - std::vector::iterator strs_end, char delim, std::string* output) { + std::vector::iterator strs_end, + char delim, std::string *output) { size_t i = 0; - for (std::vector::iterator iter = strs_begin; iter != strs_end; ++iter) { + for (std::vector::iterator iter = strs_begin; iter != strs_end; + ++iter) { if (i > 0) { *output += delim; } @@ -1723,161 +1752,218 @@ void string_vector_2_string(std::vector::iterator strs_begin, } } -int GraphTable::parse_feature(int idx, const std::string& feat_str, - FeatureNode* node) { +void string_vector_2_string( + std::vector::iterator strs_begin, + std::vector::iterator strs_end, char delim, + std::string *output) { + size_t i = 0; + for (auto iter = strs_begin; iter != strs_end; ++iter) { + if (i > 0) { + output->append(&delim, 1); + } + output->append((*iter).ptr, (*iter).len); + ++i; + } +} + +int GraphTable::parse_feature(int idx, const char *feat_str, size_t len, + FeatureNode *node) { // Return (feat_id, btyes) if name are in this->feat_name, else return (-1, // "") - std::vector fields = - paddle::string::split_string(feat_str, feature_separator_); + thread_local std::vector fields; + fields.clear(); + const char c = feature_separator_.at(0); + paddle::string::split_string_ptr(feat_str, len, c, &fields); - auto it = feat_id_map[idx].find(fields[0]); + std::string name = fields[0].to_string(); + auto it = feat_id_map[idx].find(name); if (it != feat_id_map[idx].end()) { int32_t id = it->second; - std::string* fea_ptr = node->mutable_feature(id); + std::string *fea_ptr = node->mutable_feature(id); std::string dtype = this->feat_dtype[idx][id]; if (dtype == "feasign") { - string_vector_2_string(fields.begin() + 1, fields.end(), ' ', fea_ptr); + // string_vector_2_string(fields.begin() + 1, fields.end(), ' ', + // fea_ptr); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, + fields.end(), fea_ptr); return 0; } else if (dtype == "string") { string_vector_2_string(fields.begin() + 1, fields.end(), ' ', fea_ptr); return 0; } else if (dtype == "float32") { - FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), + fea_ptr); return 0; } else if (dtype == "float64") { - FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, + fields.end(), fea_ptr); return 0; } else if (dtype == "int32") { - FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, + fields.end(), fea_ptr); return 0; } else if (dtype == "int64") { - FeatureNode::parse_value_to_bytes(fields.begin() + 1, fields.end(), fea_ptr); + FeatureNode::parse_value_to_bytes(fields.begin() + 1, + fields.end(), 
fea_ptr); return 0; - } + } } else { - VLOG(2) << "feature_name[" << fields[0] - << "] is not in feat_id_map, ntype_id[" << idx - << "] feat_id_map_size[" << feat_id_map.size() << "]"; + VLOG(2) << "feature_name[" << name << "] is not in feat_id_map, ntype_id[" + << idx << "] feat_id_map_size[" << feat_id_map.size() << "]"; } return -1; } - -int GraphTable::get_all_id(int type_id, int slice_num, std::vector> *output) { - output->resize(slice_num); +// thread safe shard vector merge +class MergeShardVector { + public: + MergeShardVector(std::vector> *output, int slice_num) { + _slice_num = slice_num; + _shard_keys = output; + _shard_keys->resize(slice_num); + _mutexs = new std::mutex[slice_num]; + } + ~MergeShardVector() { + if (_mutexs != nullptr) { + delete[] _mutexs; + _mutexs = nullptr; + } + } + // merge shard keys + void merge(const std::vector> &shard_keys) { + // add to shard + for (int shard_id = 0; shard_id < _slice_num; ++shard_id) { + auto &dest = (*_shard_keys)[shard_id]; + auto &src = shard_keys[shard_id]; + + _mutexs[shard_id].lock(); + dest.insert(dest.end(), src.begin(), src.end()); + _mutexs[shard_id].unlock(); + } + } + + private: + int _slice_num = 0; + std::mutex *_mutexs = nullptr; + std::vector> *_shard_keys; +}; + +int GraphTable::get_all_id(int type_id, int slice_num, + std::vector> *output) { + MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards : feature_shards; - std::vector>> tasks; + std::vector> tasks; for (int idx = 0; idx < search_shards.size(); idx++) { for (int j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( - [&search_shards, idx, j]() -> std::vector { - return search_shards[idx][j]->get_all_id(); - })); + [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { + std::vector> shard_keys; + size_t num = + search_shards[idx][j]->get_all_id(&shard_keys, slice_num); + // add to shard + shard_merge.merge(shard_keys); + return num; + })); } } for (size_t i = 0; i < tasks.size(); ++i) { tasks[i].wait(); } - for (size_t i = 0; i < tasks.size(); i++) { - auto ids = tasks[i].get(); - for (auto &id : ids) { - (*output)[(uint64_t)(id) % slice_num].push_back(id); - } - } return 0; } -int GraphTable::get_all_neighbor_id(int type_id, int slice_num, std::vector> *output) { - output->resize(slice_num); +int GraphTable::get_all_neighbor_id( + int type_id, int slice_num, std::vector> *output) { + MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? 
edge_shards : feature_shards; - std::vector>> tasks; + std::vector> tasks; for (int idx = 0; idx < search_shards.size(); idx++) { for (int j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( - [&search_shards, idx, j]() -> std::vector { - return search_shards[idx][j]->get_all_neighbor_id(); - })); + [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { + std::vector> shard_keys; + size_t num = search_shards[idx][j]->get_all_neighbor_id(&shard_keys, + slice_num); + // add to shard + shard_merge.merge(shard_keys); + return num; + })); } } for (size_t i = 0; i < tasks.size(); ++i) { tasks[i].wait(); } - for (size_t i = 0; i < tasks.size(); i++) { - auto ids = tasks[i].get(); - for (auto &id : ids) { - (*output)[(uint64_t)(id) % slice_num].push_back(id); - } - } return 0; } -int GraphTable::get_all_id(int type_id, int idx, - int slice_num, std::vector> *output) { - output->resize(slice_num); +int GraphTable::get_all_id(int type_id, int idx, int slice_num, + std::vector> *output) { + MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; - std::vector>> tasks; - VLOG(0) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; + std::vector> tasks; + VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; for (size_t i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [&search_shards, i]() -> std::vector { - return search_shards[i]->get_all_id(); + [&search_shards, i, slice_num, &shard_merge]() -> size_t { + std::vector> shard_keys; + size_t num = search_shards[i]->get_all_id(&shard_keys, slice_num); + // add to shard + shard_merge.merge(shard_keys); + return num; })); } for (size_t i = 0; i < tasks.size(); ++i) { tasks[i].wait(); } - VLOG(0) << "end task, task_pool_size_[" << task_pool_size_ << "]"; - for (size_t i = 0; i < tasks.size(); i++) { - auto ids = tasks[i].get(); - for (auto &id : ids) (*output)[id % slice_num].push_back(id); - } + VLOG(3) << "end task, task_pool_size_[" << task_pool_size_ << "]"; return 0; } -int GraphTable::get_all_neighbor_id(int type_id, int idx, - int slice_num, std::vector> *output) { - output->resize(slice_num); +int GraphTable::get_all_neighbor_id( + int type_id, int idx, int slice_num, + std::vector> *output) { + MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? 
edge_shards[idx] : feature_shards[idx]; - std::vector>> tasks; - VLOG(0) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; + std::vector> tasks; + VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; for (int i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [&search_shards, i]() -> std::vector { - return search_shards[i]->get_all_neighbor_id(); + [&search_shards, i, slice_num, &shard_merge]() -> size_t { + std::vector> shard_keys; + size_t num = + search_shards[i]->get_all_neighbor_id(&shard_keys, slice_num); + // add to shard + shard_merge.merge(shard_keys); + return num; })); } for (size_t i = 0; i < tasks.size(); ++i) { tasks[i].wait(); } - VLOG(0) << "end task, task_pool_size_[" << task_pool_size_ << "]"; - for (size_t i = 0; i < tasks.size(); i++) { - auto ids = tasks[i].get(); - for (auto &id : ids) (*output)[id % slice_num].push_back(id); - } + VLOG(3) << "end task, task_pool_size_[" << task_pool_size_ << "]"; return 0; } -int GraphTable::get_all_feature_ids(int type_id, int idx, int slice_num, - std::vector>* output) { - output->resize(slice_num); +int GraphTable::get_all_feature_ids( + int type_id, int idx, int slice_num, + std::vector> *output) { + MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; - std::vector>> tasks; + std::vector> tasks; for (int i = 0; i < search_shards.size(); i++) { - tasks.push_back( - _shards_task_pool[i % task_pool_size_]->enqueue( - [&search_shards, i]() -> std::set { - return search_shards[i]->get_all_feature_ids(); - } - ) - ); + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [&search_shards, i, slice_num, &shard_merge]() -> size_t { + std::vector> shard_keys; + size_t num = + search_shards[i]->get_all_feature_ids(&shard_keys, slice_num); + // add to shard + shard_merge.merge(shard_keys); + return num; + })); } for (size_t i = 0; i < tasks.size(); ++i) { tasks[i].wait(); } - for (size_t i = 0; i < tasks.size(); i++) { - auto ids = tasks[i].get(); - for (auto &id : ids) (*output)[id % slice_num].push_back(id); - } return 0; } @@ -2011,7 +2097,7 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); } load_node_edge_task_pool.reset(new ::ThreadPool(load_thread_num)); - + auto graph_feature = graph.graph_feature(); auto node_types = graph.node_types(); auto edge_types = graph.edge_types(); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 06ea0b4e4b1541..011f5226c67240 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -17,6 +17,7 @@ #include #include #include + #include #include #include @@ -36,6 +37,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" #include "paddle/fluid/distributed/ps/table/graph/class_macro.h" @@ -56,42 +58,65 @@ class GraphShard { ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - std::vector get_ids_by_range(int start, int end) { - std::vector res; + void get_ids_by_range(int start, int end, std::vector *res) { + res->reserve(res->size() + end - start); for (int i = start; i < end && i < (int)bucket.size(); i++) { - 
res.push_back(bucket[i]->get_id()); + res->emplace_back(bucket[i]->get_id()); } - return res; } - std::vector get_all_id() { - std::vector res; - for (int i = 0; i < (int)bucket.size(); i++) { - res.push_back(bucket[i]->get_id()); + size_t get_all_id(std::vector> *shard_keys, + int slice_num) { + int bucket_num = bucket.size(); + shard_keys->resize(slice_num); + for (int i = 0; i < slice_num; ++i) { + (*shard_keys)[i].reserve(bucket_num / slice_num); + } + for (int i = 0; i < bucket_num; i++) { + uint64_t k = bucket[i]->get_id(); + (*shard_keys)[k % slice_num].emplace_back(k); } - return res; + return bucket_num; } - std::vector get_all_neighbor_id() { - std::vector res; - std::unordered_set uset; + size_t get_all_neighbor_id(std::vector> *total_res, + int slice_num) { + std::vector keys; for (size_t i = 0; i < bucket.size(); i++) { size_t neighbor_size = bucket[i]->get_neighbor_size(); + size_t n = keys.size(); + keys.resize(n + neighbor_size); for (size_t j = 0; j < neighbor_size; j++) { - uset.emplace(bucket[i]->get_neighbor_id(j)); - //res.push_back(bucket[i]->get_neighbor_id(j)); + keys[n + j] = bucket[i]->get_neighbor_id(j); } } - res.assign(uset.begin(), uset.end()); - return res; + return dedup2shard_keys(&keys, total_res, slice_num); } - std::set get_all_feature_ids() { - std::set total_res; - std::set res; + size_t get_all_feature_ids(std::vector> *total_res, + int slice_num) { + std::vector keys; for (int i = 0; i < (int)bucket.size(); i++) { - res.clear(); - bucket[i]->get_feature_ids(&res); - total_res.insert(res.begin(), res.end()); + bucket[i]->get_feature_ids(&keys); } - return total_res; + return dedup2shard_keys(&keys, total_res, slice_num); + } + size_t dedup2shard_keys(std::vector *keys, + std::vector> *total_res, int slice_num) { + size_t num = keys->size(); + uint64_t last_key = 0; + // sort key insert to vector + std::sort(keys->begin(), keys->end()); + total_res->resize(slice_num); + for (int shard_id = 0; shard_id < slice_num; ++shard_id) { + (*total_res)[shard_id].reserve(num / slice_num); + } + for (size_t i = 0; i < num; ++i) { + const uint64_t &k = (*keys)[i]; + if (i > 0 && last_key == k) { + continue; + } + last_key = k; + (*total_res)[k % slice_num].push_back(k); + } + return num; } GraphNode *add_graph_node(uint64_t id); GraphNode *add_graph_node(Node *node); @@ -494,26 +519,33 @@ class GraphTable : public Table { const FsClientParameter &fs_config); virtual int32_t Initialize(const GraphParameter &config); int32_t Load(const std::string &path, const std::string ¶m); - - int32_t load_node_and_edge_file(std::string etype, std::string ntype, std::string epath, - std::string npath, int part_num, bool reverse); + + int32_t load_node_and_edge_file(std::string etype, std::string ntype, + std::string epath, std::string npath, + int part_num, bool reverse); std::string get_inverse_etype(std::string &etype); int32_t load_edges(const std::string &path, bool reverse, const std::string &edge_type); - int get_all_id(int type, int slice_num, std::vector> *output); - int get_all_neighbor_id(int type, int slice_num, std::vector> *output); - int get_all_id(int type, int idx, - int slice_num, std::vector> *output); - int get_all_neighbor_id(int type_id, int id, - int slice_num, std::vector> *output); - int get_all_feature_ids(int type, int idx, - int slice_num, std::vector>* output); - int32_t load_nodes(const std::string &path, std::string node_type = std::string()); - std::pair parse_edge_file(const std::string &path, int idx, bool reverse); - std::pair 
parse_node_file(const std::string &path, const std::string &node_type, int idx); + int get_all_id(int type, int slice_num, + std::vector> *output); + int get_all_neighbor_id(int type, int slice_num, + std::vector> *output); + int get_all_id(int type, int idx, int slice_num, + std::vector> *output); + int get_all_neighbor_id(int type_id, int id, int slice_num, + std::vector> *output); + int get_all_feature_ids(int type, int idx, int slice_num, + std::vector> *output); + int32_t load_nodes(const std::string &path, + std::string node_type = std::string()); + std::pair parse_edge_file(const std::string &path, + int idx, bool reverse); + std::pair parse_node_file(const std::string &path, + const std::string &node_type, + int idx); std::pair parse_node_file(const std::string &path); int32_t add_graph_node(int idx, std::vector &id_list, std::vector &is_weight_list); @@ -549,8 +581,8 @@ class GraphTable : public Table { } virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); virtual uint32_t get_thread_pool_index(uint64_t node_id); - virtual int parse_feature(int idx, const std::string& feat_str, - FeatureNode* node); + virtual int parse_feature(int idx, const char *feat_str, size_t len, + FeatureNode *node); virtual int32_t get_node_feat(int idx, const std::vector &node_ids, const std::vector &feature_names, @@ -714,4 +746,4 @@ struct hash { return s.idx ^ s.node_key ^ s.sample_size; } }; -} +} // namespace std diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index 5f567d0c4b4931..d7350ed4194137 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -16,9 +16,10 @@ #include #include #include +#include #include #include -#include + #include "glog/logging.h" #include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" #include "paddle/fluid/string/string_helper.h" @@ -51,13 +52,11 @@ class Node { virtual void to_buffer(char *buffer, bool need_feature); virtual void recover_from_buffer(char *buffer); virtual std::string get_feature(int idx) { return std::string(""); } - virtual int get_feature_ids(std::set *res) const { - return 0; - } + virtual int get_feature_ids(std::vector *res) const { return 0; } virtual int get_feature_ids(int slot_idx, std::vector *res) const { return 0; } - virtual void set_feature(int idx, const std::string& str) {} + virtual void set_feature(int idx, const std::string &str) {} virtual void set_feature_size(int size) {} virtual int get_feature_size() { return 0; } virtual size_t get_neighbor_size() { return 0; } @@ -106,18 +105,26 @@ class FeatureNode : public Node { } } - virtual int get_feature_ids(std::set *res) const { + virtual int get_feature_ids(std::vector *res) const { PADDLE_ENFORCE_NOT_NULL(res); errno = 0; - for (auto& feature_item: feature) { - const char *feat_str = feature_item.c_str(); - auto fields = paddle::string::split_string(feat_str, " "); - char *head_ptr = NULL; - for (auto &field : fields) { - PADDLE_ENFORCE_EQ(field.empty(), false); - uint64_t feasign = strtoull(field.c_str(), &head_ptr, 10); - PADDLE_ENFORCE_EQ(field.c_str() + field.length(), head_ptr); - res->insert(feasign); + for (auto &feature_item : feature) { + // const char *feat_str = feature_item.c_str(); + // auto fields = paddle::string::split_string(feat_str, + // " "); char *head_ptr = NULL; for (auto &field : fields) { + // PADDLE_ENFORCE_EQ(field.empty(), false); + // uint64_t feasign = 
strtoull(field.c_str(), &head_ptr, 10); + // PADDLE_ENFORCE_EQ(field.c_str() + field.length(), head_ptr); + // res->insert(feasign); + // } + const uint64_t *feas = (const uint64_t *)(feature_item.c_str()); + size_t num = feature_item.length() / sizeof(uint64_t); + CHECK((feature_item.length() % sizeof(uint64_t)) == 0) + << "bad feature_item: [" << feature_item << "]"; + size_t n = res->size(); + res->resize(n + num); + for (size_t i = 0; i < num; ++i) { + (*res)[n + i] = feas[i]; } } PADDLE_ENFORCE_EQ(errno, 0); @@ -129,28 +136,39 @@ class FeatureNode : public Node { res->clear(); errno = 0; if (slot_idx < (int)this->feature.size()) { - const char *feat_str = this->feature[slot_idx].c_str(); - auto fields = paddle::string::split_string(feat_str, " "); - char *head_ptr = NULL; - for (auto &field : fields) { - PADDLE_ENFORCE_EQ(field.empty(), false); - uint64_t feasign = strtoull(field.c_str(), &head_ptr, 10); - PADDLE_ENFORCE_EQ(field.c_str() + field.length(), head_ptr); - res->push_back(feasign); + // const char *feat_str = this->feature[slot_idx].c_str(); + // auto fields = paddle::string::split_string(feat_str, + // " "); char *head_ptr = NULL; for (auto &field : fields) { + // PADDLE_ENFORCE_EQ(field.empty(), false); + // uint64_t feasign = strtoull(field.c_str(), &head_ptr, 10); + // //PADDLE_ENFORCE_EQ(field.c_str() + field.length(), head_ptr); + // CHECK(field.c_str() + field.length( ) == head_ptr) + // << "field:[" << field << "], head_ptr:[" << head_ptr << "]"; + // res->push_back(feasign); + // } + const std::string &s = this->feature[slot_idx]; + const uint64_t *feas = (const uint64_t *)(s.c_str()); + + size_t num = s.length() / sizeof(uint64_t); + CHECK((s.length() % sizeof(uint64_t)) == 0) + << "bad feature_item: [" << s << "]"; + res->resize(num); + for (size_t i = 0; i < num; ++i) { + (*res)[i] = feas[i]; } } PADDLE_ENFORCE_EQ(errno, 0); return 0; } - virtual std::string* mutable_feature(int idx) { + virtual std::string *mutable_feature(int idx) { if (idx >= (int)this->feature.size()) { this->feature.resize(idx + 1); } return &(this->feature[idx]); } - virtual void set_feature(int idx, const std::string& str) { + virtual void set_feature(int idx, const std::string &str) { if (idx >= (int)this->feature.size()) { this->feature.resize(idx + 1); } @@ -173,9 +191,9 @@ class FeatureNode : public Node { } template - static void parse_value_to_bytes(std::vector::iterator feat_str_begin, - std::vector::iterator feat_str_end, - std::string* output) { + static void parse_value_to_bytes( + std::vector::iterator feat_str_begin, + std::vector::iterator feat_str_end, std::string *output) { T v; size_t feat_str_size = feat_str_end - feat_str_begin; size_t Tsize = sizeof(T) * feat_str_size; @@ -202,7 +220,26 @@ class FeatureNode : public Node { return out; } -protected: + template + static void parse_value_to_bytes( + std::vector::iterator feat_str_begin, + std::vector::iterator feat_str_end, + std::string *output) { + size_t feat_str_size = feat_str_end - feat_str_begin; + size_t Tsize = sizeof(T) * feat_str_size; + size_t num = output->length(); + output->resize(num + Tsize); + + T *fea_ptrs = (T *)(&(*output)[num]); + + thread_local paddle::string::str_ptr_stream ss; + for (size_t i = 0; i < feat_str_size; i++) { + ss.reset(*(feat_str_begin + i)); + ss >> fea_ptrs[i]; + } + } + + protected: std::vector feature; }; diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index ae9ac80eeb4aab..b9459d003f6ca4 100644 --- a/paddle/fluid/framework/data_feed.cu +++ 
b/paddle/fluid/framework/data_feed.cu @@ -554,7 +554,7 @@ int GraphDataGenerator::GenerateBatch() { } cudaStreamSynchronize(stream_); - if (!gpu_graph_training_) return total_instance / 2; + if (!gpu_graph_training_) return 1; ins_buf_pair_len_ -= total_instance / 2; if (debug_mode_) { uint64_t h_slot_tensor[slot_num_][total_instance]; @@ -966,12 +966,12 @@ void GraphDataGenerator::SetConfig( window_ = graph_config.window(); once_sample_startid_len_ = graph_config.once_sample_startid_len(); debug_mode_ = graph_config.debug_mode(); - if (debug_mode_) { + gpu_graph_training_ = graph_config.gpu_graph_training(); + if (debug_mode_ || !gpu_graph_training_) { batch_size_ = graph_config.batch_size(); } else { batch_size_ = once_sample_startid_len_; } - gpu_graph_training_ = graph_config.gpu_graph_training(); repeat_time_ = graph_config.sample_times_one_chunk(); buf_size_ = once_sample_startid_len_ * walk_len_ * walk_degree_ * repeat_time_; diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index e13004eaf1500d..c9abcdde5e1e7a 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -262,11 +262,35 @@ void DeviceWorker::DumpField(const Scope& scope, int dump_mode, size_t batch_size = device_reader_->GetCurBatchSize(); auto& ins_id_vec = device_reader_->GetInsIdVec(); auto& ins_content_vec = device_reader_->GetInsContentVec(); - if (ins_id_vec.size() > 0) { + if (dump_mode_ == 3) { + batch_size = std::string::npos; + bool has_valid_batch = false; + for (auto& field : *dump_fields_) { + Variable* var = scope.FindVar(field); + if (var == nullptr) { + VLOG(0) << "Note: field[" << field + << "] cannot be find in scope, so it was skipped."; + continue; + } + LoDTensor* tensor = var->GetMutable(); + if (!tensor->IsInitialized()) { + VLOG(0) << "Note: field[" << field + << "] is not initialized, so it was skipped."; + continue; + } + auto& dims = tensor->dims(); + if (dims.size() == 2 && dims[0] > 0) { + batch_size = std::min(batch_size, static_cast(dims[0])); + // VLOG(0)<<"in dump field ---> "< hit(batch_size, false); std::default_random_engine engine(0); std::uniform_int_distribution dist(0U, INT_MAX); for (size_t i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 956885334fa5f8..4bc9a1b0157ed2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/timer.h" #include "thrust/pair.h" #elif defined(PADDLE_WITH_XPU_KP) -// #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" #include #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #endif @@ -56,16 +55,20 @@ class HeterComm { HeterComm& operator=(const HeterComm&) = delete; void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len, - int* left, int* right, int gpu_num); + int* left, int* right, int gpu_num); void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, - int& uniq_len); // NOLINT + int& uniq_len); // NOLINT void dynamic_merge_grad(int gpu_num, KeyType* d_keys, float* d_grads, - size_t len, int& uniq_len); + size_t len, int& uniq_len, size_t& segment_len, bool enable_segment_merge_grad); + void segment_merge_grad(int gpu_num, KeyType* d_keys, float* d_grads, + const uint32_t* d_index, size_t len, + const uint32_t* d_fea_num_info, + size_t uniq_len, size_t& segment_len); void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); void build_ps(int num, KeyType* h_keys, ValType* h_vals, size_t len, - size_t chunk_size, int stream_num, int offset = -1); + size_t chunk_size, int stream_num, int offset = -1); void build_ps(int num, KeyType* h_keys, char* pool, size_t len, - size_t feature_value_size, size_t chunk_size, int stream_num); + size_t feature_value_size, size_t chunk_size, int stream_num); void dump(); void show_one_table(int gpu_num); void show_table_collisions(); @@ -253,6 +256,8 @@ class HeterComm { int block_size_{256}; std::unique_ptr heter_comm_kernel_; + CommonFeatureValueAccessor feature_value_accessor_; + private: int topo_aware_{0}; std::vector storage_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2108c0e23eae62..36bad37d2abe4f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -24,6 +24,8 @@ limitations under the License. 
*/ DECLARE_double(gpugraph_hbm_table_load_factor); DECLARE_bool(gpugraph_enable_gpu_direct_access); +DECLARE_bool(gpugraph_enable_segment_merge_grads); +DECLARE_uint64(gpugraph_merge_grads_segment_size); namespace paddle { namespace framework { @@ -643,15 +645,15 @@ void HeterComm::merge_grad( template -void HeterComm::dynamic_merge_grad( - int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len) { +void HeterComm::dynamic_merge_grad( + int gpu_num, KeyType* d_keys, float* d_grads, size_t len, + int& uniq_len, size_t& segment_len, bool enable_segment_merge_grad) { int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); auto stream = resource_->local_stream(gpu_num, 0); size_t temp_storage_bytes; - size_t grad_dim = max_mf_dim_; auto accessor_wrapper_ptr = GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); @@ -660,16 +662,12 @@ void HeterComm::dynamic_merge_grad( auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); - auto d_merge_grads = memory::Alloc(place, len * grad_value_size); - float* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); uint32_t* d_fea_num_info_ptr = reinterpret_cast(d_fea_num_info->ptr()); uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len]; uint32_t* d_idx = (uint32_t*)&d_index[len]; int* d_merged_size = (int*)&d_idx[len]; - int grid_size = (len - 1) / block_size_ + 1; heter_comm_kernel_->fill_idx(d_idx, len, stream); PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, @@ -710,14 +708,135 @@ void HeterComm::dynamic_merge_grad( d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + if (enable_segment_merge_grad) { + segment_merge_grad( + gpu_num, + d_merge_keys_ptr, d_grads, d_index, len, + d_fea_num_info_ptr, uniq_len, + segment_len); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, d_merge_keys_ptr, + sizeof(KeyType) * segment_len, + cudaMemcpyDeviceToDevice, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + } else { + auto d_merge_grads = memory::Alloc(place, len * grad_value_size); + float* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); + + heter_comm_kernel_->merge_gradient( + d_keys, d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, + (char*)d_merge_grads_ptr, uniq_len, grad_dim, grad_value_size, merger_, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, + grad_value_size * uniq_len, + cudaMemcpyDeviceToDevice, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + } +} + +template +void HeterComm::segment_merge_grad( + int gpu_num, // the device number + KeyType* d_keys, // the sorted keys list, which will be modified after merged + float* d_grads, // the raw grads list, which will be modified after merged + const uint32_t* d_index, // the storage position of d_keys, its length is len. 
+ size_t len, // the number of raw input keys + const uint32_t* d_fea_num_info, // prefix sum array, its length is uniq_len+1 + size_t uniq_len, // the number of unique keys + size_t& segments_num) { // the number of segment merged keys + + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + auto grad_dim = max_mf_dim_; + auto grad_value_size = TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + + auto d_buffer1 = memory::Alloc(place, sizeof(uint32_t) * len); + auto d_segments = reinterpret_cast(d_buffer1->ptr()); + auto d_buffer2 = memory::Alloc(place, sizeof(uint32_t) * len); + auto d_segments_offset = reinterpret_cast(d_buffer2->ptr()); + auto d_buffer3 = memory::Alloc(place, sizeof(uint32_t) * len); + auto d_segments_fea_num_info = reinterpret_cast(d_buffer3->ptr()); + auto d_buffer4 = memory::Alloc(place, sizeof(uint32_t) * len); + auto d_segments_fea_num_offset = reinterpret_cast(d_buffer4->ptr()); + auto d_buffer5 = memory::Alloc(place, sizeof(uint32_t)); + auto d_segments_num = reinterpret_cast(d_buffer5->ptr()); + CUDA_CHECK(cudaMemsetAsync(d_segments_num, 0, sizeof(uint32_t), stream)); + + uint32_t segment_size = FLAGS_gpugraph_merge_grads_segment_size; + heter_comm_kernel_->split_segments( + d_fea_num_info, uniq_len, + d_segments, + d_segments_num, + segment_size, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + size_t temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum( + NULL, temp_storage_bytes, d_segments, d_segments_num, + uniq_len, stream)); + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum( + d_temp_storage->ptr(), temp_storage_bytes, d_segments, d_segments_num, + uniq_len, stream)); + CUDA_CHECK(cudaMemcpyAsync(&segments_num, d_segments_num, sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + NULL, temp_storage_bytes, d_segments, d_segments_offset, + uniq_len, stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + d_temp_storage->ptr(), temp_storage_bytes, d_segments, d_segments_offset, + uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + heter_comm_kernel_->expand_segments( + d_fea_num_info, + d_segments_offset, uniq_len, + d_segments_fea_num_info, segment_size, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + NULL, temp_storage_bytes, d_segments_fea_num_info, d_segments_fea_num_offset, + segments_num, stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + d_temp_storage->ptr(), temp_storage_bytes, d_segments_fea_num_info, d_segments_fea_num_offset, + segments_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + auto d_segments_keys = memory::Alloc(place, sizeof(KeyType) * segments_num); + auto d_segments_keys_ptr = reinterpret_cast(d_segments_keys->ptr()); + 
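      // The keys inside one segment are all identical (the key list is sorted),
      // so shrink_keys below keeps a single representative key per segment by
      // reading d_keys at each segment's start offset; the gradients are then
      // re-merged per segment so that no key aggregates more than
      // FLAGS_gpugraph_merge_grads_segment_size gradients in one pass.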
heter_comm_kernel_->shrink_keys( + d_keys, d_segments_fea_num_offset, + d_segments_keys_ptr, segments_num, + stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + auto d_segment_grads = memory::Alloc(place, segments_num * grad_value_size); + auto d_segment_grads_ptr = reinterpret_cast(d_segment_grads->ptr()); heter_comm_kernel_->merge_gradient( - d_keys, d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, - (char*)d_merge_grads_ptr, uniq_len, grad_dim, grad_value_size, merger_, - stream, feature_value_accessor_); + d_segments_keys_ptr, d_segments_fea_num_offset, d_segments_fea_num_info, d_index, + (char*)d_grads, (char*)d_segment_grads_ptr, segments_num, + grad_dim, grad_value_size, merger_, stream, feature_value_accessor_); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, - grad_value_size * uniq_len, - cudaMemcpyDeviceToDevice, stream)); + + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, d_segments_keys_ptr, + sizeof(KeyType) * segments_num, + cudaMemcpyDeviceToDevice, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_segment_grads_ptr, + grad_value_size * segments_num, + cudaMemcpyDeviceToDevice, stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } @@ -741,21 +860,17 @@ void HeterComm::split_input_to_shard( auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); - // int grid_size = (len - 1) / block_size_ + 1; - heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); heter_comm_kernel_->calc_shard_index(d_keys, len, d_shard_index_tmp_ptr, total_device, stream); size_t temp_storage_bytes; const int num_bits = 1 + log2i(total_device); - heter_comm_kernel_->sort_pairs( NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); @@ -885,6 +1000,7 @@ void HeterComm::pull_sparse( sync_stream(node.out_stream); } } + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream, feature_value_accessor_); @@ -960,9 +1076,20 @@ void HeterComm::push_sparse( d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); int uniq_len = len; - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - - int grid_size = (uniq_len - 1) / block_size_ + 1; + size_t segment_len = 0; + if (FLAGS_gpugraph_enable_segment_merge_grads) { + // do two gradient merge + // 1st. do segmented gradient merge + // 2nd. 
do global gradient merge + dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, true); + len = segment_len; + uniq_len = 0; + segment_len = 0; + dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, false); + } else { + // Perform gradient merge only once + dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, false); + } split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); @@ -1096,8 +1223,6 @@ void HeterComm::push_sparse( int uniq_len = len; merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - // int grid_size = (uniq_len - 1) / block_size_ + 1; - split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); @@ -1274,7 +1399,6 @@ int HeterComm::gather_one_node_grad( cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); - // int grid_size = (h_node_len[i] - 1) / block_size_ + 1; heter_comm_kernel_->fill_shard_grads( storage.local_keys + merge_num, storage.all_keys + index, storage.local_grads + merge_num, storage.all_grads + index, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 22c997b32fcaf1..5efbcffbf4e267 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -187,6 +187,57 @@ __global__ void merge_gradients_embedx_kernel( } } +__global__ void split_segments_kernel( + const uint32_t* d_fea_num_info, size_t n, + uint32_t* d_segments, uint32_t* d_segments_num, + uint32_t segment_size) { + const size_t tx = blockIdx.x * blockDim.x + threadIdx.x; + if (tx >= n) { + return; + } + + auto fea_num = d_fea_num_info[tx]; + auto seg_num = (uint32_t)((fea_num - 1) / segment_size + 1); + d_segments[tx] = seg_num; +} + +__global__ void expand_segments_kernel( + const uint32_t* d_fea_num_info, + const uint32_t* d_segments_offset, size_t n, + uint32_t* d_segments_fea_num_info, uint32_t segment_size) { + const size_t tx = blockIdx.x * blockDim.x + threadIdx.x; + if (tx >= n) { + return; + } + + auto fea_num = d_fea_num_info[tx]; + auto seg_num = (uint32_t)((fea_num - 1) / segment_size + 1); + auto start_pos = d_segments_offset[tx]; + auto remains = fea_num; + int cur_seg_size = 0; + for (size_t i = 0; i < seg_num; ++i) { + if (remains >= segment_size) { + cur_seg_size = segment_size; + } else { + cur_seg_size = remains; + } + d_segments_fea_num_info[start_pos + i] = cur_seg_size; + remains -= cur_seg_size; + } +} + +template +__global__ void shrink_keys_kernel( + const KeyType* d_keys, const uint32_t* d_segments_offset, + KeyType* d_segments_keys, size_t n) { + const size_t tx = blockIdx.x * blockDim.x + threadIdx.x; + if (tx >= n) { + return; + } + + d_segments_keys[tx] = d_keys[d_segments_offset[tx]]; +} + template __global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals, float* d_vals, T* idx, size_t len, size_t val_size, @@ -335,6 +386,34 @@ void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals, float* d_vals, d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor); } +template +void HeterCommKernel::split_segments(const uint32_t* d_fea_num_info, size_t n, + uint32_t* d_segments, uint32_t* d_segments_num, size_t segment_size, const StreamType& stream) { + int grid_size = (n - 1) / block_size_ + 1; + split_segments_kernel<<>>( + d_fea_num_info, n, d_segments, d_segments_num, segment_size); +} + +template +void HeterCommKernel::expand_segments(const uint32_t* 
d_fea_num_info, + const uint32_t* d_segments_offset, size_t n, + uint32_t* d_segments_fea_num_info, uint32_t segment_size, + const StreamType& stream) { + int grid_size = (n - 1) / block_size_ + 1; + expand_segments_kernel<<>>( + d_fea_num_info, + d_segments_offset, n, + d_segments_fea_num_info, segment_size); +} + +template +void HeterCommKernel::shrink_keys(const KeyType* d_keys, const uint32_t* d_segments_offset, + KeyType* d_segments_keys, size_t n, const StreamType& stream) { + int grid_size = (n - 1) / block_size_ + 1; + shrink_keys_kernel<<>>( + d_keys, d_segments_offset, d_segments_keys, n); +} + template void HeterCommKernel::fill_idx( int* idx, long long len, const cudaStream_t& stream); template void HeterCommKernel::fill_idx( @@ -404,21 +483,18 @@ template void HeterCommKernel::dy_mf_fill_shard_grads< float* d_grads, int* idx, long long len, size_t grad_value_size, const cudaStream_t& stream, CommonFeatureValueAccessor& feature_value_accessor); - -template void HeterCommKernel::merge_gradient( - const uint32_t* d_keys, const uint32_t* offset, const uint32_t* fea_num, - const uint32_t* index, const char* input, char* output, int n, - size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, - const cudaStream_t& stream, +template void HeterCommKernel::merge_gradient( + const uint32_t* d_keys, + const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, + const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, + DynamicGradMerger& merger_, const cudaStream_t& stream, CommonFeatureValueAccessor& feature_value_accessor); -template void HeterCommKernel::merge_gradient( - const uint64_t* d_keys, const uint32_t* offset, const uint32_t* fea_num, - const uint32_t* index, const char* input, char* output, int n, - size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, - const cudaStream_t& stream, +template void HeterCommKernel::merge_gradient( + const uint64_t* d_keys, + const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, + const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, + DynamicGradMerger& merger_, const cudaStream_t& stream, CommonFeatureValueAccessor& feature_value_accessor); template void HeterCommKernel::dy_mf_fill_dvals( + const uint32_t* d_fea_num_info, size_t n, + uint32_t* d_segment, uint32_t* d_segments_num, size_t segment_size, + const cudaStream_t& stream); + +template void HeterCommKernel::expand_segments( + const uint32_t* d_fea_num_info, + const uint32_t* d_segments_offset, size_t n, + uint32_t* d_segments_fea_num_info, uint32_t segment_size, + const cudaStream_t& stream); + +template void HeterCommKernel::shrink_keys( + const uint32_t* d_keys, const uint32_t* d_segments_offset, + uint32_t* d_segments_keys, size_t segment_num, const cudaStream_t& stream); + +template void HeterCommKernel::shrink_keys( + const uint64_t* d_keys, const uint32_t* d_segments, + uint64_t* d_segments_keys, size_t total_segment_num, const cudaStream_t& stream); #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 5dc11b86ab63d2..35abbb05cc9ee8 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -164,7 +164,22 @@ class HeterCommKernel { const StreamType& stream, FVAccessor& feature_value_accessor); - // CommonFeatureValueAccessor feature_value_accessor_; + template + void 
split_segments(const uint32_t* d_fea_num_info, + size_t len, uint32_t* d_segments, uint32_t* d_segments_num, + size_t segment_size, const StreamType& stream); + + template + void expand_segments(const uint32_t* d_fea_num_info, + const uint32_t* d_segments_offset, size_t segments_num, + uint32_t* d_segments_fea_num_info, uint32_t segment_size, + const StreamType& stream); + + template + void shrink_keys(const KeyType* d_keys, const uint32_t* d_segments_offset, + KeyType* d_segments_keys, size_t segments_num, const StreamType& stream); + + private: int block_size_{256}; }; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index dc9619e39c60dd..e48beb176a1cad 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -246,12 +246,12 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { }; for (int i = 0; i < thread_keys_thread_num_; i++) { if (!multi_mf_dim_) { - VLOG(0) << "psgpu graph wrapper genfunc"; + VLOG(1) << "psgpu graph wrapper genfunc"; threads.push_back( std::thread(gen_graph_data_func, std::ref(vec_data), begin, begin + len_per_thread + (i < remain ? 1 : 0), i)); } else { - VLOG(0) << "psgpu graph wrapper genfunc with dynamic mf"; + VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf"; threads.push_back( std::thread(gen_graph_dynamic_mf_func, std::ref(vec_data), begin, begin + len_per_thread + (i < remain ? 1 : 0), i)); diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index cee122e540f7e1..84bf12ed31a660 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -225,7 +225,7 @@ void HogwildWorker::TrainFiles() { platform::SetXPUDeviceId(thread_id_); #endif - int total_ins_num = 0; + int total_batch_num = 0; // how to accumulate fetched values here device_reader_->Start(); int cur_batch; @@ -255,7 +255,7 @@ void HogwildWorker::TrainFiles() { DumpParam(*thread_scope_, batch_cnt); } - total_ins_num += cur_batch; + total_batch_num += cur_batch; ++batch_cnt; PrintFetchVars(); thread_scope_->DropKids(); @@ -265,7 +265,7 @@ void HogwildWorker::TrainFiles() { } timeline.Pause(); VLOG(0) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() - << " seconds, ins_num: " << total_ins_num; + << " seconds, batch_num: " << total_batch_num; if (need_dump_field_ || need_dump_param_) { writer_.Flush(); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index e482020cf97db8..33198c11cc2af5 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -860,6 +860,12 @@ PADDLE_DEFINE_EXPORTED_double( PADDLE_DEFINE_EXPORTED_bool( gpugraph_enable_gpu_direct_access, false, "enable direct access bwtween multi gpu cards, default false"); +PADDLE_DEFINE_EXPORTED_bool( + gpugraph_enable_segment_merge_grads, false, + "enable segment merge gradients while push sparse, default false"); +PADDLE_DEFINE_EXPORTED_uint64( + gpugraph_merge_grads_segment_size, 128, + "segment size with segment gradient merge, default 128"); /** * ProcessGroupNCCL related FLAG diff --git a/paddle/utils/string/string_helper.h b/paddle/utils/string/string_helper.h index e6cb2e90b8fa1a..7290e2afc128c2 100644 --- a/paddle/utils/string/string_helper.h +++ b/paddle/utils/string/string_helper.h @@ -17,6 +17,7 @@ #include #include #include + #include #include #include @@ -220,6 +221,117 @@ std::string join_strings(const Container& strs, return ss.str(); } 
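// str_ptr is a non-owning (pointer, length) view into a line buffer, and
// str_ptr_stream converts numbers directly out of such a view, so feature
// strings can be split and parsed without allocating a temporary std::string
// for every field.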
+struct str_ptr { + const char* ptr; + size_t len; + str_ptr(const char* p, size_t n) : ptr(p), len(n) {} + str_ptr(str_ptr& other) { + ptr = other.ptr; + len = other.len; + } + str_ptr(str_ptr&& other) { + ptr = other.ptr; + len = other.len; + } + size_t find_ptr(const char c) { + for (size_t i = 0; i < len; ++i) { + if (ptr[i] == c) { + return i; + } + } + return -1; + } + std::string to_string(void) { return std::string(ptr, len); } +}; + +struct str_ptr_stream { + char* ptr = NULL; + char* end = NULL; + str_ptr_stream() {} + str_ptr_stream(const str_ptr& p) { reset(p.ptr, p.len); } + void reset(const str_ptr& p) { reset(p.ptr, p.len); } + void reset(const char* p, size_t len) { + ptr = const_cast(p); + end = ptr + len; + } + char* cursor(void) { return ptr; } + char* finish(void) { return end; } + void set_cursor(char* p) { ptr = p; } + bool is_finish(void) { return (ptr == end); } + template + str_ptr_stream& operator>>(T& x) { + *this >> x; + return *this; + } +}; +inline str_ptr_stream& operator>>(str_ptr_stream& ar, float& c) { + char* next = NULL; + c = strtof(ar.cursor(), &next); + ar.set_cursor(std::min(++next, ar.finish())); + return ar; +} +inline str_ptr_stream& operator>>(str_ptr_stream& ar, double& c) { + char* next = NULL; + c = strtod(ar.cursor(), &next); + ar.set_cursor(std::min(++next, ar.finish())); + return ar; +} +inline str_ptr_stream& operator>>(str_ptr_stream& ar, int32_t& c) { + char* next = NULL; + c = strtol(ar.cursor(), &next, 10); + ar.set_cursor(std::min(++next, ar.finish())); + return ar; +} +inline str_ptr_stream& operator>>(str_ptr_stream& ar, uint32_t& c) { + char* next = NULL; + c = strtoul(ar.cursor(), &next, 10); + ar.set_cursor(std::min(++next, ar.finish())); + return ar; +} +inline str_ptr_stream& operator>>(str_ptr_stream& ar, uint64_t& c) { + char* next = NULL; + c = strtoul(ar.cursor(), &next, 10); + ar.set_cursor(std::min(++next, ar.finish())); + return ar; +} +inline str_ptr_stream& operator>>(str_ptr_stream& ar, int64_t& c) { + char* next = NULL; + c = strtoll(ar.cursor(), &next, 10); + ar.set_cursor(std::min(++next, ar.finish())); + return ar; +} +inline int split_string_ptr(const char* str, + size_t len, + char delim, + std::vector* values) { + if (len <= 0) { + return 0; + } + + int num = 0; + const char* p = str; + const char* end = str + len; + const char* last = str; + while (p < end) { + if (*p != delim) { + ++p; + continue; + } + values->emplace_back(last, (size_t)(p - last)); + ++num; + ++p; + // skip continue delim + while (*p == delim) { + ++p; + } + last = p; + } + if (p > last) { + values->emplace_back(last, (size_t)(p - last)); + ++num; + } + return num; +} // A helper class for reading lines from file. A line buffer is maintained. It // doesn't need to know the maximum possible length of a line. 
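For reference, a minimal usage sketch of the string helpers added above — not part of any patch in this series. The parse_feasigns_sketch name, the space delimiter, and the uint64_t payload type are illustrative assumptions; the helpers are assumed to live in the paddle::string namespace, as the parse_value_to_bytes change earlier already uses paddle::string::str_ptr_stream.

#include <string>
#include <vector>
#include "paddle/utils/string/string_helper.h"

// Split a space-separated feasign list into (ptr, len) views and convert
// each view in place, avoiding a std::string allocation per field.
void parse_feasigns_sketch(const std::string& line, std::vector<uint64_t>* out) {
  std::vector<paddle::string::str_ptr> fields;
  int num = paddle::string::split_string_ptr(
      line.c_str(), line.length(), ' ', &fields);
  out->resize(num);
  thread_local paddle::string::str_ptr_stream ss;
  for (int i = 0; i < num; ++i) {
    ss.reset(fields[i]);  // point the stream at the i-th token
    ss >> (*out)[i];      // strtoul-based conversion straight from the view
  }
}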
From a8f9e905ecef31ba472ee7486309d8dc6d8abc7b Mon Sep 17 00:00:00 2001 From: danleifeng Date: Thu, 7 Jul 2022 07:32:27 +0000 Subject: [PATCH 06/12] format; test=develop --- cmake/cuda.cmake | 3 +- .../framework/fleet/heter_ps/feature_value.cu | 145 +++++++++++------- .../framework/fleet/heter_ps/feature_value.h | 65 ++++---- .../framework/fleet/heter_ps/heter_comm.h | 7 - .../framework/fleet/heter_ps/heter_comm_inl.h | 41 +---- .../fleet/heter_ps/heter_comm_kernel.cu | 43 ------ .../fleet/heter_ps/heter_comm_kernel.h | 34 ++-- 7 files changed, 140 insertions(+), 198 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 5c6bf86811e64c..4894d615c2a353 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -219,8 +219,7 @@ add_definitions("-DCUDA_VERSION_MINOR=\"${CUDA_VERSION_MINOR}\"") add_definitions("-DCUDA_TOOLKIT_ROOT_DIR=\"${CUDA_TOOLKIT_ROOT_DIR}\"") # setting nvcc arch flags -#select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) -set(NVCC_FLAGS_EXTRA "-gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") +select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}") message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}") diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index eff345fe44caa8..560ce33b9af78d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -14,15 +14,18 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" - namespace paddle { namespace framework { - template -__global__ void PullCopy(float** dest, const float* src, - const int64_t* len, int slot_num, int total_len, - uint64_t** keys, uint64_t max_val_size, int* gpu_dim, +__global__ void PullCopy(float** dest, + const float* src, + const int64_t* len, + int slot_num, + int total_len, + uint64_t** keys, + uint64_t max_val_size, + int* gpu_dim, FVAccessor feature_value_accessor) { CUDA_KERNEL_LOOP(i, total_len) { int low = 0; @@ -39,14 +42,20 @@ __global__ void PullCopy(float** dest, const float* src, float* feature_value_ptr = (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); int mf_dim = gpu_dim[x] - 3; - feature_value_accessor.Select(dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); + feature_value_accessor.Select( + dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); } } template -__global__ void PushCopyWithPool(float* dest, float** src, - int64_t* len, int slot_num, uint64_t total_len, - int bs, int* slot_vector, int* mf_dim_vector, +__global__ void PushCopyWithPool(float* dest, + float** src, + int64_t* len, + int slot_num, + uint64_t total_len, + int bs, + int* slot_vector, + int* mf_dim_vector, size_t grad_value_size, FVAccessor feature_value_accessor) { CUDA_KERNEL_LOOP(i, total_len) { @@ -61,58 +70,71 @@ __global__ void PushCopyWithPool(float* dest, float** src, } int x = low; int y = i - (x ? 
len[low - 1] : 0); - float* cur = - (float*)((char*)dest + i * grad_value_size); + float* cur = (float*)((char*)dest + i * grad_value_size); - cur[feature_value_accessor.common_push_value.SlotIndex()] = + cur[feature_value_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x]; int mf_dim = mf_dim_vector[x]; cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; - cur[feature_value_accessor.common_push_value.ShowIndex()] = - *(src[x] + y * (mf_dim + 3)); - cur[feature_value_accessor.common_push_value.ClickIndex()] = - *(src[x] + y * (mf_dim + 3) + 1); - cur[feature_value_accessor.common_push_value.EmbedGIndex()] = - *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + cur[feature_value_accessor.common_push_value.ShowIndex()] = + *(src[x] + y * (mf_dim + 3)); + cur[feature_value_accessor.common_push_value.ClickIndex()] = + *(src[x] + y * (mf_dim + 3) + 1); + cur[feature_value_accessor.common_push_value.EmbedGIndex()] = + *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; for (int j = 0; j < mf_dim; j++) { - cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs; + cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = + *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs; } } } template -void AccessorWrapper::CopyForPullImpl(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const float* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length, - int* gpu_dim, - int feature_value_size) { +void AccessorWrapper::CopyForPullImpl( + const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), - cudaMemcpyHostToDevice); + cudaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + cudaMemcpyHostToDevice); PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, - feature_value_size, gpu_dim, gpu_accessor_); + gpu_values, + total_values_gpu, + gpu_len, + slot_num, + total_length, + gpu_keys, + feature_value_size, + gpu_dim, + gpu_accessor_); cudaStreamSynchronize(stream); } template -void AccessorWrapper::CopyForPushImpl(const paddle::platform::Place& place, - const std::vector& grad_values, - float* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, size_t grad_value_size, - std::vector& slot_vector, - std::vector& slot_mf_dim_vector) { +void AccessorWrapper::CopyForPushImpl( + const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -131,18 +153,33 @@ void AccessorWrapper::CopyForPushImpl(const paddle::platform::Place int64_t* gpu_len 
= reinterpret_cast(buf_length->ptr()); int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); - cudaMemcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, slot_vector.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_values, + grad_values.data(), + grad_values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, + slot_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, + slot_mf_dim_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), - total_length, batch_size, d_slot_vector, d_mf_dim_vector, - grad_value_size, gpu_accessor_); + total_grad_values_gpu, + gpu_values, + gpu_len, + slot_lengths.size(), + total_length, + batch_size, + d_slot_vector, + d_mf_dim_vector, + grad_value_size, + gpu_accessor_); cudaStreamSynchronize(stream); } @@ -150,6 +187,6 @@ void AccessorWrapper::CopyForPushImpl(const paddle::platform::Place template class AccessorWrapper; #endif -} -} -#endif \ No newline at end of file +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index f5c19fc87b835b..4959a3e1d16ab3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -293,7 +293,8 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { float* gpu_val, void* cpu, paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) { #ifdef PADDLE_WITH_PSCORE - paddle::distributed::CtrDymfAccessor* cpu_accessor = dynamic_cast(cpu_table_accessor); + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); paddle::distributed::FixedFeatureValue* cpu_ptr = (paddle::distributed::FixedFeatureValue*)(cpu); float* cpu_val = cpu_ptr->data(); @@ -311,24 +312,21 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()]; for (int i = 0; i < common_feature_value.EmbedDim(); i++) { gpu_val[common_feature_value.EmbedG2SumIndex() + i] = - cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + - i]; + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i]; } *(reinterpret_cast( gpu_val + common_feature_value.CpuPtrIndex())) = (uint64_t)(cpu); - cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = - float(mf_dim); + cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = float(mf_dim); gpu_val[common_feature_value.MfDimIndex()] = mf_dim; - if (cpu_dim > - cpu_accessor->GetAccessorInfo().dim - - cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) { + if (cpu_dim > cpu_accessor->GetAccessorInfo().dim - + cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) { gpu_val[common_feature_value.MfSizeIndex()] = 
common_feature_value.MFSize(mf_dim) / sizeof(float); for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { - gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = cpu_val - [cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x]; } } else { gpu_val[common_feature_value.MfSizeIndex()] = 0; @@ -341,23 +339,22 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { } // dump_to_cpu阶段从gpu_val赋值给cpu_val - __host__ void DumpFill( - float* gpu_val, paddle::distributed::ValueAccessor* cpu_table_accessor, - int mf_dim) { + __host__ void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { #ifdef PADDLE_WITH_PSCORE - paddle::distributed::CtrDymfAccessor* cpu_accessor = dynamic_cast(cpu_table_accessor); + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast( gpu_val + common_feature_value.CpuPtrIndex()))); size_t downpour_value_size = downpour_value->size(); if (gpu_val[common_feature_value.MfSizeIndex()] > 0 && - downpour_value_size == - (cpu_accessor->GetAccessorInfo().dim - - int(cpu_accessor->GetAccessorInfo().mf_size / - sizeof(float)))) { // cpu_accessor - downpour_value->resize( - cpu_accessor->common_feature_value.Dim(mf_dim)); + downpour_value_size == (cpu_accessor->GetAccessorInfo().dim - + int(cpu_accessor->GetAccessorInfo().mf_size / + sizeof(float)))) { // cpu_accessor + downpour_value->resize(cpu_accessor->common_feature_value.Dim(mf_dim)); } float* cpu_val = downpour_value->data(); cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()] = @@ -379,8 +376,8 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { for (int x = 0; x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { - cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + - x] = gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x] = + gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; } } #endif @@ -630,13 +627,13 @@ class VirtualAccessor { virtual size_t GetPushValueSize(int& mf_dim) = 0; - virtual void BuildFill( - void* gpu_val, void* cpu_val, - paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) = 0; + virtual void BuildFill(void* gpu_val, void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; - virtual void DumpFill( - float* gpu_val, paddle::distributed::ValueAccessor* cpu_table_accessor, - int mf_dim) = 0; + virtual void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; virtual void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, @@ -678,16 +675,16 @@ class AccessorWrapper : public VirtualAccessor { return gpu_accessor_.common_push_value.Size(mf_dim); } - virtual void BuildFill( - void* gpu_val, void* cpu_val, - paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) { + virtual void BuildFill(void* gpu_val, void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { gpu_accessor_.BuildFill((float*)(gpu_val), cpu_val, cpu_table_accessor, mf_dim); } - virtual void DumpFill( - float* gpu_val, paddle::distributed::ValueAccessor* cpu_table_accessor, - 
int mf_dim) { + virtual void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 4bc9a1b0157ed2..dfe6b47c8a5104 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -48,8 +48,6 @@ template resource); - HeterComm(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor& accessor); virtual ~HeterComm(); HeterComm(const HeterComm&) = delete; HeterComm& operator=(const HeterComm&) = delete; @@ -121,9 +119,6 @@ class HeterComm { void set_accessor(FVAccessor& accessor) { feature_value_accessor_ = accessor; - // for (auto& ptr_table: ptr_tables_) { - // ptr_table->set_accessor(feature_value_accessor_); - // } } #endif @@ -256,8 +251,6 @@ class HeterComm { int block_size_{256}; std::unique_ptr heter_comm_kernel_; - CommonFeatureValueAccessor feature_value_accessor_; - private: int topo_aware_{0}; std::vector storage_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 36bad37d2abe4f..c9ac11648685ca 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -29,36 +29,6 @@ DECLARE_uint64(gpugraph_merge_grads_segment_size); namespace paddle { namespace framework { -// template -// HeterComm::HeterComm( -// size_t capacity, std::shared_ptr resource) { -// VLOG(1) << "Construct new HeterComm"; -// resource_ = resource; -// storage_.resize(resource_->total_device()); -// multi_mf_dim_ = resource->multi_mf(); -// load_factor_ = FLAGS_gpugraph_hbm_table_load_factor; -// VLOG(0) << "load_factor = " << load_factor_; -// for (int i = 0; i < resource_->total_device(); ++i) { -// #if defined(PADDLE_WITH_CUDA) -// platform::CUDADeviceGuard guard(resource_->dev_id(i)); -// allocators_.push_back(std::make_shared( -// 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT -// #endif -// if (!multi_mf_dim_) { -// auto table = new Table(capacity / load_factor_); -// tables_.push_back(table); -// } else { -// VLOG(0) << "Error:use HeterComm Construct with accessor"; -// return; -// } -// if (multi_node_) { -// storage_[i].init(feanum_, resource_->dev_id(i)); -// } -// } -// heter_comm_kernel_ = std::make_unique(block_size_); -// init_path(); -// } template @@ -98,8 +68,6 @@ HeterComm::HeterComm( storage_[i].init(feanum_, resource_->dev_id(i)); } } - // heter_comm_kernel_ = std::make_unique(block_size_, - // feature_value_accessor_); heter_comm_kernel_ = std::make_unique(block_size_); init_path(); } @@ -645,7 +613,7 @@ void HeterComm::merge_grad( template -void HeterComm::dynamic_merge_grad( +void HeterComm::dynamic_merge_grad( int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len, size_t& segment_len, bool enable_segment_merge_grad) { int dev_id = resource_->dev_id(gpu_num); @@ -725,7 +693,7 @@ void HeterComm::dynamic_merge_grad( heter_comm_kernel_->merge_gradient( d_keys, d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, - (char*)d_merge_grads_ptr, uniq_len, grad_dim, grad_value_size, merger_, stream); + (char*)d_merge_grads_ptr, uniq_len, grad_dim, grad_value_size, merger_, stream, feature_value_accessor_); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len, 
cudaMemcpyDeviceToDevice, stream)); @@ -733,8 +701,9 @@ void HeterComm::dynamic_merge_grad( } } -template -void HeterComm::segment_merge_grad( +template +void HeterComm::segment_merge_grad( int gpu_num, // the device number KeyType* d_keys, // the sorted keys list, which will be modified after merged float* d_grads, // the raw grads list, which will be modified after merged diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index e325b4399683d1..5efbcffbf4e267 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -414,34 +414,6 @@ void HeterCommKernel::shrink_keys(const KeyType* d_keys, const uint32_t* d_segme d_keys, d_segments_offset, d_segments_keys, n); } -template -void HeterCommKernel::split_segments(const uint32_t* d_fea_num_info, size_t n, - uint32_t* d_segments, uint32_t* d_segments_num, size_t segment_size, const StreamType& stream) { - int grid_size = (n - 1) / block_size_ + 1; - split_segments_kernel<<>>( - d_fea_num_info, n, d_segments, d_segments_num, segment_size); -} - -template -void HeterCommKernel::expand_segments(const uint32_t* d_fea_num_info, - const uint32_t* d_segments_offset, size_t n, - uint32_t* d_segments_fea_num_info, uint32_t segment_size, - const StreamType& stream) { - int grid_size = (n - 1) / block_size_ + 1; - expand_segments_kernel<<>>( - d_fea_num_info, - d_segments_offset, n, - d_segments_fea_num_info, segment_size); -} - -template -void HeterCommKernel::shrink_keys(const KeyType* d_keys, const uint32_t* d_segments_offset, - KeyType* d_segments_keys, size_t n, const StreamType& stream) { - int grid_size = (n - 1) / block_size_ + 1; - shrink_keys_kernel<<>>( - d_keys, d_segments_offset, d_segments_keys, n); -} - template void HeterCommKernel::fill_idx( int* idx, long long len, const cudaStream_t& stream); template void HeterCommKernel::fill_idx( @@ -546,21 +518,6 @@ template void HeterCommKernel::shrink_keys( const uint32_t* d_keys, const uint32_t* d_segments_offset, uint32_t* d_segments_keys, size_t segment_num, const cudaStream_t& stream); -template void HeterCommKernel::split_segments( - const uint32_t* d_fea_num_info, size_t n, - uint32_t* d_segment, uint32_t* d_segments_num, size_t segment_size, - const cudaStream_t& stream); - -template void HeterCommKernel::expand_segments( - const uint32_t* d_fea_num_info, - const uint32_t* d_segments_offset, size_t n, - uint32_t* d_segments_fea_num_info, uint32_t segment_size, - const cudaStream_t& stream); - -template void HeterCommKernel::shrink_keys( - const uint32_t* d_keys, const uint32_t* d_segments_offset, - uint32_t* d_segments_keys, size_t segment_num, const cudaStream_t& stream); - template void HeterCommKernel::shrink_keys( const uint64_t* d_keys, const uint32_t* d_segments, uint64_t* d_segments_keys, size_t total_segment_num, const cudaStream_t& stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 202c3ba5e2ca44..47b4b5acb4b21f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -41,42 +41,48 @@ struct DynamicGradMerger { return out; } + template __device__ __forceinline__ void update_one( float* output, const float* input, - CommonFeatureValueAccessor& feature_value_accessor) { + FVAccessor& feature_value_accessor) { 
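    // update_one seeds the merged gradient of a key from its first incoming
    // gradient; merge_one below accumulates every following gradient for the
    // same key through the accessor's MergePushValue.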
feature_value_accessor.PushValueFill(output, input); } + template __device__ __forceinline__ void merge_one( float* output, const float* input, - CommonFeatureValueAccessor& feature_value_accessor) { + FVAccessor& feature_value_accessor) { feature_value_accessor.MergePushValue(output, input); } + template __device__ __forceinline__ void update_basic( float* output, const float* input, - CommonFeatureValueAccessor& fv_accessor) { + FVAccessor& fv_accessor) { fv_accessor.PushValueFillBasic(output, input); } + template __device__ __forceinline__ void merge_basic( float* output, const float* input, - CommonFeatureValueAccessor& fv_accessor) { + FVAccessor& fv_accessor) { fv_accessor.MergePushValueBasic(output, input); } + template __device__ __forceinline__ void update_embedx( float* output, const float* input, size_t embedx_idx, - CommonFeatureValueAccessor& fv_accessor) { + FVAccessor& fv_accessor) { if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) { output[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx] = input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx]; } } + template __device__ __forceinline__ void merge_embedx( float* output, const float* input, size_t embedx_idx, - CommonFeatureValueAccessor& fv_accessor) { + FVAccessor& fv_accessor) { if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) { output[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx] += input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx]; @@ -179,22 +185,6 @@ class HeterCommKernel { void shrink_keys(const KeyType* d_keys, const uint32_t* d_segments_offset, KeyType* d_segments_keys, size_t segments_num, const StreamType& stream); - - template - void split_segments(const uint32_t* d_fea_num_info, - size_t len, uint32_t* d_segments, uint32_t* d_segments_num, - size_t segment_size, const StreamType& stream); - - template - void expand_segments(const uint32_t* d_fea_num_info, - const uint32_t* d_segments_offset, size_t segments_num, - uint32_t* d_segments_fea_num_info, uint32_t segment_size, - const StreamType& stream); - - template - void shrink_keys(const KeyType* d_keys, const uint32_t* d_segments_offset, - KeyType* d_segments_keys, size_t segments_num, const StreamType& stream); - private: int block_size_{256}; }; From c89b64143dd7d58a1863ce562adf4bde4a8332d8 Mon Sep 17 00:00:00 2001 From: danleifeng Date: Thu, 7 Jul 2022 07:42:22 +0000 Subject: [PATCH 07/12] format; test=develop --- .../framework/fleet/heter_ps/CMakeLists.txt | 6 - .../fluid/framework/fleet/ps_gpu_wrapper.cc | 40 ++--- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 143 ------------------ paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 22 --- .../fluid/framework/fleet/ps_gpu_wrapper.kps | 73 --------- 5 files changed, 21 insertions(+), 263 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index fbbb77a205b9a9..e342d4422a5ed3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -8,14 +8,8 @@ IF(WITH_GPU) SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) endif() nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS}) - # nv_library(feature_value SRCS feature_value.h DEPS ${HETERPS_DEPS}) - # nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu DEPS ${HETERPS_DEPS} feature_value) nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h feature_value.cu DEPS 
${HETERPS_DEPS}) - # nv_library(hashtable_kernel SRCS hashtable_kernel.cu DEPS ${HETERPS_DEPS} feature_value) - nv_library(heter_comm SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) - # nv_library(heter_comm SRCS heter_comm.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) - # nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_test(test_heter_comm SRCS DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index e48beb176a1cad..c9ff3bded53dc8 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -145,8 +145,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { remain = total_len % thread_keys_thread_num_; VLOG(0) << "total len: " << total_len; auto gen_dynamic_mf_func = [this]( - const std::deque& total_data, int begin_index, - int end_index, int i) { + const std::deque& total_data, + int begin_index, int end_index, int i) { for (auto iter = total_data.begin() + begin_index; iter != total_data.begin() + end_index; iter++) { const auto& ins = *iter; @@ -233,17 +233,17 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { this->thread_keys_[i][shard_id].insert(cur_key); } }; - auto gen_graph_dynamic_mf_func = [this]( - const std::vector& total_data, int begin_index, int end_index, - int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - uint64_t cur_key = *iter; - int shard_id = cur_key % thread_keys_shard_num_; - // TODO: feasign <-> slot <-> multi_dim - this->thread_dim_keys_[i][shard_id][0].insert(cur_key); - } - }; + auto gen_graph_dynamic_mf_func = + [this](const std::vector& total_data, int begin_index, + int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + // TODO: feasign <-> slot <-> multi_dim + this->thread_dim_keys_[i][shard_id][0].insert(cur_key); + } + }; for (int i = 0; i < thread_keys_thread_num_; i++) { if (!multi_mf_dim_) { VLOG(1) << "psgpu graph wrapper genfunc"; @@ -609,8 +609,8 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; for (size_t k = 0; k < len; k++) { void* val = mem_pool->mem_address(k); - // float* ptr_val = device_dim_ptrs[k]->data(); - // size_t dim = device_dim_ptrs[k]->size(); + // float* ptr_val = device_dim_ptrs[k]->data(); + // size_t dim = device_dim_ptrs[k]->size(); #ifdef PADDLE_WITH_PSLIB val->delta_score = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: @@ -648,7 +648,8 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { #ifdef PADDLE_WITH_PSCORE // VLOG(5) << "cpu build " << k // << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) - // << " |: " << cpu_table_accessor_->ParseToString(ptr_val, dim); + // << " |: " << cpu_table_accessor_->ParseToString(ptr_val, + // dim); accessor_wrapper_ptr->BuildFill(val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); VLOG(5) << "build " << k << " : " @@ -687,7 +688,7 @@ for (std::thread& t : threads) { } timeline.Pause(); VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; -} +} // namespace 
framework void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { platform::Timer timer; @@ -864,7 +865,8 @@ void PSGPUWrapper::EndPass() { // float* cpu_val = downpour_value->data(); // VLOG(5) << "dump to cpu " << index << " gpu_value: " // << accessor_wrapper_ptr->ParseToString(gpu_val, - // int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / sizeof(float))) + // int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / + // sizeof(float))) // << " \t cpu_value:" // << cpu_table_accessor_->ParseToString(cpu_val, // downpour_value->size()); @@ -897,7 +899,7 @@ current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; -} +} // namespace paddle void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const int table_id, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index cf37737716f4c2..e94af9b3c1a67b 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -25,68 +25,6 @@ limitations under the License. */ namespace paddle { namespace framework { -// __global__ void PullCopy(float** dest, const FeatureValue* src, -// const int64_t* len, int hidden, int slot_num, -// int total_len, uint64_t** keys) { -// CUDA_KERNEL_LOOP(i, total_len) { -// int low = 0; -// int high = slot_num - 1; -// while (low < high) { -// int mid = (low + high) / 2; -// if (i < len[mid]) -// high = mid; -// else -// low = mid + 1; -// } -// int x = low; -// int y = i - (x ? len[x - 1] : 0); -// if (*(keys[x] + y) == 0) { -// *(dest[x] + y * hidden) = 0; -// *(dest[x] + y * hidden + 1) = 0; -// *(dest[x] + y * hidden + 2) = 0; -// } else { -// *(dest[x] + y * hidden) = (src + i)->show; -// *(dest[x] + y * hidden + 1) = (src + i)->clk; -// *(dest[x] + y * hidden + 2) = (src + i)->lr; -// } -// if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { -// for (int j = 0; j < hidden - 3; j++) { -// *(dest[x] + y * hidden + 3 + j) = 0; -// } -// } else { -// for (int j = 0; j < hidden - 3; j++) { -// *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; -// } -// } -// } -// } - -// template -// __global__ void PullCopy(float** dest, const float* src, -// const int64_t* len, int slot_num, int total_len, -// uint64_t** keys, uint64_t max_val_size, int* -// gpu_dim, -// FVAccessor feature_value_accessor) { -// CUDA_KERNEL_LOOP(i, total_len) { -// int low = 0; -// int high = slot_num - 1; -// while (low < high) { -// int mid = (low + high) / 2; -// if (i < len[mid]) -// high = mid; -// else -// low = mid + 1; -// } -// int x = low; -// int y = i - (x ? 
len[x - 1] : 0); -// float* feature_value_ptr = -// (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); -// int mf_dim = gpu_dim[x] - 3; -// feature_value_accessor.Select(dest[x] + y * (mf_dim + 3), -// feature_value_ptr, keys[x] + y, mf_dim); -// } -// } - __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, int slot_num, int total_len) { @@ -145,87 +83,6 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } -// void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, -// const std::vector& grad_values, -// FeaturePushValue* total_grad_values_gpu, -// const std::vector& slot_lengths, -// const int hidden_size, -// const int64_t total_length, -// const int batch_size) { -// auto stream = dynamic_cast( -// platform::DeviceContextPool::Instance().Get(place)) -// ->stream(); -// auto slot_lengths_lod = slot_lengths; -// for (int i = 1; i < slot_lengths_lod.size(); i++) { -// slot_lengths_lod[i] += slot_lengths_lod[i - 1]; -// } -// auto buf_grad_value = -// memory::Alloc(place, grad_values.size() * sizeof(float*)); -// auto buf_length = memory::Alloc(place, slot_lengths.size() * -// sizeof(int64_t)); -// auto buf_slot_vector = -// memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - -// float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); -// int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); -// int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - -// cudaMemcpy(gpu_values, grad_values.data(), -// grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); -// cudaMemcpy(gpu_len, slot_lengths_lod.data(), -// slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); -// cudaMemcpy(d_slot_vector, slot_vector_.data(), -// slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - -// PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( -// total_grad_values_gpu, gpu_values, gpu_len, hidden_size, -// slot_lengths.size(), total_length, batch_size, d_slot_vector); -// cudaStreamSynchronize(stream); -// } - -// void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, -// const std::vector& grad_values, -// float* total_grad_values_gpu, -// const std::vector& slot_lengths, -// const uint64_t total_length, -// const int batch_size, size_t grad_value_size) -// { -// auto stream = dynamic_cast( -// platform::DeviceContextPool::Instance().Get(place)) -// ->stream(); -// auto slot_lengths_lod = slot_lengths; -// for (int i = 1; i < slot_lengths_lod.size(); i++) { -// slot_lengths_lod[i] += slot_lengths_lod[i - 1]; -// } -// auto buf_grad_value = -// memory::Alloc(place, grad_values.size() * sizeof(float*)); -// auto buf_length = memory::Alloc(place, slot_lengths.size() * -// sizeof(int64_t)); -// auto buf_slot_vector = -// memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); -// auto buf_mf_dim_vector = -// memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); -// float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); -// int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); -// int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); -// int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); -// cudaMemcpy(gpu_values, grad_values.data(), -// grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); -// cudaMemcpy(gpu_len, slot_lengths_lod.data(), -// slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); -// cudaMemcpy(d_slot_vector, slot_vector_.data(), -// 
slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); -// cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector_.data(), -// slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); -// auto accessor_wrapper_ptr = -// GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); -// PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( -// total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), -// total_length, batch_size, d_slot_vector, d_mf_dim_vector, -// grad_value_size, accessor_wrapper_ptr->GetGPUAccessor()); -// cudaStreamSynchronize(stream); -// } - void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 6369a68f67d61b..101de5dbc72e8c 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -128,28 +128,6 @@ class PSGPUWrapper { void CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, int total_len); - // void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, - // const std::vector& values, - // const FeatureValue* total_values_gpu, const int64_t* - // gpu_len, const int slot_num, const int hidden_size, const - // int64_t total_length); - // void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, - // const std::vector& values, - // const float* total_values_gpu, const int64_t* gpu_len, - // const int slot_num, const int hidden_size, - // const int64_t total_length, int* gpu_dim); - // void CopyForPush(const paddle::platform::Place& place, - // const std::vector& grad_values, - // FeaturePushValue* total_grad_values_gpu, - // const std::vector& slot_lengths, - // const int hidden_size, const int64_t total_length, - // const int batch_size); - // void CopyForPush(const paddle::platform::Place& place, - // const std::vector& grad_values, - // float* total_grad_values_gpu, - // const std::vector& slot_lengths, - // const uint64_t total_length, const int batch_size, - // size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index b9a9b961ecf859..df8ad45bb472da 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -169,34 +169,6 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -// void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, -// uint64_t** gpu_keys, -// const std::vector& values, -// const FeatureValue* total_values_gpu, -// const int64_t* gpu_len, const int slot_num, -// const int hidden_size, -// const int64_t total_length) { -// XPUStream stream = nullptr; -// auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); -// stream = static_cast(dev_ctx) -// ->x_context() -// ->xpu_stream; -// float* buf_value = nullptr; -// xpu_malloc(reinterpret_cast(&buf_value), -// values.size() * sizeof(float*)); -// float** gpu_values = reinterpret_cast(&buf_value); -// xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), -// XPU_HOST_TO_DEVICE); - -// unsigned long long** c_keys = (unsigned long long**)gpu_keys; -// const long 
long* c_len = (const long long*)gpu_len; -// PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, -// hidden_size, -// slot_num, total_length, c_keys); - -// xpu_wait(stream); -// } - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -214,51 +186,6 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, xpu_wait(stream); } -// void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, -// const std::vector& grad_values, -// FeaturePushValue* total_grad_values_gpu, -// const std::vector& slot_lengths, -// const int hidden_size, -// const int64_t total_length, -// const int batch_size) { -// XPUStream stream = nullptr; -// auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); -// stream = static_cast(dev_ctx) -// ->x_context() -// ->xpu_stream; -// auto slot_lengths_lod = slot_lengths; -// for (size_t i = 1; i < slot_lengths_lod.size(); i++) { -// slot_lengths_lod[i] += slot_lengths_lod[i - 1]; -// } - -// float* buf_grad_value = nullptr; -// int64_t* buf_length = nullptr; -// int* buf_slot_vector = nullptr; - -// xpu_malloc(reinterpret_cast(&buf_grad_value), -// grad_values.size() * sizeof(float*)); -// xpu_malloc(reinterpret_cast(&buf_length), -// slot_lengths.size() * sizeof(int64_t)); -// xpu_malloc(reinterpret_cast(&buf_slot_vector), -// slot_lengths_lod.size() * sizeof(int)); - -// float** gpu_values = reinterpret_cast(&buf_grad_value); -// int64_t* gpu_len = reinterpret_cast(buf_length); -// int* d_slot_vector = reinterpret_cast(buf_slot_vector); -// xpu_memcpy(gpu_values, grad_values.data(), -// grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); -// xpu_memcpy(gpu_len, slot_lengths_lod.data(), -// slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); -// xpu_memcpy(d_slot_vector, slot_vector_.data(), -// slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); - -// long long* c_len = (long long*)gpu_len; -// PushCopy<<<2, 64, stream>>>(total_grad_values_gpu, gpu_values, c_len, -// hidden_size, slot_lengths.size(), total_length, -// batch_size, d_slot_vector); -// xpu_wait(stream); -// } - } // end namespace framework } // end namespace paddle #endif From f6ed501e0821ab0ca8db11b61b0badbd6d0fbd30 Mon Sep 17 00:00:00 2001 From: danleifeng Date: Thu, 7 Jul 2022 07:51:51 +0000 Subject: [PATCH 08/12] format; test=develop --- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index c9ff3bded53dc8..f19b0c41299e3f 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -872,34 +872,34 @@ void PSGPUWrapper::EndPass() { // downpour_value->size()); } #endif - free(test_build_values); -}; -if (multi_mf_dim_) { - VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_); - for (size_t i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + free(test_build_values); + }; + if (multi_mf_dim_) { + VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_); + for (size_t i = 0; i < device_num; i++) { 
+ for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + } + } + for (std::thread& t : threads) { + t.join(); } } - for (std::thread& t : threads) { - t.join(); + if (keysize_max != 0) { + HeterPs_->end_pass(); } -} -if (keysize_max != 0) { - HeterPs_->end_pass(); -} -VLOG(0) << "HeterPs_->end_pass end"; -for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; -} -gpu_task_pool_.Push(current_task_); -current_task_ = nullptr; -gpu_free_channel_->Put(current_task_); -timer.Pause(); -VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; -} // namespace paddle + VLOG(0) << "HeterPs_->end_pass end"; + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } + gpu_task_pool_.Push(current_task_); + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + timer.Pause(); + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + } // namespace paddle void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const int table_id, From 9e24bfb63b5b70080be662ed18a87377cd897408 Mon Sep 17 00:00:00 2001 From: danleifeng Date: Wed, 20 Jul 2022 08:49:47 +0000 Subject: [PATCH 09/12] add ut; test=develop --- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 176 +++++++++--------- .../test_fleet_distributed_strategy.py | 7 + 2 files changed, 96 insertions(+), 87 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index f19b0c41299e3f..82498d828ece77 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -591,73 +591,75 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { HeterPs_->set_sparse_sgd(optimizer_config_); HeterPs_->set_embedx_sgd(optimizer_config_); #endif - auto build_dynamic_mf_func = - [this, &gpu_task, &accessor_wrapper_ptr](int i, int j) { - this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim - << " feature_value_size:" - << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - size_t feature_value_size = - accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - this->mem_pools_[i * this->multi_mf_dim_ + j] = - new MemoryPool(len, feature_value_size); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - for (size_t k = 0; k < len; k++) { - void* val = mem_pool->mem_address(k); - // float* ptr_val = device_dim_ptrs[k]->data(); - // size_t dim = device_dim_ptrs[k]->size(); + auto build_dynamic_mf_func = [this, &gpu_task, &accessor_wrapper_ptr](int i, + int j) { + this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim + << " feature_value_size:" + << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + this->mem_pools_[i * this->multi_mf_dim_ + j] = + new MemoryPool(len, feature_value_size); + auto& mem_pool = 
this->mem_pools_[i * this->multi_mf_dim_ + j]; + #ifdef PADDLE_WITH_PSLIB - val->delta_score = - ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::delta_score_index()]; - val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::show_index()]; - val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::click_index()]; - val->slot = - int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::slot_index()]); - val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_w_index()]; - val->lr_g2sum = - ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_g2sum_index()]; - // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + for (size_t k = 0; k < len; k++) { + float* ptr_val = device_dim_ptrs[k]->data(); + size_t dim = device_dim_ptrs[k]->size(); + val->delta_score = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::mf_dim_index()] = - float(mf_dim); - val->mf_dim = mf_dim; - if (dim > 8) { // CpuPS alreay expand as mf_dim - val->mf_size = mf_dim + 1; - for (int x = 0; x < val->mf_dim + 1; x++) { - val->mf[x] = ptr_val[x + 8]; - } - } else { - val->mf_size = 0; - for (int x = 0; x < val->mf_dim + 1; x++) { - val->mf[x] = 0; - } - } + DownpourCtrDymfFeatureValue::delta_score_index()]; + val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()]; + val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()]; + val->slot = int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()]); + val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()]; + val->lr_g2sum = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()]; + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + mf_dim_index()] = float(mf_dim); + val->mf_dim = mf_dim; + if (dim > 8) { // CpuPS alreay expand as mf_dim + val->mf_size = mf_dim + 1; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = ptr_val[x + 8]; + } + } else { + val->mf_size = 0; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = 0; } + } + } #endif #ifdef PADDLE_WITH_PSCORE - // VLOG(5) << "cpu build " << k - // << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) - // << " |: " << cpu_table_accessor_->ParseToString(ptr_val, - // dim); - accessor_wrapper_ptr->BuildFill(val, device_dim_ptrs[k], - cpu_table_accessor_, mf_dim); - VLOG(5) << "build " << k << " : " - << accessor_wrapper_ptr->ParseToString( - (float*)(val), - int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / - sizeof(float))); - } + for (size_t k = 0; k < len; k++) { + void* val = mem_pool->mem_address(k); + // float* ptr_val = device_dim_ptrs[k]->data(); + // size_t dim = device_dim_ptrs[k]->size(); + // VLOG(5) << "cpu build " << k + // << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) + // << " |: " << cpu_table_accessor_->ParseToString(ptr_val, + // dim); + accessor_wrapper_ptr->BuildFill(val, device_dim_ptrs[k], + cpu_table_accessor_, mf_dim); + VLOG(5) << "build " << k << " : " + << accessor_wrapper_ptr->ParseToString( + (float*)(val), + int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / + sizeof(float))); + } #endif platform::CUDADeviceGuard guard(resource_->dev_id(i)); 
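
(A reading aid for the hunk above, not part of the patch.) The change replaces the hand-written per-field copy, still kept under PADDLE_WITH_PSLIB, with accessor_wrapper_ptr->BuildFill for PSCORE, so the GPU value layout is owned by the accessor instead of BuildGPUTask. Below is a minimal self-contained sketch of the same per-key CPU-to-GPU fill; GpuVal, FillOneKey and the header offsets 0..5 are illustrative stand-ins only (the real code reads the offsets through DownpourCtrDymfFeatureValue::*_index() or the CommonFeatureValue accessor):

#include <algorithm>
#include <vector>

// Illustrative stand-in for one pooled GPU feature value (not a Paddle type).
struct GpuVal {
  float delta_score, show, clk, slot, lr, lr_g2sum;
  int mf_dim, mf_size;
  float mf[64 + 1];  // embedx_g2sum plus up to 64 embedding weights (assumed bound)
};

// Sketch of the per-key fill done inside build_dynamic_mf_func.
// cpu_val is the flattened CPU feature value: an 8-float fixed header is
// assumed here, and anything beyond it is the already-expanded embedx part.
void FillOneKey(GpuVal* gpu, const std::vector<float>& cpu_val, int mf_dim) {
  gpu->delta_score = cpu_val[0];
  gpu->show = cpu_val[1];
  gpu->clk = cpu_val[2];
  gpu->slot = cpu_val[3];
  gpu->lr = cpu_val[4];
  gpu->lr_g2sum = cpu_val[5];
  gpu->mf_dim = mf_dim;
  if (cpu_val.size() > 8) {  // CPU table already expanded the mf part
    gpu->mf_size = mf_dim + 1;
    std::copy_n(cpu_val.begin() + 8, mf_dim + 1, gpu->mf);
  } else {                   // no mf yet: zero-initialize the tail
    gpu->mf_size = 0;
    std::fill_n(gpu->mf, mf_dim + 1, 0.f);
  }
}

In the patch itself this loop is what BuildFill now encapsulates, which is why the PSCORE branch above shrinks to a single accessor call per key.
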
@@ -872,34 +874,34 @@ void PSGPUWrapper::EndPass() { // downpour_value->size()); } #endif - free(test_build_values); - }; - if (multi_mf_dim_) { - VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_); - for (size_t i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); - } - } - for (std::thread& t : threads) { - t.join(); + free(test_build_values); +}; +if (multi_mf_dim_) { + VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); } } - if (keysize_max != 0) { - HeterPs_->end_pass(); - } - VLOG(0) << "HeterPs_->end_pass end"; - for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; + for (std::thread& t : threads) { + t.join(); } - gpu_task_pool_.Push(current_task_); - current_task_ = nullptr; - gpu_free_channel_->Put(current_task_); - timer.Pause(); - VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; - } // namespace paddle +} +if (keysize_max != 0) { + HeterPs_->end_pass(); +} +VLOG(0) << "HeterPs_->end_pass end"; +for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; +} +gpu_task_pool_.Push(current_task_); +current_task_ = nullptr; +gpu_free_channel_->Put(current_task_); +timer.Pause(); +VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; +} // namespace paddle void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const int table_id, diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index ffc3f2b21a476e..f81ea5f5572c5e 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -326,6 +326,13 @@ def test_fleet_desc_configs(self): .accessor.embed_sgd_param.adagrad.initial_range, 0.0001) + strategy = paddle.distributed.fleet.DistributedStrategy() + configs = {} + configs['emb'] = {"sparse_optimizer": "shared_adam"} + strategy.fleet_desc_configs = configs + self.assertEqual(strategy.sparse_table_configs[0] + .accessor.embed_sgd_param.adam.beta1_decay_rate, 0.9) + def test_trainer_desc_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() configs = { From 71a4b2fb1b58191305586f217435c29aa43c1758 Mon Sep 17 00:00:00 2001 From: danleifeng Date: Thu, 21 Jul 2022 14:31:57 +0800 Subject: [PATCH 10/12] fix format --- .../framework/fleet/heter_ps/heter_comm.h | 4 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 104 ++++---- .../fleet/heter_ps/heter_comm_kernel.cu | 241 ++++++++---------- 3 files changed, 155 insertions(+), 194 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 3d3e23c7967f23..0557b66d8655d1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -253,8 +253,6 @@ class HeterComm { void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, size_t val_size); - FVAccessor feature_value_accessor_; - protected: void 
pull_merge_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); void pull_normal_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); @@ -269,6 +267,8 @@ class HeterComm { int block_size_{256}; std::unique_ptr heter_comm_kernel_; + FVAccessor feature_value_accessor_; + private: int topo_aware_{0}; std::vector storage_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 9b4a33972eb44b..364193e6eb2568 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -59,7 +59,7 @@ HeterComm::HeterComm( accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); size_t grad_type_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); - size_t pull_type_size = + size_t pull_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size @@ -853,13 +853,13 @@ void HeterComm::split_input_to_shard( } template + typename FVAccessor> void HeterComm::merge_keys( - int gpu_num, const KeyType* d_keys, size_t len, // input - KeyType* d_sorted_keys, // output - KeyType* d_merged_keys, // output - uint32_t* d_restore_idx, // output - size_t& uniq_len) { // output + int gpu_num, const KeyType* d_keys, size_t len, // input + KeyType* d_sorted_keys, // output + KeyType* d_merged_keys, // output + uint32_t* d_restore_idx, // output + size_t& uniq_len) { // output int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -871,7 +871,8 @@ void HeterComm::merge_keys( size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 4 + 1)); - uint32_t* d_fea_num_info_ptr = reinterpret_cast(d_fea_num_info->ptr()); + uint32_t* d_fea_num_info_ptr = + reinterpret_cast(d_fea_num_info->ptr()); uint32_t* d_idx = (uint32_t*)&d_fea_num_info_ptr[len]; uint32_t* d_index = (uint32_t*)&d_idx[len]; uint32_t* d_offset = (uint32_t*)&d_index[len]; @@ -880,18 +881,18 @@ void HeterComm::merge_keys( size_t temp_storage_bytes; PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_sorted_keys, d_idx, d_index, len, - 0, 8 * sizeof(KeyType), stream)); + NULL, temp_storage_bytes, d_keys, d_sorted_keys, d_idx, d_index, len, 0, + 8 * sizeof(KeyType), stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_sorted_keys, - d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_sorted_keys, d_idx, + d_index, len, 0, 8 * sizeof(KeyType), stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_sorted_keys, d_merged_keys, d_fea_num_info_ptr, - d_merged_size, len, stream)); + NULL, temp_storage_bytes, d_sorted_keys, d_merged_keys, + d_fea_num_info_ptr, d_merged_size, len, stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -912,20 +913,20 @@ void HeterComm::merge_keys( d_temp_storage = memory::Alloc(place, temp_storage_bytes); } PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - 
d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); + d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, + uniq_len, stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - heter_comm_kernel_->fill_restore_idx( - true, len, uniq_len, d_merged_keys, d_index, d_offset, - d_fea_num_info_ptr, d_restore_idx, stream); + heter_comm_kernel_->fill_restore_idx(true, len, uniq_len, d_merged_keys, + d_index, d_offset, d_fea_num_info_ptr, + d_restore_idx, stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } template + typename FVAccessor> void HeterComm::pull_merge_sparse( - int num, KeyType* d_keys, float* d_vals, size_t len) { + int num, KeyType* d_keys, float* d_vals, size_t len) { int total_device = resource_->total_device(); int dev_id = resource_->dev_id(num); DevPlace place = DevPlace(dev_id); @@ -963,9 +964,7 @@ void HeterComm::pull_merge_sparse( auto accessor_wrapper_ptr = GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); - size_t val_type_size = - accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); - feature_value_accessor_.common_pull_value.Size(max_mf_dim_); + size_t val_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; auto d_sorted_keys = memory::Alloc(place, len * sizeof(KeyType)); auto d_sorted_keys_ptr = reinterpret_cast(d_sorted_keys->ptr()); @@ -979,19 +978,16 @@ void HeterComm::pull_merge_sparse( auto d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); size_t uniq_len = 0; - merge_keys(num, d_keys, len, - d_sorted_keys_ptr, - d_merged_keys_ptr, - d_restore_idx_ptr, - uniq_len); + merge_keys(num, d_keys, len, d_sorted_keys_ptr, d_merged_keys_ptr, + d_restore_idx_ptr, uniq_len); sync_stream(stream); auto d_idx = memory::Alloc(place, uniq_len * sizeof(int)); auto d_idx_ptr = reinterpret_cast(d_idx->ptr()); - split_input_to_shard(d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, num); - heter_comm_kernel_->fill_shard_key( - d_shard_keys_ptr, d_merged_keys_ptr, d_idx_ptr, uniq_len, - stream); + split_input_to_shard(d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, + d_right_ptr, num); + heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, d_merged_keys_ptr, + d_idx_ptr, uniq_len, stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); @@ -1055,17 +1051,14 @@ void HeterComm::pull_merge_sparse( auto d_merged_vals = memory::Alloc(place, uniq_len * val_type_size); auto d_merged_vals_ptr = reinterpret_cast(d_merged_vals->ptr()); - heter_comm_kernel_->dy_mf_fill_dvals( - d_shard_vals_ptr, d_merged_vals_ptr, - d_idx_ptr, uniq_len, - val_type_size, stream); + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_merged_vals_ptr, + d_idx_ptr, uniq_len, val_type_size, + stream); sync_stream(stream); - heter_comm_kernel_->unpack_merged_vals( - len, d_keys, - d_merged_vals_ptr, - d_restore_idx_ptr, - d_vals, val_type_size, stream); + heter_comm_kernel_->unpack_merged_vals(len, d_keys, d_merged_vals_ptr, + d_restore_idx_ptr, d_vals, + val_type_size, stream); sync_stream(stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { @@ -1078,9 +1071,9 @@ void HeterComm::pull_merge_sparse( } } template + typename FVAccessor> void HeterComm::pull_normal_sparse( - int num, KeyType* d_keys, float* d_vals, size_t len) { + int num, KeyType* d_keys, float* d_vals, size_t len) { int total_device = resource_->total_device(); int dev_id = resource_->dev_id(num); DevPlace place = 
DevPlace(dev_id); @@ -1208,8 +1201,10 @@ void HeterComm::pull_normal_sparse( } template -void HeterComm::pull_sparse( - int num, KeyType* d_keys, float* d_vals, size_t len) { +void HeterComm::pull_sparse(int num, + KeyType* d_keys, + float* d_vals, + size_t len) { if (len == 0) { return; } @@ -1711,16 +1706,11 @@ void HeterComm::end_pass() { template int HeterComm::dedup_keys_and_fillidx( - const int gpu_id, - const int total_fea_num, + const int gpu_id, const int total_fea_num, const KeyType* d_keys, // input KeyType* d_merged_keys, // output - KeyType* d_sorted_keys, - uint32_t* d_restore_idx, - uint32_t* d_sorted_idx, - uint32_t* d_offset, - uint32_t* d_merged_cnts, - bool filter_zero) { + KeyType* d_sorted_keys, uint32_t* d_restore_idx, uint32_t* d_sorted_idx, + uint32_t* d_offset, uint32_t* d_merged_cnts, bool filter_zero) { int dev_id = resource_->dev_id(gpu_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -1777,9 +1767,9 @@ int HeterComm::dedup_keys_and_fillidx( cudaMemsetAsync(d_restore_idx, 0, total_fea_num * sizeof(uint32_t), stream); } // fill restore idx [1,3,5,2,4,6] = [1,2,1,3,2,1] - heter_comm_kernel_->fill_restore_idx(filter_zero, - total_fea_num, merged_size, d_merged_keys, d_sorted_idx, - d_offset, d_merged_cnts, d_restore_idx, stream); + heter_comm_kernel_->fill_restore_idx(filter_zero, total_fea_num, merged_size, + d_merged_keys, d_sorted_idx, d_offset, + d_merged_cnts, d_restore_idx, stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 34cc9590aefd77..93ad75195882b5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -117,141 +117,112 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } -// template -// __global__ void dy_mf_fill_shard_grads_kernel( -// KeyType* d_shard_keys, KeyType* d_keys, float* d_shard_grads, -// float* d_grads, T* idx, size_t len, size_t grad_value_size, -// FVAccessor feature_value_accessor) { -// const size_t i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < len) { -// d_shard_keys[i] = d_keys[idx[i]]; -// float* cur = (float*)((char*)d_shard_grads + i * grad_value_size); -// float* shard_val = -// (float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); - -// feature_value_accessor.PushValueFill(cur, shard_val); -// } -// } - -// template -// __global__ void merge_gradients_basic_kernel( -// const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, -// const uint32_t* index, const char* input, char* output, int n, -// size_t grad_value_size, DynamicGradMerger& merger, -// FVAccessor& feature_value_accessor) { -// const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - -// if (i < n) { -// uint32_t start = offset[i]; -// uint32_t num = fea_num[i]; -// int ori_index = index[start]; -// float* out = (float*)(output + i * grad_value_size); -// float* in = (float*)(input + size_t(ori_index) * grad_value_size); -// merger.update_basic(out, in, feature_value_accessor); -// KeyType key = d_keys[i]; -// if (key != 0) { -// for (int j = 1; j < num; ++j) { -// ori_index = index[start + j]; -// in = (float*)(input + size_t(ori_index) * grad_value_size); -// merger.merge_basic(out, in, feature_value_accessor); -// } -// } -// } -// } - -// template -// __global__ void merge_gradients_embedx_kernel( -// const 
KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, -// const uint32_t* index, const char* input, char* output, int n, -// size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, -// FVAccessor& feature_value_accessor) { -// const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - -// if (i < n) { -// size_t value_idx = i / grad_dim; -// size_t field_idx = i % grad_dim; -// uint32_t start = offset[value_idx]; -// uint32_t num = fea_num[value_idx]; -// int ori_index = index[start]; -// float* in = (float*)(input + size_t(ori_index) * grad_value_size); -// float* out = (float*)(output + value_idx * grad_value_size); -// merger.update_embedx(out, in, field_idx, feature_value_accessor); -// KeyType key = d_keys[value_idx]; -// if (key != 0) { -// for (int j = 1; j < num; ++j) { -// int ori_index = index[start + j]; -// float* in = (float*)(input + size_t(ori_index) * grad_value_size); -// merger.merge_embedx(out, in, field_idx, feature_value_accessor); -// } -// } -// } -// } - -// __global__ void split_segments_kernel(const uint32_t* d_fea_num_info, size_t n, -// uint32_t* d_segments, -// uint32_t* d_segments_num, -// uint32_t segment_size) { -// const size_t tx = blockIdx.x * blockDim.x + threadIdx.x; -// if (tx >= n) { -// return; -// } - -// auto fea_num = d_fea_num_info[tx]; -// auto seg_num = (uint32_t)((fea_num - 1) / segment_size + 1); -// d_segments[tx] = seg_num; -// } - -// __global__ void expand_segments_kernel(const uint32_t* d_fea_num_info, -// const uint32_t* d_segments_offset, -// size_t n, -// uint32_t* d_segments_fea_num_info, -// uint32_t segment_size) { -// const size_t tx = blockIdx.x * blockDim.x + threadIdx.x; -// if (tx >= n) { -// return; -// } - -// auto fea_num = d_fea_num_info[tx]; -// auto seg_num = (uint32_t)((fea_num - 1) / segment_size + 1); -// auto start_pos = d_segments_offset[tx]; -// auto remains = fea_num; -// int cur_seg_size = 0; -// for (size_t i = 0; i < seg_num; ++i) { -// if (remains >= segment_size) { -// cur_seg_size = segment_size; -// } else { -// cur_seg_size = remains; -// } -// d_segments_fea_num_info[start_pos + i] = cur_seg_size; -// remains -= cur_seg_size; -// } -// } - -// template -// __global__ void shrink_keys_kernel(const KeyType* d_keys, -// const uint32_t* d_segments_offset, -// KeyType* d_segments_keys, size_t n) { -// const size_t tx = blockIdx.x * blockDim.x + threadIdx.x; -// if (tx >= n) { -// return; -// } - -// d_segments_keys[tx] = d_keys[d_segments_offset[tx]]; -// } - -// template -// __global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals, float* d_vals, -// T* idx, size_t len, size_t val_size, -// FVAccessor feature_value_accessor) { -// const size_t i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < len) { -// uint64_t new_offset = uint64_t(idx[i]) * val_size; -// float* cur = (float*)((char*)d_vals + new_offset); -// float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size); -// int mf_dim = int( -// shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]); - -// feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim); +template +__global__ void merge_gradients_basic_kernel( + const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, int n, + size_t grad_value_size, DynamicGradMerger& merger, + FVAccessor& feature_value_accessor) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < n) { + uint32_t start = offset[i]; + uint32_t num = fea_num[i]; + int ori_index 
= index[start]; + float* out = (float*)(output + i * grad_value_size); + float* in = (float*)(input + size_t(ori_index) * grad_value_size); + merger.update_basic(out, in, feature_value_accessor); + KeyType key = d_keys[i]; + if (key != 0) { + for (int j = 1; j < num; ++j) { + ori_index = index[start + j]; + in = (float*)(input + size_t(ori_index) * grad_value_size); + merger.merge_basic(out, in, feature_value_accessor); + } + } + } +} + +template +__global__ void merge_gradients_embedx_kernel( + const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, int n, + size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, + FVAccessor& feature_value_accessor) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < n) { + size_t value_idx = i / grad_dim; + size_t field_idx = i % grad_dim; + uint32_t start = offset[value_idx]; + uint32_t num = fea_num[value_idx]; + int ori_index = index[start]; + float* in = (float*)(input + size_t(ori_index) * grad_value_size); + float* out = (float*)(output + value_idx * grad_value_size); + merger.update_embedx(out, in, field_idx, feature_value_accessor); + KeyType key = d_keys[value_idx]; + if (key != 0) { + for (int j = 1; j < num; ++j) { + int ori_index = index[start + j]; + float* in = (float*)(input + size_t(ori_index) * grad_value_size); + merger.merge_embedx(out, in, field_idx, feature_value_accessor); + } + } + } +} + +__global__ void split_segments_kernel(const uint32_t* d_fea_num_info, size_t n, + uint32_t* d_segments, + uint32_t* d_segments_num, + uint32_t segment_size) { + const size_t tx = blockIdx.x * blockDim.x + threadIdx.x; + if (tx >= n) { + return; + } + + auto fea_num = d_fea_num_info[tx]; + auto seg_num = (uint32_t)((fea_num - 1) / segment_size + 1); + d_segments[tx] = seg_num; +} + +__global__ void expand_segments_kernel(const uint32_t* d_fea_num_info, + const uint32_t* d_segments_offset, + size_t n, + uint32_t* d_segments_fea_num_info, + uint32_t segment_size) { + const size_t tx = blockIdx.x * blockDim.x + threadIdx.x; + if (tx >= n) { + return; + } + + auto fea_num = d_fea_num_info[tx]; + auto seg_num = (uint32_t)((fea_num - 1) / segment_size + 1); + auto start_pos = d_segments_offset[tx]; + auto remains = fea_num; + int cur_seg_size = 0; + for (size_t i = 0; i < seg_num; ++i) { + if (remains >= segment_size) { + cur_seg_size = segment_size; + } else { + cur_seg_size = remains; + } + d_segments_fea_num_info[start_pos + i] = cur_seg_size; + remains -= cur_seg_size; + } +} + +template +__global__ void shrink_keys_kernel(const KeyType* d_keys, + const uint32_t* d_segments_offset, + KeyType* d_segments_keys, size_t n) { + const size_t tx = blockIdx.x * blockDim.x + threadIdx.x; + if (tx >= n) { + return; + } + + d_segments_keys[tx] = d_keys[d_segments_offset[tx]]; +} + template __global__ void unpack_merged_vals_kernel( const KeyType* d_keys, From 67f4e1e39e0ceb89ecd54e9e6ebd29c24fb1ec4b Mon Sep 17 00:00:00 2001 From: danleifeng Date: Thu, 21 Jul 2022 16:57:20 +0800 Subject: [PATCH 11/12] fix format --- .../framework/fleet/heter_ps/feature_value.cu | 10 +- .../framework/fleet/heter_ps/feature_value.h | 68 ++----- .../framework/fleet/heter_ps/heter_comm_inl.h | 11 +- .../framework/fleet/heter_ps/heter_ps.cu | 2 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 179 +++++++----------- 5 files changed, 108 insertions(+), 162 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu 
b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index 53836b077fbb9b..25ad7e99f28954 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -13,10 +13,15 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace framework { +const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS; +#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS) +#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0 + template __global__ void PullCopy(float** dest, const float* src, @@ -367,13 +372,14 @@ void AccessorWrapper::CopyForPullDedupImpl( const int hidden_size, const int64_t total_length, const int* slot_dims, - const uint32_t* gpu_restore_idx) { + const uint32_t* gpu_restore_idx, + int pull_value_size) { auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); size_t N = total_length * hidden_size; PullDedupCopy<<>>( - N, total_keys, gpu_values, total_values_gpu, slot_lens, pull_type_size_, + N, total_keys, gpu_values, total_values_gpu, slot_lens, pull_value_size, slot_dims, hidden_size, key2slot, gpu_restore_idx, gpu_accessor_.common_pull_value); cudaStreamSynchronize(stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 9179358697ecdc..d237521eecf91e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -183,6 +183,13 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { float mf_size std::vector embedx_w; */ + __host__ __device__ static int Dim(int embedx_dim) { + return 4 + embedx_dim; + } + __host__ __device__ int DimSize(size_t dim) { return sizeof(float); } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } __host__ __device__ int ShowIndex() { return 0; } __host__ __device__ int ClickIndex() { return 1; } __host__ __device__ int EmbedWIndex() { return 2; } @@ -190,9 +197,6 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { return 3; } // actual mf size (ex. 
0) __host__ __device__ int EmbedxWIndex() { return 4; } - __host__ __device__ int Size(const int mf_dim) { - return (4 + mf_dim) * sizeof(float); - } }; struct CommonPushValue { @@ -249,39 +253,6 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { } }; - struct CommonPullValue { - /* - float show; - float click; - float embed_w; - std::vector embedx_w; - */ - - __host__ __device__ static int Dim(int embedx_dim) { - return 3 + embedx_dim; - } - __host__ __device__ int DimSize(size_t dim) { return sizeof(float); } - __host__ __device__ int Size(int embedx_dim) { - return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); - } - __host__ __device__ int ShowIndex() { return 0; } - __host__ __device__ int ClickIndex() { return 1; } - __host__ __device__ int EmbedWIndex() { return 2; } - __host__ __device__ int EmbedxWIndex() { return 3; } - __host__ __device__ float& Show(float* val) { - return val[CommonPullValue::ShowIndex()]; - } - __host__ __device__ float& Click(float* val) { - return val[CommonPullValue::ClickIndex()]; - } - __host__ __device__ float& EmbedW(float* val) { - return val[CommonPullValue::EmbedWIndex()]; - } - __host__ __device__ float* EmbedxW(float* val) { - return val + CommonPullValue::EmbedxWIndex(); - } - }; - __host__ __device__ CommonFeatureValueAccessor() {} __host__ __device__ ~CommonFeatureValueAccessor() {} @@ -532,22 +503,22 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { *(dest_val + common_pull_value.EmbedWIndex()) = 0; } else { *(dest_val + common_pull_value.ShowIndex()) = - src_val[common_feature_value.ShowIndex()]; + src_val[common_pull_value.ShowIndex()]; *(dest_val + common_pull_value.ClickIndex()) = - src_val[common_feature_value.ClickIndex()]; + src_val[common_pull_value.ClickIndex()]; *(dest_val + common_pull_value.EmbedWIndex()) = - src_val[common_feature_value.EmbedWIndex()]; + src_val[common_pull_value.EmbedWIndex()]; } - if (src_val[common_feature_value.MfSizeIndex()] == 0 || *key == 0) { + if (src_val[common_pull_value.MfSizeIndex()] == 0 || *key == 0) { for (int j = 0; j < mf_dim; j++) { *(dest_val + common_pull_value.EmbedxWIndex() + j) = 0; } } else { for (int j = 0; j < mf_dim; j++) { - *(dest_val + common_pull_value.EmbedxWIndex() + j) = - src_val[common_feature_value.EmbedxWIndex() + j]; - // src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j]; + // common_pull_value EmbedxWIndex 之前还有 MfSizeIndex, + // 所以这里没有直接使用 common_pull_value.EmbedxWIndex() + *(dest_val + 3 + j) = src_val[common_pull_value.EmbedxWIndex() + j]; } } } @@ -698,7 +669,8 @@ class VirtualAccessor { const int64_t* slot_lens, const int* key2slot, const int hidden_size, const int64_t total_length, const int* slot_dims, - const uint32_t* gpu_restore_idx) = 0; + const uint32_t* gpu_restore_idx, + int pull_value_size) = 0; virtual void CopyForPush(const paddle::platform::Place& place, const std::vector& grad_values, @@ -787,10 +759,11 @@ class AccessorWrapper : public VirtualAccessor { const int64_t* slot_lens, const int* key2slot, const int hidden_size, const int64_t total_length, const int* slot_dims, - const uint32_t* gpu_restore_idx) { + const uint32_t* gpu_restore_idx, + int pull_value_size) { CopyForPullDedupImpl(place, total_keys, gpu_values, total_values_gpu, slot_lens, key2slot, hidden_size, total_length, - slot_dims, gpu_restore_idx); + slot_dims, gpu_restore_idx, pull_value_size); } virtual void CopyForPush(const paddle::platform::Place& place, @@ -857,7 +830,8 @@ class AccessorWrapper : public VirtualAccessor { const int64_t* 
slot_lens, const int* key2slot, const int hidden_size, const int64_t total_length, const int* slot_dims, - const uint32_t* gpu_restore_idx); + const uint32_t* gpu_restore_idx, + int pull_value_size); void CopyForPushDedupImpl(const paddle::platform::Place& place, const uint64_t* total_keys, float** grad_values, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 364193e6eb2568..d931eace59a32f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -1023,12 +1023,14 @@ void HeterComm::pull_merge_sparse( if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->get(reinterpret_cast(node.key_storage), node.val_storage, h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + resource_->remote_stream(i, num), + feature_value_accessor_); } else { ptr_tables_[i]->get( d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_vals_ptr) + h_left[i] * val_type_size, - h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num)); + h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num), + feature_value_accessor_); } } @@ -1200,8 +1202,9 @@ void HeterComm::pull_normal_sparse( } } -template -void HeterComm::pull_sparse(int num, +template +void HeterComm::pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index aff88b01339fbe..32f037b2489442 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -96,7 +96,7 @@ void HeterPs::show_one_table(int gpu_num) { } template -void HeterPs::push_sparse(int num, FeatureKey* d_keys, float* d_grads, +void HeterPs::push_sparse(int num, FeatureKey* d_keys, float* d_grads, size_t len) { if (accessor_type_ == "CtrDymfAccessor") { if (optimizer_type_ == 3) { // adam diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index a16a59964ac8ed..15de47a62283af 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -682,6 +682,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } delete mem_pool; }; + threads.resize(device_num * multi_mf_dim_); for (int i = 0; i < device_num; i++) { for (int j = 0; j < multi_mf_dim_; j++) { @@ -697,13 +698,6 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { << " s."; } -for (std::thread& t : threads) { - t.join(); -} -timeline.Pause(); -VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; -} // namespace framework - void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { platform::Timer timer; VLOG(3) << "Begin LoadIntoMemory(), dataset[" << dataset_ << "]"; @@ -820,107 +814,76 @@ void PSGPUWrapper::EndPass() { auto accessor_wrapper_ptr = GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); - auto dump_pool_to_cpu_func = - [this, &accessor_wrapper_ptr](int i, int j) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); - auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = this->current_task_->device_dim_keys_[i][j]; - size_t len = device_keys.size(); - int mf_dim = this->index_dim_vec_[j]; - size_t feature_value_size = - accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim - << " 
key_len :" << len - << " feature_value_size:" << feature_value_size; - - char* test_build_values = (char*)malloc(feature_value_size * len); - cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, - cudaMemcpyDeviceToHost); - - CHECK(len == hbm_pool->capacity()); - uint64_t unuse_key = std::numeric_limits::max(); - for (size_t index = 0; index < len; ++index) { - if (device_keys[index] == unuse_key) { - continue; - } - size_t offset = index * feature_value_size; - float* gpu_val = (float*)(test_build_values + offset); + auto dump_pool_to_cpu_func = [this, &accessor_wrapper_ptr](int i, int j) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); + auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim + << " key_len :" << len + << " feature_value_size:" << feature_value_size; + + char* test_build_values = (char*)malloc(feature_value_size * len); + cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, + cudaMemcpyDeviceToHost); + + CHECK(len == hbm_pool->capacity()); + uint64_t unuse_key = std::numeric_limits::max(); + for (size_t index = 0; index < len; ++index) { + if (device_keys[index] == unuse_key) { + continue; + } + size_t offset = index * feature_value_size; + float* gpu_val = (float*)(test_build_values + offset); #ifdef PADDLE_WITH_PSLIB - auto* downpour_value = - (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val->mf_size > 0 && downpour_value_size == 8) { - downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::delta_score_index()] = - gpu_val->delta_score; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::show_index()] = - gpu_val->show; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::click_index()] = - gpu_val->clk; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_w_index()] = - gpu_val->lr; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_g2sum_index()] = - gpu_val->lr_g2sum; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::slot_index()] = - gpu_val->slot; - - if (gpu_val->mf_size > 0) { - for (int x = 0; x < gpu_val->mf_dim + 1; x++) { - cpu_val[x + 8] = gpu_val->mf[x]; - } - } - } + // TODO: pslib DumpFill #endif #ifdef PADDLE_WITH_PSCORE - accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); - // auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*( - // reinterpret_cast(gpu_val))); - // float* cpu_val = downpour_value->data(); - // VLOG(5) << "dump to cpu " << index << " gpu_value: " - // << accessor_wrapper_ptr->ParseToString(gpu_val, - // int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / - // sizeof(float))) - // << " \t cpu_value:" - // << cpu_table_accessor_->ParseToString(cpu_val, - // downpour_value->size()); - } + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); + // auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*( + // reinterpret_cast(gpu_val))); + // float* 
cpu_val = downpour_value->data(); + // VLOG(5) << "dump to cpu " << index << " gpu_value: " + // << accessor_wrapper_ptr->ParseToString(gpu_val, + // int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / + // sizeof(float))) + // << " \t cpu_value:" + // << cpu_table_accessor_->ParseToString(cpu_val, + // downpour_value->size()); + } #endif - free(test_build_values); -}; -if (multi_mf_dim_) { - VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_); - for (size_t i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + free(test_build_values); + }; + + if (multi_mf_dim_) { + VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + } + } + for (std::thread& t : threads) { + t.join(); } } - for (std::thread& t : threads) { - t.join(); + if (keysize_max != 0) { + HeterPs_->end_pass(); } + VLOG(0) << "HeterPs_->end_pass end"; + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } + gpu_task_pool_.Push(current_task_); + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + timer.Pause(); + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } -if (keysize_max != 0) { - HeterPs_->end_pass(); -} -VLOG(0) << "HeterPs_->end_pass end"; -for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; -} -gpu_task_pool_.Push(current_task_); -current_task_ = nullptr; -gpu_free_channel_->Put(current_task_); -timer.Pause(); -VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; -} // namespace paddle void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const int table_id, @@ -947,7 +910,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, auto accessor_wrapper_ptr = GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); size_t feature_value_size = - accessor_wrapper_ptr->GetFillValueSize(max_mf_dim_); + accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); VLOG(3) << "PullSparse max_dim:" << max_mf_dim_ << " pull_feature_value_size:" << pull_type_size_; @@ -1048,9 +1011,9 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, dedup_size); // values.size() not sure equal slot_num - this->CopyForPull(place, total_keys, gpu_values, total_values_gpu, + accessor_wrapper_ptr->CopyForPull(place, total_keys, gpu_values, total_values_gpu, slot_lens, key2slot, max_mf_dim_ + 3, total_length, - gpu_slot_dims, d_restore_idx); + gpu_slot_dims, d_restore_idx, feature_value_size); } else { size_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); @@ -1097,7 +1060,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, accessor_wrapper_ptr->CopyForPull( place, gpu_keys, values, total_values_gpu, gpu_len, static_cast(slot_lengths.size()), hidden_size, total_length, - gpu_dim, val_type_size_); + gpu_dim, feature_value_size); } pull_gpups_timer.Pause(); @@ -1151,7 +1114,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, accessor_wrapper_ptr->CopyForPull( place, xpu_keys, values, total_values_gpu, xpu_len, static_cast(slot_lengths.size()), hidden_size, 
total_length, - val_type_size_); + feature_value_size); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1221,7 +1184,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, if (total_length > dedup_size * 3) { const uint32_t* d_restore_idx = reinterpret_cast(&key2slot[total_length]); - this->CopyForPush(place, total_keys, gpu_values, total_grad_values_gpu, + accessor_wrapper_ptr->CopyForPush(place, total_keys, gpu_values, total_grad_values_gpu, d_slot_vector, slot_lens, max_mf_dim_ + 3, total_length, dedup_size, batch_size, slot_dims, key2slot, d_restore_idx, grad_value_size); @@ -1232,7 +1195,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, reinterpret_cast(&d_sorted_idx[total_length]); const uint32_t* d_merged_cnts = reinterpret_cast(&d_offset[total_length]); - this->CopyForPush(place, d_merged_keys, gpu_values, + accessor_wrapper_ptr->CopyForPush(place, d_merged_keys, gpu_values, total_grad_values_gpu, d_slot_vector, slot_lens, max_mf_dim_ + 3, total_length, dedup_size, batch_size, slot_dims, key2slot, d_sorted_idx, d_offset, @@ -1309,6 +1272,6 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, VLOG(3) << "End PushSparseGrad"; } -} // end namespace framework +} // namespace framework } // end namespace paddle #endif From be9157120d4387582481f2dcd1bc2325a9c05162 Mon Sep 17 00:00:00 2001 From: danleifeng Date: Fri, 22 Jul 2022 16:44:46 +0800 Subject: [PATCH 12/12] format --- .../framework/fleet/heter_ps/heter_comm_inl.h | 1235 ++++++++++++----- .../fleet/heter_ps/heter_comm_kernel.h | 5 - .../framework/fleet/heter_ps/heter_ps.cc | 22 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 478 ++++--- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 2 +- 5 files changed, 1197 insertions(+), 545 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index d931eace59a32f..3ce8a315af9fb2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -32,7 +32,9 @@ DECLARE_int32(gpugraph_dedup_pull_push_mode); namespace paddle { namespace framework { -template HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { @@ -78,7 +80,9 @@ HeterComm::HeterComm( init_path(); } -template void HeterComm::init_path() { int total_device = resource_->total_device(); @@ -132,12 +136,18 @@ void HeterComm::init_path() { } } -template template void HeterComm::memory_copy( - DstPlace dst_place, void* dst, SrcPlace src_place, const void* src, - size_t count, StreamType stream) { + DstPlace dst_place, + void* dst, + SrcPlace src_place, + const void* src, + size_t count, + StreamType stream) { #if defined(PADDLE_WITH_CUDA) CUDA_CHECK(cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream)); if (stream == 0) { @@ -148,7 +158,9 @@ void HeterComm::memory_copy( #endif } -template void HeterComm::create_storage( int start_index, int end_index, size_t keylen, size_t vallen) { @@ -160,11 +172,13 @@ void HeterComm::create_storage( PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].key_storage), // NOLINT - keylen, resource_->remote_stream(nodes[i].dev_num, start_index))); + keylen, + resource_->remote_stream(nodes[i].dev_num, start_index))); PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].val_storage), // NOLINT - vallen, 
resource_->remote_stream(nodes[i].dev_num, start_index))); + vallen, + resource_->remote_stream(nodes[i].dev_num, start_index))); nodes[i].key_bytes_len = keylen; nodes[i].val_bytes_len = vallen; } @@ -183,7 +197,9 @@ void HeterComm::create_storage( #endif } -template void HeterComm::destroy_storage( int start_index, int end_index) { @@ -201,10 +217,16 @@ void HeterComm::destroy_storage( #endif } -template void HeterComm::walk_to_dest( - int start_index, int num, int* h_left, int* h_right, KeyType* src_key, + int start_index, + int num, + int* h_left, + int* h_right, + KeyType* src_key, GradType* src_val) { int need_copy_val = 0; if (src_val) { @@ -225,18 +247,24 @@ void HeterComm::walk_to_dest( auto src_place = DevPlace(src_dev_id); auto dst_place = DevPlace(dst_dev_id); - memory_copy(dst_place, node.key_storage, src_place, + memory_copy(dst_place, + node.key_storage, + src_place, reinterpret_cast(src_key + h_left[i]), - node.key_bytes_len, node.in_stream); + node.key_bytes_len, + node.in_stream); // #if defined(PADDLE_WITH_CUDA) // adapt for gpu-graph // cudaMemsetAsync(node.val_storage, -1, node.val_bytes_len, // node.in_stream); // #endif if (need_copy_val) { - memory_copy(dst_place, node.val_storage, src_place, + memory_copy(dst_place, + node.val_storage, + src_place, reinterpret_cast(src_val + h_left[i]), - node.val_bytes_len, node.in_stream); + node.val_bytes_len, + node.in_stream); } } while (!que.empty()) { @@ -258,13 +286,17 @@ void HeterComm::walk_to_dest( auto src_place = DevPlace(src_dev_id); auto dst_place = DevPlace(dst_dev_id); - memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].key_storage, - src_place, cur_task.path->nodes_[cur_step].key_storage, + memory_copy(dst_place, + cur_task.path->nodes_[cur_step + 1].key_storage, + src_place, + cur_task.path->nodes_[cur_step].key_storage, cur_task.path->nodes_[cur_step + 1].key_bytes_len, cur_task.path->nodes_[cur_step + 1].in_stream); if (need_copy_val) { - memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].val_storage, - src_place, cur_task.path->nodes_[cur_step].val_storage, + memory_copy(dst_place, + cur_task.path->nodes_[cur_step + 1].val_storage, + src_place, + cur_task.path->nodes_[cur_step].val_storage, cur_task.path->nodes_[cur_step + 1].val_bytes_len, cur_task.path->nodes_[cur_step + 1].in_stream); } @@ -272,11 +304,18 @@ void HeterComm::walk_to_dest( } } -template void HeterComm::walk_to_dest( - int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, - char* src_val, size_t val_size) { + int start_index, + int gpu_num, + int* h_left, + int* h_right, + KeyType* src_key, + char* src_val, + size_t val_size) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -290,13 +329,18 @@ void HeterComm::walk_to_dest( auto& node = path_[start_index][i].nodes_[0]; CopyTask t(&path_[start_index][i], 0); que.push(t); - CUDA_CHECK(cudaMemcpyAsync( - node.key_storage, reinterpret_cast(src_key + h_left[i]), - node.key_bytes_len, cudaMemcpyDefault, node.in_stream)); + CUDA_CHECK(cudaMemcpyAsync(node.key_storage, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, + cudaMemcpyDefault, + node.in_stream)); if (need_copy_val) { - CUDA_CHECK(cudaMemcpyAsync( - node.val_storage, src_val + uint64_t(h_left[i]) * uint64_t(val_size), - node.val_bytes_len, cudaMemcpyDefault, node.in_stream)); + CUDA_CHECK( + cudaMemcpyAsync(node.val_storage, + src_val + uint64_t(h_left[i]) * uint64_t(val_size), + node.val_bytes_len, + cudaMemcpyDefault, + node.in_stream)); } } while (!que.empty()) { @@ -310,26 
+354,34 @@ void HeterComm::walk_to_dest( int cur_step = cur_task.step; CopyTask c(cur_task.path, cur_step + 1); que.push(c); - CUDA_CHECK(cudaMemcpyAsync( - cur_task.path->nodes_[cur_step + 1].key_storage, - cur_task.path->nodes_[cur_step].key_storage, - cur_task.path->nodes_[cur_step + 1].key_bytes_len, cudaMemcpyDefault, - cur_task.path->nodes_[cur_step + 1].in_stream)); + CUDA_CHECK( + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, + cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream)); if (need_copy_val) { - CUDA_CHECK(cudaMemcpyAsync( - cur_task.path->nodes_[cur_step + 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step + 1].val_bytes_len, - cudaMemcpyDefault, cur_task.path->nodes_[cur_step + 1].in_stream)); + CUDA_CHECK( + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream)); } } } } -template void HeterComm::walk_to_src( - int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, + int start_index, + int gpu_num, + int* h_left, + int* h_right, + char* src_val, size_t val_size) { std::queue que; for (int i = 0; i < gpu_num; i++) { @@ -340,8 +392,10 @@ void HeterComm::walk_to_src( auto& node = path_[start_index][i].nodes_[cur_step]; if (cur_step == 0) { CUDA_CHECK(cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, - node.val_storage, node.val_bytes_len, - cudaMemcpyDefault, node.out_stream)); + node.val_storage, + node.val_bytes_len, + cudaMemcpyDefault, + node.out_stream)); } else { CopyTask t(&path_[start_index][i], cur_step - 1); que.push(t); @@ -363,23 +417,27 @@ void HeterComm::walk_to_src( if (cur_step > 0) { CopyTask c(cur_task.path, cur_step - 1); que.push(c); - CUDA_CHECK(cudaMemcpyAsync( - cur_task.path->nodes_[cur_step - 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, cudaMemcpyDefault, - cur_task.path->nodes_[cur_step - 1].out_stream)); + CUDA_CHECK( + cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream)); } else if (cur_step == 0) { int end_index = cur_task.path->nodes_.back().dev_num; - CUDA_CHECK(cudaMemcpyAsync( - src_val + uint64_t(h_left[end_index]) * val_size, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step].val_bytes_len, cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream)); + CUDA_CHECK( + cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream)); } } } -template HeterComm::~HeterComm() { if (!multi_mf_dim_) { @@ -399,7 +457,9 @@ HeterComm::~HeterComm() { } } -template void HeterComm::show_one_table( int gpu_num) { @@ -408,10 +468,12 @@ void HeterComm::show_one_table( } } -template -void HeterComm::show_table_collisions() { +void HeterComm:: + show_table_collisions() { size_t idx = 0; for (auto& table : tables_) { if (table != nullptr) { @@ -426,7 +488,9 @@ void HeterComm int HeterComm::log2i(int x) { unsigned res = 0; @@ -436,14 
+500,18 @@ int HeterComm::log2i(int x) { return res; } -template int HeterComm::get_index_by_devid( int devid) { return resource_->get_index_by_devid(devid); } -template void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { @@ -453,7 +521,9 @@ void HeterComm::set_sparse_sgd( } } -template void HeterComm::set_embedx_sgd( const OptimizerConfig& optimizer_config) { @@ -463,11 +533,18 @@ void HeterComm::set_embedx_sgd( } } -template void HeterComm::build_ps( - int dev_num, KeyType* h_keys, ValType* h_vals, size_t len, - size_t chunk_size, int stream_num, int offset) { + int dev_num, + KeyType* h_keys, + ValType* h_vals, + size_t len, + size_t chunk_size, + int stream_num, + int offset) { if (len <= 0) { return; } @@ -502,17 +579,24 @@ void HeterComm::build_ps( auto dst_place = place; auto src_place = platform::CPUPlace(); - memory_copy( - dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), - src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); - memory_copy( - dst_place, reinterpret_cast(d_val_bufs[cur_stream]->ptr()), - src_place, h_vals + cur_len, sizeof(ValType) * tmp_len, cur_use_stream); + memory_copy(dst_place, + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, + h_keys + cur_len, + sizeof(KeyType) * tmp_len, + cur_use_stream); + memory_copy(dst_place, + reinterpret_cast(d_val_bufs[cur_stream]->ptr()), + src_place, + h_vals + cur_len, + sizeof(ValType) * tmp_len, + cur_use_stream); if (offset == -1) offset = dev_num; tables_[offset]->insert( reinterpret_cast(d_key_bufs[cur_stream]->ptr()), reinterpret_cast(d_val_bufs[cur_stream]->ptr()), - (size_t)tmp_len, cur_use_stream); + (size_t)tmp_len, + cur_use_stream); cur_stream += 1; cur_len += tmp_len; @@ -523,11 +607,18 @@ void HeterComm::build_ps( } } -template void HeterComm::build_ps( - int num, KeyType* h_keys, char* pool, size_t len, size_t feature_value_size, - size_t chunk_size, int stream_num) { + int num, + KeyType* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { if (len <= 0) { return; } @@ -560,12 +651,19 @@ void HeterComm::build_ps( auto dst_place = place; auto src_place = platform::CPUPlace(); - memory_copy( - dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), - src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); + memory_copy(dst_place, + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, + h_keys + cur_len, + sizeof(KeyType) * tmp_len, + cur_use_stream); ptr_tables_[num]->insert( - reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, - pool, feature_value_size, cur_len, cur_use_stream); + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + tmp_len, + pool, + feature_value_size, + cur_len, + cur_use_stream); cur_stream += 1; cur_len += tmp_len; } @@ -575,10 +673,15 @@ void HeterComm::build_ps( } } -template void HeterComm::merge_grad( - int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, + int dev_num, + KeyType* d_keys, + GradType* d_grads, + size_t len, int& uniq_len) { // NOLINT int dev_id = resource_->dev_id(dev_num); DevPlace place = DevPlace(dev_id); @@ -590,38 +693,75 @@ void HeterComm::merge_grad( auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - heter_comm_kernel_->sort_pairs(NULL, temp_storage_bytes, d_keys, - d_merge_keys_ptr, d_grads, d_merge_grads_ptr, - len, 0, 8 * sizeof(KeyType), stream, false); + heter_comm_kernel_->sort_pairs(NULL, + temp_storage_bytes, 
+ d_keys, + d_merge_keys_ptr, + d_grads, + d_merge_grads_ptr, + len, + 0, + 8 * sizeof(KeyType), + stream, + false); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); + heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_grads, + d_merge_grads_ptr, + len, + 0, + 8 * sizeof(KeyType), + stream, + false); temp_storage_bytes = 0; auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - heter_comm_kernel_->reduce_by_key(NULL, temp_storage_bytes, d_merge_keys_ptr, - d_keys, d_merge_grads_ptr, d_grads, - d_num_runs_out, len, stream, false); + heter_comm_kernel_->reduce_by_key(NULL, + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_merge_grads_ptr, + d_grads, + d_num_runs_out, + len, + stream, + false); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - heter_comm_kernel_->reduce_by_key( - d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); + heter_comm_kernel_->reduce_by_key(d_temp_storage->ptr(), + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_merge_grads_ptr, + d_grads, + d_num_runs_out, + len, + stream, + false); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), - stream); + memory_copy( + dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), stream); sync_stream(stream); } -template void HeterComm::dynamic_merge_grad( - int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len, - size_t& segment_len, bool enable_segment_merge_grad) { + int gpu_num, + KeyType* d_keys, + float* d_grads, + size_t len, + int& uniq_len, + size_t& segment_len, + bool enable_segment_merge_grad) { int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -643,69 +783,127 @@ void HeterComm::dynamic_merge_grad( int* d_merged_size = (int*)&d_idx[len]; heter_comm_kernel_->fill_idx(d_idx, len, stream); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, - 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, - d_merged_size, len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(NULL, + 
temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_fea_num_info_ptr, d_merged_size, len, stream)); - - cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(d_temp_storage->ptr(), + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); + + cudaMemcpyAsync((void*)&uniq_len, + d_merged_size, + sizeof(int), + cudaMemcpyDeviceToHost, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); assert(d_merged_size > 0); uint32_t* d_offset = (uint32_t*)&d_index[len]; temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, - uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); if (enable_segment_merge_grad) { - segment_merge_grad(gpu_num, d_merge_keys_ptr, d_grads, d_index, len, - d_fea_num_info_ptr, uniq_len, segment_len); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_keys, d_merge_keys_ptr, sizeof(KeyType) * segment_len, - cudaMemcpyDeviceToDevice, stream)); + segment_merge_grad(gpu_num, + d_merge_keys_ptr, + d_grads, + d_index, + len, + d_fea_num_info_ptr, + uniq_len, + segment_len); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, + d_merge_keys_ptr, + sizeof(KeyType) * segment_len, + cudaMemcpyDeviceToDevice, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } else { auto d_merge_grads = memory::Alloc(place, len * grad_value_size); float* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - heter_comm_kernel_->merge_gradient( - d_keys, d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, - (char*)d_merge_grads_ptr, uniq_len, grad_dim, grad_value_size, merger_, - stream, feature_value_accessor_); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len, - cudaMemcpyDeviceToDevice, stream)); + heter_comm_kernel_->merge_gradient(d_keys, + d_offset, + d_fea_num_info_ptr, + d_index, + (char*)d_grads, + (char*)d_merge_grads_ptr, + uniq_len, + grad_dim, + grad_value_size, + merger_, + stream, + feature_value_accessor_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, + d_merge_grads_ptr, + grad_value_size * uniq_len, + cudaMemcpyDeviceToDevice, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } } -template void HeterComm::segment_merge_grad( int gpu_num, // the device number @@ -744,78 +942,127 @@ void HeterComm::segment_merge_grad( CUDA_CHECK(cudaMemsetAsync(d_segments_num, 0, 
sizeof(uint32_t), stream)); uint32_t segment_size = FLAGS_gpugraph_merge_grads_segment_size; - heter_comm_kernel_->split_segments(d_fea_num_info, uniq_len, d_segments, - d_segments_num, segment_size, stream); + heter_comm_kernel_->split_segments(d_fea_num_info, + uniq_len, + d_segments, + d_segments_num, + segment_size, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); size_t temp_storage_bytes = 0; PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum( NULL, temp_storage_bytes, d_segments, d_segments_num, uniq_len, stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS( - cub::DeviceReduce::Sum(d_temp_storage->ptr(), temp_storage_bytes, - d_segments, d_segments_num, uniq_len, stream)); - CUDA_CHECK(cudaMemcpyAsync(&segments_num, d_segments_num, sizeof(uint32_t), - cudaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum(d_temp_storage->ptr(), + temp_storage_bytes, + d_segments, + d_segments_num, + uniq_len, + stream)); + CUDA_CHECK(cudaMemcpyAsync(&segments_num, + d_segments_num, + sizeof(uint32_t), + cudaMemcpyDeviceToHost, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - cub::DeviceScan::ExclusiveSum(NULL, temp_storage_bytes, d_segments, - d_segments_offset, uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_segments, + d_segments_offset, + uniq_len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_segments, d_segments_offset, - uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_segments, + d_segments_offset, + uniq_len, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - heter_comm_kernel_->expand_segments(d_fea_num_info, d_segments_offset, - uniq_len, d_segments_fea_num_info, - segment_size, stream); + heter_comm_kernel_->expand_segments(d_fea_num_info, + d_segments_offset, + uniq_len, + d_segments_fea_num_info, + segment_size, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_segments_fea_num_info, - d_segments_fea_num_offset, segments_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_segments_fea_num_info, + d_segments_fea_num_offset, + segments_num, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_segments_fea_num_info, - d_segments_fea_num_offset, segments_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_segments_fea_num_info, + d_segments_fea_num_offset, + segments_num, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); auto d_segments_keys = memory::Alloc(place, sizeof(KeyType) * segments_num); auto d_segments_keys_ptr = reinterpret_cast(d_segments_keys->ptr()); - heter_comm_kernel_->shrink_keys(d_keys, d_segments_fea_num_offset, - d_segments_keys_ptr, segments_num, stream); + 
heter_comm_kernel_->shrink_keys(d_keys, + d_segments_fea_num_offset, + d_segments_keys_ptr, + segments_num, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); auto d_segment_grads = memory::Alloc(place, segments_num * grad_value_size); auto d_segment_grads_ptr = reinterpret_cast(d_segment_grads->ptr()); - heter_comm_kernel_->merge_gradient( - d_segments_keys_ptr, d_segments_fea_num_offset, d_segments_fea_num_info, - d_index, (char*)d_grads, (char*)d_segment_grads_ptr, segments_num, - grad_dim, grad_value_size, merger_, stream, feature_value_accessor_); + heter_comm_kernel_->merge_gradient(d_segments_keys_ptr, + d_segments_fea_num_offset, + d_segments_fea_num_info, + d_index, + (char*)d_grads, + (char*)d_segment_grads_ptr, + segments_num, + grad_dim, + grad_value_size, + merger_, + stream, + feature_value_accessor_); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, d_segments_keys_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, + d_segments_keys_ptr, sizeof(KeyType) * segments_num, - cudaMemcpyDeviceToDevice, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_segment_grads_ptr, + cudaMemcpyDeviceToDevice, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, + d_segment_grads_ptr, grad_value_size * segments_num, - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template void HeterComm::split_input_to_shard( - KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, + KeyType* d_keys, + int* d_idx_ptr, + size_t len, + int* left, + int* right, int dev_num) { int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); @@ -833,33 +1080,51 @@ void HeterComm::split_input_to_shard( int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); - heter_comm_kernel_->calc_shard_index(d_keys, len, d_shard_index_tmp_ptr, - total_device, stream); + heter_comm_kernel_->calc_shard_index( + d_keys, len, d_shard_index_tmp_ptr, total_device, stream); size_t temp_storage_bytes; const int num_bits = 1 + log2i(total_device); - heter_comm_kernel_->sort_pairs( - NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, - d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); + heter_comm_kernel_->sort_pairs(NULL, + temp_storage_bytes, + d_shard_index_tmp_ptr, + d_shard_index_ptr, + d_idx_tmp_ptr, + d_idx_ptr, + len, + 0, + num_bits, + stream); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( - d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, - d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); - - heter_comm_kernel_->calc_shard_offset(d_shard_index_ptr, left, right, len, - total_device, stream); + heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_shard_index_tmp_ptr, + d_shard_index_ptr, + d_idx_tmp_ptr, + d_idx_ptr, + len, + 0, + num_bits, + stream); + + heter_comm_kernel_->calc_shard_offset( + d_shard_index_ptr, left, right, len, total_device, stream); sync_stream(stream); } -template void HeterComm::merge_keys( - int gpu_num, const KeyType* d_keys, size_t len, // input - KeyType* d_sorted_keys, // output - KeyType* d_merged_keys, // output - uint32_t* d_restore_idx, // output - size_t& uniq_len) { // output + int gpu_num, + const KeyType* d_keys, + size_t len, // input + 
KeyType* d_sorted_keys, // output + KeyType* d_merged_keys, // output + uint32_t* d_restore_idx, // output + size_t& uniq_len) { // output int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -880,50 +1145,96 @@ void HeterComm::merge_keys( heter_comm_kernel_->fill_idx(d_idx, len, stream); size_t temp_storage_bytes; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_sorted_keys, d_idx, d_index, len, 0, - 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_sorted_keys, d_idx, - d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_sorted_keys, d_merged_keys, - d_fea_num_info_ptr, d_merged_size, len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(NULL, + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_temp_storage->ptr(), temp_storage_bytes, d_sorted_keys, d_merged_keys, - d_fea_num_info_ptr, d_merged_size, len, stream)); - cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(d_temp_storage->ptr(), + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); + cudaMemcpyAsync((void*)&uniq_len, + d_merged_size, + sizeof(int), + cudaMemcpyDeviceToHost, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, - uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - heter_comm_kernel_->fill_restore_idx(true, len, uniq_len, d_merged_keys, - d_index, d_offset, d_fea_num_info_ptr, - d_restore_idx, stream); + heter_comm_kernel_->fill_restore_idx(true, + len, + uniq_len, + d_merged_keys, + d_index, + d_offset, + d_fea_num_info_ptr, + 
d_restore_idx, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template void HeterComm::pull_merge_sparse( int num, KeyType* d_keys, float* d_vals, size_t len) { @@ -951,14 +1262,18 @@ void HeterComm::pull_merge_sparse( auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif @@ -978,25 +1293,38 @@ void HeterComm::pull_merge_sparse( auto d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); size_t uniq_len = 0; - merge_keys(num, d_keys, len, d_sorted_keys_ptr, d_merged_keys_ptr, - d_restore_idx_ptr, uniq_len); + merge_keys(num, + d_keys, + len, + d_sorted_keys_ptr, + d_merged_keys_ptr, + d_restore_idx_ptr, + uniq_len); sync_stream(stream); auto d_idx = memory::Alloc(place, uniq_len * sizeof(int)); auto d_idx_ptr = reinterpret_cast(d_idx->ptr()); - split_input_to_shard(d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, - d_right_ptr, num); - heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, d_merged_keys_ptr, - d_idx_ptr, uniq_len, stream); + split_input_to_shard( + d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, num); + heter_comm_kernel_->fill_shard_key( + d_shard_keys_ptr, d_merged_keys_ptr, d_idx_ptr, uniq_len, stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1004,8 +1332,8 @@ void HeterComm::pull_merge_sparse( if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * val_type_size); + create_storage( + num, i, shard_len * sizeof(KeyType), shard_len * val_type_size); } walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); } @@ -1022,14 +1350,16 @@ void HeterComm::pull_merge_sparse( ptr_tables_[i]->rwlock_->RDLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->get(reinterpret_cast(node.key_storage), - node.val_storage, h_right[i] - h_left[i] + 1, + node.val_storage, + h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num), feature_value_accessor_); } else { ptr_tables_[i]->get( d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_vals_ptr) + h_left[i] * val_type_size, - h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), feature_value_accessor_); } } @@ -1043,8 +1373,12 @@ void HeterComm::pull_merge_sparse( } if (!FLAGS_gpugraph_enable_gpu_direct_access) { - walk_to_src(num, total_device, h_left, h_right, - 
reinterpret_cast(d_shard_vals_ptr), val_type_size); + walk_to_src(num, + total_device, + h_left, + h_right, + reinterpret_cast(d_shard_vals_ptr), + val_type_size); for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); @@ -1053,14 +1387,21 @@ void HeterComm::pull_merge_sparse( auto d_merged_vals = memory::Alloc(place, uniq_len * val_type_size); auto d_merged_vals_ptr = reinterpret_cast(d_merged_vals->ptr()); - heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_merged_vals_ptr, - d_idx_ptr, uniq_len, val_type_size, + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, + d_merged_vals_ptr, + d_idx_ptr, + uniq_len, + val_type_size, stream); sync_stream(stream); - heter_comm_kernel_->unpack_merged_vals(len, d_keys, d_merged_vals_ptr, - d_restore_idx_ptr, d_vals, - val_type_size, stream); + heter_comm_kernel_->unpack_merged_vals(len, + d_keys, + d_merged_vals_ptr, + d_restore_idx_ptr, + d_vals, + val_type_size, + stream); sync_stream(stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { @@ -1072,7 +1413,9 @@ void HeterComm::pull_merge_sparse( } } } -template void HeterComm::pull_normal_sparse( int num, KeyType* d_keys, float* d_vals, size_t len) { @@ -1100,14 +1443,18 @@ void HeterComm::pull_normal_sparse( auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif @@ -1125,18 +1472,26 @@ void HeterComm::pull_normal_sparse( split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); - heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, d_keys, d_idx_ptr, len, - stream); + heter_comm_kernel_->fill_shard_key( + d_shard_keys_ptr, d_keys, d_idx_ptr, len, stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1144,8 +1499,8 @@ void HeterComm::pull_normal_sparse( if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * val_type_size); + create_storage( + num, i, shard_len * sizeof(KeyType), shard_len * val_type_size); } walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); } @@ -1161,14 +1516,16 @@ void HeterComm::pull_normal_sparse( ptr_tables_[i]->rwlock_->RDLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->get(reinterpret_cast(node.key_storage), - node.val_storage, h_right[i] - h_left[i] + 1, + node.val_storage, + h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num), feature_value_accessor_); 
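      // The branch above and the `else` branch below differ only in where the
      // remote table writes its results. Without
      // FLAGS_gpugraph_enable_gpu_direct_access the keys were staged into
      // node.key_storage by walk_to_dest(), get() fills node.val_storage on
      // the owning device, and walk_to_src() later copies the values back
      // into d_shard_vals_ptr. With direct access enabled the remote stream
      // reads d_shard_keys_ptr and writes d_shard_vals_ptr in place, so both
      // staging copies are skipped.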
} else { ptr_tables_[i]->get( d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_vals_ptr) + h_left[i] * val_type_size, - h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), feature_value_accessor_); } } @@ -1181,16 +1538,20 @@ void HeterComm::pull_normal_sparse( ptr_tables_[i]->rwlock_->UNLock(); } if (!FLAGS_gpugraph_enable_gpu_direct_access) { - walk_to_src(num, total_device, h_left, h_right, - reinterpret_cast(d_shard_vals_ptr), val_type_size); + walk_to_src(num, + total_device, + h_left, + h_right, + reinterpret_cast(d_shard_vals_ptr), + val_type_size); for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); } } - heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, - val_type_size, stream); + heter_comm_kernel_->dy_mf_fill_dvals( + d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream); sync_stream(stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1202,12 +1563,12 @@ void HeterComm::pull_normal_sparse( } } -template -void HeterComm::pull_sparse(int num, - KeyType* d_keys, - float* d_vals, - size_t len) { +void HeterComm::pull_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { if (len == 0) { return; } @@ -1219,11 +1580,16 @@ void HeterComm::pull_sparse(int num, } #if defined(PADDLE_WITH_CUDA) -template template void HeterComm::push_sparse( - int dev_num, KeyType* d_keys, float* d_grads, size_t len, + int dev_num, + KeyType* d_keys, + float* d_grads, + size_t len, Sgd& sgd) { // NOLINT if (len == 0) { return; @@ -1257,14 +1623,18 @@ void HeterComm::push_sparse( auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif @@ -1285,35 +1655,49 @@ void HeterComm::push_sparse( // do two gradient merge // 1st. do segmented gradient merge // 2nd. 
do global gradient merge - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, - true); + dynamic_merge_grad( + dev_num, d_keys, d_grads, len, uniq_len, segment_len, true); len = segment_len; uniq_len = 0; segment_len = 0; - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, - false); + dynamic_merge_grad( + dev_num, d_keys, d_grads, len, uniq_len, segment_len, false); } else { // Perform gradient merge only once - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, - false); + dynamic_merge_grad( + dev_num, d_keys, d_grads, len, uniq_len, segment_len, false); } } - split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - dev_num); + split_input_to_shard( + d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->dy_mf_fill_shard_grads( - d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, uniq_len, - grad_value_size, stream, feature_value_accessor_); + heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr, + d_keys, + d_shard_grads_ptr, + d_grads, + d_idx_ptr, + uniq_len, + grad_value_size, + stream, + feature_value_accessor_); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1321,12 +1705,17 @@ void HeterComm::push_sparse( if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * grad_value_size); + create_storage( + dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size); } - walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, - reinterpret_cast(d_shard_grads_ptr), grad_value_size); + walk_to_dest(dev_num, + total_device, + h_left, + h_right, + d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), + grad_value_size); } for (int i = 0; i < total_device; ++i) { @@ -1342,13 +1731,16 @@ void HeterComm::push_sparse( ptr_tables_[i]->rwlock_->WRLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->update(reinterpret_cast(node.key_storage), - node.val_storage, h_right[i] - h_left[i] + 1, sgd, + node.val_storage, + h_right[i] - h_left[i] + 1, + sgd, resource_->remote_stream(i, dev_num)); } else { ptr_tables_[i]->update(d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_grads_ptr) + grad_value_size * h_left[i], - h_right[i] - h_left[i] + 1, sgd, + h_right[i] - h_left[i] + 1, + sgd, resource_->remote_stream(i, dev_num)); } } @@ -1375,7 +1767,9 @@ void HeterComm::push_sparse( } #elif defined(PADDLE_WITH_XPU_KP) -template void HeterComm::push_sparse( int dev_num, KeyType* d_keys, GradType* d_grads, size_t len) { @@ -1408,14 +1802,18 @@ void HeterComm::push_sparse( auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = 
xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif @@ -1431,32 +1829,48 @@ void HeterComm::push_sparse( int uniq_len = len; merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - dev_num); + split_input_to_shard( + d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, - d_shard_grads_ptr, d_grads, d_idx_ptr, - (long long)uniq_len, stream); + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, + d_keys, + d_shard_grads_ptr, + d_grads, + d_idx_ptr, + (long long)uniq_len, + stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * sizeof(GradType)); + create_storage( + dev_num, i, shard_len * sizeof(KeyType), shard_len * sizeof(GradType)); } - walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + walk_to_dest(dev_num, + total_device, + h_left, + h_right, + d_shard_keys_ptr, d_shard_grads_ptr); for (int i = 0; i < total_device; ++i) { @@ -1492,11 +1906,16 @@ void HeterComm::push_sparse( #endif #if defined(PADDLE_WITH_CUDA) -template template void HeterComm::update_one_table( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int gpu_num, + KeyType* d_keys, + GradType* d_grads, + size_t len, Sgd& sgd) { // NOLINT if (len == 0) { return; @@ -1505,17 +1924,22 @@ void HeterComm::update_one_table( int dev_id = resource_->dev_id(gpu_num); platform::CUDADeviceGuard guard(dev_id); tables_[gpu_num]->rwlock_->WRLock(); - tables_[gpu_num]->update(d_keys, d_grads, len, sgd, - resource_->remote_stream(gpu_num, gpu_num)); + tables_[gpu_num]->update( + d_keys, d_grads, len, sgd, resource_->remote_stream(gpu_num, gpu_num)); tables_[gpu_num]->rwlock_->UNLock(); cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); } -template template void HeterComm::push_sparse_multi_node( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int gpu_num, + KeyType* d_keys, + GradType* d_grads, + size_t len, Sgd& sgd) { // NOLINT if (len == 0) { return; @@ -1526,14 +1950,21 @@ void HeterComm::push_sparse_multi_node( uniq_len = gather_one_node_grad(gpu_num, d_keys, d_grads, uniq_len); - uniq_len = gather_multi_node_grad(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len); + uniq_len = gather_multi_node_grad(gpu_num, + storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, + uniq_len); - update_one_table(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len, sgd); + update_one_table(gpu_num, + storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, + uniq_len, + 
sgd); } -template int HeterComm::gather_one_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { @@ -1552,19 +1983,24 @@ int HeterComm::gather_one_node_grad( int* d_node_len = reinterpret_cast(d_node_len_mem->ptr()); h_node_len[gpu_num] = len; - cudaMemcpy(d_node_len + gpu_num, h_node_len + gpu_num, sizeof(int), + cudaMemcpy(d_node_len + gpu_num, + h_node_len + gpu_num, + sizeof(int), cudaMemcpyHostToDevice); // allgather grad len PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - (const void*)(d_node_len + gpu_num), (void*)d_node_len, 1, // NOLINT - ncclInt, // NOLINT - nccl_inner_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather((const void*)(d_node_len + gpu_num), + (void*)d_node_len, + 1, // NOLINT + ncclInt, // NOLINT + nccl_inner_comm, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - cudaMemcpy(h_node_len, d_node_len, sizeof(int) * total_gpu, - cudaMemcpyDeviceToHost); + cudaMemcpy( + h_node_len, d_node_len, sizeof(int) * total_gpu, cudaMemcpyDeviceToHost); for (int i = 0; i < total_gpu; ++i) { if (h_node_len[i] > max_size) { @@ -1578,9 +2014,13 @@ int HeterComm::gather_one_node_grad( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inner_comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, - nccl_inner_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather(d_grads, + storage.all_grads, + max_size * sizeof(GradType), + ncclUint8, + nccl_inner_comm, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); @@ -1600,18 +2040,24 @@ int HeterComm::gather_one_node_grad( cudaMemset(d_left_ptr, -1, total_gpu * sizeof(int)); cudaMemset(d_right_ptr, -1, total_gpu * sizeof(int)); - split_input_to_shard(storage.all_keys + index, d_idx_ptr, h_node_len[i], - d_left_ptr, d_right_ptr, gpu_num); - cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - - heter_comm_kernel_->fill_shard_grads( - storage.local_keys + merge_num, storage.all_keys + index, - storage.local_grads + merge_num, storage.all_grads + index, - d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1, - stream); + split_input_to_shard(storage.all_keys + index, + d_idx_ptr, + h_node_len[i], + d_left_ptr, + d_right_ptr, + gpu_num); + cudaMemcpy( + h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy( + h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + + heter_comm_kernel_->fill_shard_grads(storage.local_keys + merge_num, + storage.all_keys + index, + storage.local_grads + merge_num, + storage.all_grads + index, + d_idx_ptr + h_left[gpu_num], + h_right[gpu_num] - h_left[gpu_num] + 1, + stream); merge_num = merge_num + h_right[gpu_num] - h_left[gpu_num] + 1; } @@ -1620,7 +2066,9 @@ int HeterComm::gather_one_node_grad( return ret; } -template int HeterComm::gather_multi_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { @@ -1645,8 +2093,8 @@ int HeterComm::gather_multi_node_grad( d_node_len, d_node_len, 1, ncclInt, nccl_inter_comm, stream)); 
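  // ncclAllGather requires every rank to contribute the same element count,
  // so the per-rank lengths are exchanged first. Each rank then pads its
  // keys/grads to the global max_size for the following allgathers, and only
  // the first h_node_len[i] entries of each received segment are consumed
  // when the results are merged. Conceptually (illustrative sketch only):
  //   max_size = max(len_0, ..., len_{n-1});
  //   allgather(keys,  max_size)                    -> storage.all_keys
  //   allgather(grads, max_size * sizeof(GradType)) -> storage.all_grads
  //   segment i occupies [i * max_size, i * max_size + h_node_len[i])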
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - cudaMemcpy(h_node_len, d_node_len, sizeof(int) * node_size_, - cudaMemcpyDeviceToHost); + cudaMemcpy( + h_node_len, d_node_len, sizeof(int) * node_size_, cudaMemcpyDeviceToHost); for (int i = 0; i < node_size_; ++i) { if (h_node_len[i] > max_size) { @@ -1660,19 +2108,29 @@ int HeterComm::gather_multi_node_grad( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inter_comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, - nccl_inter_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather(d_grads, + storage.all_grads, + max_size * sizeof(GradType), + ncclUint8, + nccl_inter_comm, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); int merge_num = 0; for (int i = 0; i < node_size_; ++i) { int index = i * max_size; - cudaMemcpyAsync(storage.local_keys + merge_num, storage.all_keys + index, - h_node_len[i], cudaMemcpyDefault, stream); - cudaMemcpyAsync(storage.local_grads + merge_num, storage.all_grads + index, - h_node_len[i], cudaMemcpyDefault, stream); + cudaMemcpyAsync(storage.local_keys + merge_num, + storage.all_keys + index, + h_node_len[i], + cudaMemcpyDefault, + stream); + cudaMemcpyAsync(storage.local_grads + merge_num, + storage.all_grads + index, + h_node_len[i], + cudaMemcpyDefault, + stream); merge_num += h_node_len[i]; } @@ -1682,7 +2140,9 @@ int HeterComm::gather_multi_node_grad( } #endif -template void HeterComm::end_pass() { int total_device = resource_->total_device(); @@ -1706,14 +2166,21 @@ void HeterComm::end_pass() { } #if defined(PADDLE_WITH_CUDA) -template int HeterComm::dedup_keys_and_fillidx( - const int gpu_id, const int total_fea_num, + const int gpu_id, + const int total_fea_num, const KeyType* d_keys, // input KeyType* d_merged_keys, // output - KeyType* d_sorted_keys, uint32_t* d_restore_idx, uint32_t* d_sorted_idx, - uint32_t* d_offset, uint32_t* d_merged_cnts, bool filter_zero) { + KeyType* d_sorted_keys, + uint32_t* d_restore_idx, + uint32_t* d_sorted_idx, + uint32_t* d_offset, + uint32_t* d_merged_cnts, + bool filter_zero) { int dev_id = resource_->dev_id(gpu_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -1731,29 +2198,61 @@ int HeterComm::dedup_keys_and_fillidx( void* d_buf = NULL; size_t temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_sorted_keys, d_index_in, d_sorted_idx, - total_fea_num, 0, 8 * sizeof(KeyType), stream, false)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_index_in, + d_sorted_idx, + total_fea_num, + 0, + 8 * sizeof(KeyType), + stream, + false)); auto d_cache_ptr = memory::Alloc(place, temp_storage_bytes); d_buf = reinterpret_cast(d_cache_ptr->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_buf, temp_storage_bytes, d_keys, d_sorted_keys, d_index_in, - d_sorted_idx, total_fea_num, 0, 8 * sizeof(KeyType), stream, false)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(d_buf, + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_index_in, + d_sorted_idx, + total_fea_num, + 0, + 8 * sizeof(KeyType), + stream, 
+ false)); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_sorted_keys, d_merged_keys, d_merged_cnts, - d_merged_size, total_fea_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(NULL, + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_merged_cnts, + d_merged_size, + total_fea_num, + stream)); if (d_cache_ptr->size() < temp_storage_bytes) { d_cache_ptr = NULL; d_cache_ptr = memory::Alloc(place, temp_storage_bytes); } d_buf = reinterpret_cast(d_cache_ptr->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_buf, temp_storage_bytes, d_sorted_keys, d_merged_keys, d_merged_cnts, - d_merged_size, total_fea_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(d_buf, + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_merged_cnts, + d_merged_size, + total_fea_num, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync((void*)&merged_size, - (void*)d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream)); + (void*)d_merged_size, + sizeof(int), + cudaMemcpyDeviceToHost, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( @@ -1770,9 +2269,15 @@ int HeterComm::dedup_keys_and_fillidx( cudaMemsetAsync(d_restore_idx, 0, total_fea_num * sizeof(uint32_t), stream); } // fill restore idx [1,3,5,2,4,6] = [1,2,1,3,2,1] - heter_comm_kernel_->fill_restore_idx(filter_zero, total_fea_num, merged_size, - d_merged_keys, d_sorted_idx, d_offset, - d_merged_cnts, d_restore_idx, stream); + heter_comm_kernel_->fill_restore_idx(filter_zero, + total_fea_num, + merged_size, + d_merged_keys, + d_sorted_idx, + d_offset, + d_merged_cnts, + d_restore_idx, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 616bd63618a576..cb02773bc034f6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -95,11 +95,6 @@ class HeterCommKernel { HeterCommKernel() {} explicit HeterCommKernel(const int block_size) : block_size_(block_size) {} - // explicit HeterCommKernel(const int block_size, - // CommonFeatureValueAccessor& feature_value_accessor) - // : block_size_(block_size), - // feature_value_accessor_(feature_value_accessor) {} - template void fill_idx(T* idx, long long len, const StreamType& stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc index 855842644abf72..86e339ca74a307 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" #include +#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" #ifdef PADDLE_WITH_HETERPS @@ -21,9 +21,11 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource, + size_t capacity, + std::shared_ptr resource, std::unordered_map fleet_config, - std::string accessor_type, int optimizer_type) { + std::string accessor_type, + int optimizer_type) { if (accessor_type == "CtrDymfAccessor" && (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { return new HeterPs( @@ -37,9 +39,11 @@ HeterPsBase* HeterPsBase::get_instance( } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource, +HeterPs::HeterPs(size_t capacity, + std::shared_ptr resource, std::unordered_map fleet_config, - std::string accessor_type, int optimizer_type) { + std::string accessor_type, + int optimizer_type) { comm_ = std::make_shared>( capacity, resource); optimizer_type_ = optimizer_type; @@ -47,7 +51,9 @@ HeterPs::HeterPs(size_t capacity, std::shared_ptr resource, HeterPs::~HeterPs() {} -void HeterPs::pull_sparse(int num, FeatureKey* d_keys, float* d_vals, +void HeterPs::pull_sparse(int num, + FeatureKey* d_keys, + float* d_vals, size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } @@ -68,7 +74,9 @@ void HeterPs::end_pass() { comm_->end_pass(); } void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } -void HeterPs::push_sparse(int num, FeatureKey* d_keys, float* d_grads, +void HeterPs::push_sparse(int num, + FeatureKey* d_keys, + float* d_grads, size_t len) { comm_->push_sparse(num, d_keys, d_grads, len); // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 15de47a62283af..4d64375f936566 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -45,10 +45,12 @@ namespace paddle { namespace framework { #ifdef PADDLE_WITH_PSLIB -void AfsWrapper::init(const std::string& fs_name, const std::string& fs_user, - const std::string& pass_wd, const std::string& conf) { - int ret = afs_handler_.init(fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), - conf.c_str()); +void AfsWrapper::init(const std::string& fs_name, + const std::string& fs_user, + const std::string& pass_wd, + const std::string& conf) { + int ret = afs_handler_.init( + fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), conf.c_str()); if (ret != 0) { LOG(ERROR) << "AFS Init Error"; } @@ -100,8 +102,8 @@ void PSGPUWrapper::InitAfsApi(const std::string& fs_name, const std::string& fs_user, const std::string& pass_wd, const std::string& conf) { - int ret = afs_handler_.init(fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), - conf.c_str()); + int ret = afs_handler_.init( + fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), conf.c_str()); if (ret != 0) { VLOG(0) << "AFS Init Error"; } @@ -148,16 +150,20 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "total len: " << total_len; auto gen_dynamic_mf_func = [this]( const std::deque& total_data, - int begin_index, int end_index, int i) { + int begin_index, + int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { + iter != total_data.begin() + end_index; + iter++) { const auto& ins = *iter; const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; 
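        // Each worker thread walks its own slice of the input records and
        // routes every non-zero feasign to shard
        // `feasign % thread_keys_shard_num_` and to the dim bucket given by
        // slot_index_vec_[slot_idx], collecting them in
        // thread_dim_keys_[i][shard][dim] so no locking is needed; the
        // per-thread sets are merged per shard afterwards. A minimal
        // host-side sketch of the routing rule (names illustrative only):
        //   std::vector<std::unordered_set<uint64_t>> shards(shard_num);
        //   for (uint64_t feasign : feasigns) {
        //     if (feasign != 0) shards[feasign % shard_num].insert(feasign);
        //   }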
         const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets;
         for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); slot_idx++) {
           for (size_t j = slot_offset[slot_offset_vector_[slot_idx]];
-               j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) {
+               j < slot_offset[slot_offset_vector_[slot_idx] + 1];
+               j++) {
             int shard_id = feasign_v[j] % thread_keys_shard_num_;
             int dim_id = slot_index_vec_[slot_idx];
             if (feasign_v[j] != 0) {
@@ -170,8 +176,11 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) {
     };
     for (int i = 0; i < thread_keys_thread_num_; i++) {
       threads.push_back(
-          std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin,
-                      begin + len_per_thread + (i < remain ? 1 : 0), i));
+          std::thread(gen_dynamic_mf_func,
+                      std::ref(vec_data),
+                      begin,
+                      begin + len_per_thread + (i < remain ? 1 : 0),
+                      i));
       begin += len_per_thread + (i < remain ? 1 : 0);
     }
@@ -192,9 +201,12 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) {
     len_per_thread = total_len / thread_keys_thread_num_;
     remain = total_len % thread_keys_thread_num_;
     auto gen_func = [this](const std::deque& total_data,
-                           int begin_index, int end_index, int i) {
+                           int begin_index,
+                           int end_index,
+                           int i) {
       for (auto iter = total_data.begin() + begin_index;
-           iter != total_data.begin() + end_index; iter++) {
+           iter != total_data.begin() + end_index;
+           iter++) {
         const auto& ins = *iter;
         const auto& feasign_v = ins.uint64_feasigns_;
         for (const auto feasign : feasign_v) {
@@ -206,8 +218,11 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) {
     };
     for (int i = 0; i < thread_keys_thread_num_; i++) {
       threads.push_back(
-          std::thread(gen_func, std::ref(vec_data), begin,
-                      begin + len_per_thread + (i < remain ? 1 : 0), i));
+          std::thread(gen_func,
+                      std::ref(vec_data),
+                      begin,
+                      begin + len_per_thread + (i < remain ? 1 : 0),
+                      i));
       begin += len_per_thread + (i < remain ? 1 : 0);
     }
     for (std::thread& t : threads) {
@@ -227,19 +242,25 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) {
     VLOG(0) << "GpuGraphTotalKeys: " << total_len;
     remain = total_len % thread_keys_thread_num_;
     auto gen_graph_data_func = [this](const std::vector& total_data,
-                                      int begin_index, int end_index, int i) {
+                                      int begin_index,
+                                      int end_index,
+                                      int i) {
       for (auto iter = total_data.begin() + begin_index;
-           iter != total_data.begin() + end_index; iter++) {
+           iter != total_data.begin() + end_index;
+           iter++) {
         uint64_t cur_key = *iter;
        int shard_id = cur_key % thread_keys_shard_num_;
        this->thread_keys_[i][shard_id].insert(cur_key);
      }
    };
    auto gen_graph_dynamic_mf_func =
-       [this](const std::vector& total_data, int begin_index,
-              int end_index, int i) {
+       [this](const std::vector& total_data,
+              int begin_index,
+              int end_index,
+              int i) {
          for (auto iter = total_data.begin() + begin_index;
-              iter != total_data.begin() + end_index; iter++) {
+              iter != total_data.begin() + end_index;
+              iter++) {
            uint64_t cur_key = *iter;
            int shard_id = cur_key % thread_keys_shard_num_;
            // TODO: feasign <-> slot <-> multi_dim
@@ -250,13 +271,19 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) {
     if (!multi_mf_dim_) {
       VLOG(1) << "psgpu graph wrapper genfunc";
       threads.push_back(
-          std::thread(gen_graph_data_func, std::ref(vec_data), begin,
-                      begin + len_per_thread + (i < remain ? 1 : 0), i));
+          std::thread(gen_graph_data_func,
+                      std::ref(vec_data),
+                      begin,
+                      begin + len_per_thread + (i < remain ? 1 : 0),
+                      i));
     } else {
       VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf";
       threads.push_back(
-          std::thread(gen_graph_dynamic_mf_func, std::ref(vec_data), begin,
-                      begin + len_per_thread + (i < remain ? 1 : 0), i));
+          std::thread(gen_graph_dynamic_mf_func,
+                      std::ref(vec_data),
+                      begin,
+                      begin + len_per_thread + (i < remain ? 1 : 0),
+                      i));
     }
     begin += len_per_thread + (i < remain ? 1 : 0);
   }
@@ -271,8 +298,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) {
   // merge thread_keys to shard_keys
   auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) {
     for (int i = 0; i < thread_keys_thread_num_; ++i) {
-      gpu_task->batch_add_keys(shard_num, dim_id,
-                               thread_dim_keys_[i][shard_num][dim_id]);
+      gpu_task->batch_add_keys(
+          shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]);
       thread_dim_keys_[i][shard_num][dim_id].clear();
     }
   };
@@ -349,82 +376,87 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) {
   timeline.Start();
 
-  auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr,
-                              &fleet_ptr](int i, int j) {
-    size_t key_size = local_dim_keys[i][j].size();
-    int32_t status = -1;
-    int32_t cnt = 0;
+  auto ptl_dynamic_mf_func =
+      [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr](int i, int j) {
+        size_t key_size = local_dim_keys[i][j].size();
+        int32_t status = -1;
+        int32_t cnt = 0;
 #ifdef PADDLE_WITH_PSLIB
-    while (true) {
-      auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr(
-          i, reinterpret_cast(local_dim_ptr[i][j].data()),
-          this->table_id_, local_dim_keys[i][j].data(), key_size);
-      bool flag = true;
-
-      tt.wait();
-
-      try {
-        status = tt.get();
-      } catch (const std::future_error& e) {
-        VLOG(0) << "Caught a future_error with code" << e.code()
-                << ", Message:" << e.what();
-      }
-      if (status != 0) {
-        VLOG(0) << "fleet pull sparse failed, status[" << status << "]";
-        sleep(sleep_seconds_before_fail_exit_);
-        flag = false;
-        cnt++;
-      }
-      if (cnt > 3) {
-        VLOG(0) << "fleet pull sparse failed, retry 3 times";
-        exit(-1);
-      }
+        while (true) {
+          auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr(
+              i,
+              reinterpret_cast(local_dim_ptr[i][j].data()),
+              this->table_id_,
+              local_dim_keys[i][j].data(),
+              key_size);
+          bool flag = true;
+
+          tt.wait();
+
+          try {
+            status = tt.get();
+          } catch (const std::future_error& e) {
+            VLOG(0) << "Caught a future_error with code" << e.code()
+                    << ", Message:" << e.what();
+          }
+          if (status != 0) {
+            VLOG(0) << "fleet pull sparse failed, status[" << status << "]";
+            sleep(sleep_seconds_before_fail_exit_);
+            flag = false;
+            cnt++;
+          }
+          if (cnt > 3) {
+            VLOG(0) << "fleet pull sparse failed, retry 3 times";
+            exit(-1);
+          }
-      if (flag) {
-        break;
-      }
-    }
+          if (flag) {
+            break;
+          }
+        }
 #endif
 #ifdef PADDLE_WITH_PSCORE
-    while (true) {
-      auto tt = fleet_ptr->worker_ptr_->PullSparsePtr(
-          reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_,
-          local_dim_keys[i][j].data(), key_size);
-      bool flag = true;
-
-      tt.wait();
-
-      try {
-        status = tt.get();
-      } catch (const std::future_error& e) {
-        VLOG(0) << "Caught a future_error with code" << e.code()
-                << ", Message:" << e.what();
-      }
-      if (status != 0) {
-        VLOG(0) << "fleet pull sparse failed, status[" << status << "]";
-        sleep(sleep_seconds_before_fail_exit_);
-        flag = false;
-        cnt++;
-      }
-      if (cnt > 3) {
-        VLOG(0) << "fleet pull sparse failed, retry 3 times";
-        exit(-1);
-      }
+        while (true) {
+          auto tt = fleet_ptr->worker_ptr_->PullSparsePtr(
+              reinterpret_cast(local_dim_ptr[i][j].data()),
+              this->table_id_,
+              local_dim_keys[i][j].data(),
+              key_size);
+          bool flag = true;
+
+          tt.wait();
+
+          try {
+            status = tt.get();
+          } catch (const std::future_error& e) {
+            VLOG(0) << "Caught a future_error with code" << e.code()
+                    << ", Message:" << e.what();
+          }
+          if (status != 0) {
+            VLOG(0) << "fleet pull sparse failed, status[" << status << "]";
+            sleep(sleep_seconds_before_fail_exit_);
+            flag = false;
+            cnt++;
+          }
+          if (cnt > 3) {
+            VLOG(0) << "fleet pull sparse failed, retry 3 times";
+            exit(-1);
+          }
-      if (flag) {
-        break;
-      }
-    }
+          if (flag) {
+            break;
+          }
+        }
 #endif
-    if (status != 0) {
-      LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
-      sleep(300);
-      exit(-1);
-    } else {
-      VLOG(0) << "FleetWrapper Pull sparse to local done with table size: "
-              << local_dim_keys[i][j].size();
-    }
-  };
+        if (status != 0) {
+          LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
+          sleep(300);
+          exit(-1);
+        } else {
+          VLOG(0) << "FleetWrapper Pull sparse to local done with table size: "
+                  << local_dim_keys[i][j].size();
+        }
+      };
 
   threads.resize(thread_keys_shard_num_ * multi_mf_dim_);
   for (int i = 0; i < thread_keys_shard_num_; i++) {
@@ -455,8 +487,11 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) {
   bool record_status = false;
   auto& device_task_keys = gpu_task->device_task_keys_;
   auto& device_task_ptrs = gpu_task->device_task_ptr_;
-  auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys,
-                                     &local_dim_ptr, &device_dim_keys,
+  auto build_pull_dynamic_mf_func = [this,
+                                     device_num,
+                                     &local_dim_keys,
+                                     &local_dim_ptr,
+                                     &device_dim_keys,
                                      &device_dim_ptr,
                                      &device_dim_mutex](int i, int j) {
     std::vector> task_keys(device_num);
@@ -488,8 +523,13 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) {
       device_dim_mutex[dev][j]->unlock();
     }
   };
-  auto build_func = [device_num, record_status, &pass_values, &local_keys,
-                     &local_ptr, &device_task_keys, &device_task_ptrs](int i) {
+  auto build_func = [device_num,
+                     record_status,
+                     &pass_values,
+                     &local_keys,
+                     &local_ptr,
+                     &device_task_keys,
+                     &device_task_ptrs](int i) {
     auto& task_keys = device_task_keys[i];
 #ifdef PADDLE_WITH_PSLIB
     auto& task_ptrs = device_task_ptrs[i];
@@ -586,8 +626,8 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) {
   std::vector threads(device_num);
   auto accessor_wrapper_ptr =
       GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
-  HeterPs_ = HeterPsBase::get_instance(size_max, resource_, fleet_config_,
-                                       accessor_class_, optimizer_type_);
+  HeterPs_ = HeterPsBase::get_instance(
+      size_max, resource_, fleet_config_, accessor_class_, optimizer_type_);
 #ifdef PADDLE_WITH_CUDA
   HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_);
   HeterPs_->set_sparse_sgd(optimizer_config_);
@@ -656,8 +696,8 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) {
         //  << " cpuptr: " << (uint64_t)(device_dim_ptrs[k])
         //  << " |: " << cpu_table_accessor_->ParseToString(ptr_val,
        //  dim);
-        accessor_wrapper_ptr->BuildFill(val, device_dim_ptrs[k],
-                                        cpu_table_accessor_, mf_dim);
+        accessor_wrapper_ptr->BuildFill(
+            val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim);
         VLOG(5) << "build " << k << " : "
                 << accessor_wrapper_ptr->ParseToString(
                        (float*)(val),
@@ -671,8 +711,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) {
     this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool);
     auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j];
-    this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len,
-                             feature_value_size, 500000, 2);
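// Illustrative aside (not part of the patch): the retry pattern that
// ptl_dynamic_mf_func wraps around pull_sparse_ptr / PullSparsePtr above,
// reduced to a self-contained sketch. `fake_pull` is a purely hypothetical
// stand-in for the fleet RPC; the real code waits on a future, backs off on
// failure, and gives up after three retries.
#include <chrono>
#include <cstdio>
#include <future>
#include <thread>

static std::future<int> fake_pull() {  // hypothetical stand-in for the RPC
  return std::async(std::launch::async, [] { return 0; });  // 0 == success
}

int main() {
  int status = -1;
  int cnt = 0;
  while (true) {
    auto tt = fake_pull();
    bool flag = true;
    tt.wait();
    try {
      status = tt.get();
    } catch (const std::future_error& e) {
      std::printf("future_error: %s\n", e.what());
    }
    if (status != 0) {  // failed: back off and count the retry
      std::this_thread::sleep_for(std::chrono::seconds(1));
      flag = false;
      cnt++;
    }
    if (cnt > 3) {  // too many failures: abort, as the patch does
      std::printf("pull sparse failed, retry 3 times\n");
      return -1;
    }
    if (flag) break;  // success: leave the retry loop
  }
  return 0;
}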
+    this->HeterPs_->build_ps(i,
+                             device_dim_keys.data(),
+                             cur_pool->mem(),
+                             len,
+                             feature_value_size,
+                             500000,
+                             2);
     if (device_dim_keys.size() > 0) {
       VLOG(0) << "show ptr table: " << i
@@ -827,7 +872,9 @@ void PSGPUWrapper::EndPass() {
             << " feature_value_size:" << feature_value_size;
     char* test_build_values = (char*)malloc(feature_value_size * len);
-    cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len,
+    cudaMemcpy(test_build_values,
+               hbm_pool->mem(),
+               feature_value_size * len,
                cudaMemcpyDeviceToHost);
     CHECK(len == hbm_pool->capacity());
@@ -951,25 +998,40 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
     int64_t* slot_lens = dev.slot_lens.mutable_data(
         (slot_num + 1) * sizeof(int64_t), place);
-    cudaMemcpyAsync(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
-                    cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(slot_lens, slot_lengths_lod.data(),
+    cudaMemcpyAsync(gpu_keys,
+                    keys.data(),
+                    keys.size() * sizeof(uint64_t*),
+                    cudaMemcpyHostToDevice,
+                    stream);
+    cudaMemcpyAsync(slot_lens,
+                    slot_lengths_lod.data(),
                     slot_lengths_lod.size() * sizeof(int64_t),
-                    cudaMemcpyHostToDevice, stream);
+                    cudaMemcpyHostToDevice,
+                    stream);
 
-    cudaMemcpyAsync(gpu_slot_dims, slot_dim.data(),
-                    slot_dim.size() * sizeof(int), cudaMemcpyHostToDevice,
+    cudaMemcpyAsync(gpu_slot_dims,
+                    slot_dim.data(),
+                    slot_dim.size() * sizeof(int),
+                    cudaMemcpyHostToDevice,
                     stream);
     float** gpu_values = dev.values_ptr_tensor.mutable_data(
         values.size() * sizeof(float*), place);
-    cudaMemcpyAsync(gpu_values, values.data(), values.size() * sizeof(float*),
-                    cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(gpu_values,
+                    values.data(),
+                    values.size() * sizeof(float*),
+                    cudaMemcpyHostToDevice,
+                    stream);
 
     int* key2slot = dev.keys2slot.mutable_data(
         (total_length * 5) * sizeof(int), place);
-    this->CopyKeys(place, gpu_keys, total_keys, slot_lens, slot_num,
-                   static_cast(total_length), key2slot);
+    this->CopyKeys(place,
+                   gpu_keys,
+                   total_keys,
+                   slot_lens,
+                   slot_num,
+                   static_cast(total_length),
+                   key2slot);
 
     uint32_t* d_restore_idx =
         reinterpret_cast(&key2slot[total_length]);
@@ -985,20 +1047,23 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
           reinterpret_cast(&d_merged_keys[total_length]);
 
       int dedup_size = HeterPs_->dedup_keys_and_fillidx(
-          devid_2_index, static_cast(total_length),
+          devid_2_index,
+          static_cast(total_length),
           total_keys,     // input
           d_merged_keys,  // output
           d_sorted_keys,  // sort keys
          d_restore_idx,  // pull fill idx
          d_sorted_idx,   // sort old idx
          d_offset,       // offset
-          d_merged_cnts, FLAGS_gpugraph_dedup_pull_push_mode & 0x02);
+          d_merged_cnts,
+          FLAGS_gpugraph_dedup_pull_push_mode & 0x02);
       //  printf("device %d, end dedup_keys_and_fillidx total %d, "
       //         "dedup_size %d, slot num: %d, value size: %d\n",
       //         device_id, int(total_length), dedup_size, slot_num,
       //         int(feature_value_size));
 
-      PADDLE_ENFORCE_GT(dedup_size, 0,
+      PADDLE_ENFORCE_GT(dedup_size,
+                        0,
                         platform::errors::PreconditionNotMet(
                             "dedup keys need more than zero failed in BoxPS."));
       dev.dedup_key_length = dedup_size;
@@ -1007,13 +1072,21 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
       float* total_values_gpu =
           dev.pull_push_tensor.mutable_data(total_bytes, place);
       pull_gpups_timer.Start();
-      HeterPs_->pull_sparse(devid_2_index, d_merged_keys, total_values_gpu,
-                            dedup_size);
+      HeterPs_->pull_sparse(
+          devid_2_index, d_merged_keys, total_values_gpu, dedup_size);
 
       // values.size() not sure equal slot_num
-      accessor_wrapper_ptr->CopyForPull(place, total_keys, gpu_values, total_values_gpu,
-                                        slot_lens, key2slot, max_mf_dim_ + 3, total_length,
-                                        gpu_slot_dims, d_restore_idx, feature_value_size);
+      accessor_wrapper_ptr->CopyForPull(place,
+                                        total_keys,
+                                        gpu_values,
+                                        total_values_gpu,
+                                        slot_lens,
+                                        key2slot,
+                                        max_mf_dim_ + 3,
+                                        total_length,
+                                        gpu_slot_dims,
+                                        d_restore_idx,
+                                        feature_value_size);
     } else {
       size_t total_length =
           std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
@@ -1034,33 +1107,48 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
         memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
     uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr());
     int64_t* gpu_len = reinterpret_cast(buf_length->ptr());
-    cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
+    cudaMemcpy(gpu_keys,
+               keys.data(),
+               keys.size() * sizeof(uint64_t*),
+               cudaMemcpyHostToDevice);
+    cudaMemcpy(gpu_len,
+               slot_lengths_lod.data(),
+               slot_lengths.size() * sizeof(int64_t),
                cudaMemcpyHostToDevice);
-    cudaMemcpy(gpu_len, slot_lengths_lod.data(),
-               slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
 
     auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int));
     int* gpu_dim = reinterpret_cast(buf_dim->ptr());
-    cudaMemcpy(gpu_dim, slot_dim.data(), slot_dim.size() * sizeof(int),
+    cudaMemcpy(gpu_dim,
+               slot_dim.data(),
+               slot_dim.size() * sizeof(int),
                cudaMemcpyHostToDevice);
 
-    this->CopyKeys(place, gpu_keys, total_keys, gpu_len,
+    this->CopyKeys(place,
+                   gpu_keys,
+                   total_keys,
+                   gpu_len,
                    static_cast(slot_lengths.size()),
                    static_cast(total_length));
     VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index
             << " len: " << total_length;
 
     pull_gpups_timer.Start();
-    HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu,
-                          total_length);
+    HeterPs_->pull_sparse(
+        devid_2_index, total_keys, total_values_gpu, total_length);
 
     VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
            << "]";
-    accessor_wrapper_ptr->CopyForPull(
-        place, gpu_keys, values, total_values_gpu, gpu_len,
-        static_cast(slot_lengths.size()), hidden_size, total_length,
-        gpu_dim, feature_value_size);
+    accessor_wrapper_ptr->CopyForPull(place,
+                                      gpu_keys,
+                                      values,
+                                      total_values_gpu,
+                                      gpu_len,
+                                      static_cast(slot_lengths.size()),
+                                      hidden_size,
+                                      total_length,
+                                      gpu_dim,
+                                      feature_value_size);
     }
 
     pull_gpups_timer.Pause();
@@ -1092,29 +1180,41 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
         memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
     uint64_t** xpu_keys = reinterpret_cast(buf_key->ptr());
     int64_t* xpu_len = reinterpret_cast(buf_length->ptr());
-    PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, keys.data(),
+    PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys,
+                                          keys.data(),
                                           keys.size() * sizeof(uint64_t*),
                                           XPU_HOST_TO_DEVICE));
-    PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_len, slot_lengths_lod.data(),
+    PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_len,
+                                          slot_lengths_lod.data(),
                                           slot_lengths.size() * sizeof(int64_t),
                                           XPU_HOST_TO_DEVICE));
 
-    this->CopyKeys(place, xpu_keys, total_keys, xpu_len,
+    this->CopyKeys(place,
+                   xpu_keys,
+                   total_keys,
+                   xpu_len,
                    static_cast(slot_lengths.size()),
                    static_cast(total_length));
     VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index
            << " len: " << total_length;
     pull_gpups_timer.Start();
-    HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu,
+    HeterPs_->pull_sparse(devid_2_index,
+                          total_keys,
+                          total_values_gpu,
                           static_cast(total_length));
     pull_gpups_timer.Pause();
 
     VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
            << "]";
-    accessor_wrapper_ptr->CopyForPull(
-        place, xpu_keys, values, total_values_gpu, xpu_len,
-        static_cast(slot_lengths.size()), hidden_size, total_length,
-        feature_value_size);
+    accessor_wrapper_ptr->CopyForPull(place,
+                                      xpu_keys,
+                                      values,
+                                      total_values_gpu,
+                                      xpu_len,
+                                      static_cast(slot_lengths.size()),
+                                      hidden_size,
+                                      total_length,
+                                      feature_value_size);
 #endif
   } else {
     PADDLE_THROW(platform::errors::PreconditionNotMet(
@@ -1132,7 +1232,8 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
                                   const std::vector& keys,
                                   const std::vector& grad_values,
                                   const std::vector& slot_lengths,
-                                  const int hidden_size, const int batch_size) {
+                                  const int hidden_size,
+                                  const int batch_size) {
   platform::Timer all_timer;
   platform::Timer push_gpups_timer;
   all_timer.Start();
@@ -1162,17 +1263,22 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
     if (!dev.d_slot_vector.IsInitialized()) {
       int* buf_slot_vector =
           dev.d_slot_vector.mutable_data(slot_num * sizeof(int), place);
-      cudaMemcpyAsync(buf_slot_vector, slot_vector_.data(),
-                      slot_num * sizeof(int), cudaMemcpyHostToDevice, stream);
+      cudaMemcpyAsync(buf_slot_vector,
+                      slot_vector_.data(),
+                      slot_num * sizeof(int),
+                      cudaMemcpyHostToDevice,
+                      stream);
     }
 
     const int64_t* slot_lens = dev.slot_lens.data();
     const int* d_slot_vector = dev.d_slot_vector.data();
     const int* key2slot = dev.keys2slot.data();
     float** gpu_values = dev.values_ptr_tensor.data();
-    cudaMemcpyAsync(gpu_values, grad_values.data(),
+    cudaMemcpyAsync(gpu_values,
+                    grad_values.data(),
                     grad_values.size() * sizeof(float*),
-                    cudaMemcpyHostToDevice, stream);
+                    cudaMemcpyHostToDevice,
+                    stream);
 
     uint64_t* d_merged_keys = &total_keys[total_length];
@@ -1184,10 +1290,20 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
     if (total_length > dedup_size * 3) {
       const uint32_t* d_restore_idx =
           reinterpret_cast(&key2slot[total_length]);
-      accessor_wrapper_ptr->CopyForPush(place, total_keys, gpu_values, total_grad_values_gpu,
-                                        d_slot_vector, slot_lens, max_mf_dim_ + 3,
-                                        total_length, dedup_size, batch_size, slot_dims,
-                                        key2slot, d_restore_idx, grad_value_size);
+      accessor_wrapper_ptr->CopyForPush(place,
+                                        total_keys,
+                                        gpu_values,
+                                        total_grad_values_gpu,
+                                        d_slot_vector,
+                                        slot_lens,
+                                        max_mf_dim_ + 3,
+                                        total_length,
+                                        dedup_size,
+                                        batch_size,
+                                        slot_dims,
+                                        key2slot,
+                                        d_restore_idx,
+                                        grad_value_size);
     } else {
       const uint32_t* d_sorted_idx =
          reinterpret_cast(&key2slot[total_length * 2]);
@@ -1195,15 +1311,28 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
          reinterpret_cast(&d_sorted_idx[total_length]);
       const uint32_t* d_merged_cnts =
          reinterpret_cast(&d_offset[total_length]);
-      accessor_wrapper_ptr->CopyForPush(place, d_merged_keys, gpu_values,
-                                        total_grad_values_gpu, d_slot_vector, slot_lens,
-                                        max_mf_dim_ + 3, total_length, dedup_size, batch_size,
-                                        slot_dims, key2slot, d_sorted_idx, d_offset,
-                                        d_merged_cnts, grad_value_size);
+      accessor_wrapper_ptr->CopyForPush(place,
+                                        d_merged_keys,
+                                        gpu_values,
+                                        total_grad_values_gpu,
+                                        d_slot_vector,
+                                        slot_lens,
+                                        max_mf_dim_ + 3,
+                                        total_length,
+                                        dedup_size,
+                                        batch_size,
+                                        slot_dims,
+                                        key2slot,
+                                        d_sorted_idx,
+                                        d_offset,
+                                        d_merged_cnts,
+                                        grad_value_size);
     }
 
     push_gpups_timer.Start();
-    HeterPs_->push_sparse(devid_2_index, d_merged_keys, total_grad_values_gpu,
+    HeterPs_->push_sparse(devid_2_index,
+                          d_merged_keys,
+                          total_grad_values_gpu,
                           static_cast(dedup_size));
   } else {
     int64_t total_length =
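// Illustrative aside (not part of the patch): a host-side sketch of the
// dedup-then-restore idea behind dedup_keys_and_fillidx / CopyForPull above --
// pull once per unique key, remember for every original slot which unique key
// it maps to (restore index), then scatter the pulled values back. All names
// here are hypothetical; the real kernels do this on the GPU.
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<uint64_t> keys = {7, 3, 7, 11, 3, 7};  // duplicated input keys
  std::vector<uint64_t> merged;                      // unique keys to pull
  std::vector<uint32_t> restore_idx(keys.size());    // original slot -> unique slot
  std::unordered_map<uint64_t, uint32_t> pos;
  for (size_t i = 0; i < keys.size(); ++i) {
    auto it = pos.find(keys[i]);
    if (it == pos.end()) {
      it = pos.emplace(keys[i], static_cast<uint32_t>(merged.size())).first;
      merged.push_back(keys[i]);
    }
    restore_idx[i] = it->second;
  }
  // "Pull" one value per unique key (here simply the key cast to float).
  std::vector<float> pulled(merged.size());
  for (size_t i = 0; i < merged.size(); ++i) {
    pulled[i] = static_cast<float>(merged[i]);
  }
  // Scatter back to the original, duplicated layout via the restore index.
  std::vector<float> out(keys.size());
  for (size_t i = 0; i < keys.size(); ++i) {
    out[i] = pulled[restore_idx[i]];
  }
  std::printf("dedup %zu -> %zu keys\n", keys.size(), merged.size());
  return 0;
}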
@@ -1220,14 +1349,22 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
         reinterpret_cast(total_keys_tensor.data());
     VLOG(3) << "Begin copy grad tensor to gpups struct";
 
-    accessor_wrapper_ptr->CopyForPush(
-        place, grad_values, total_grad_values_gpu, slot_lengths, total_length,
-        batch_size, grad_value_size, slot_vector_, slot_mf_dim_vector_);
+    accessor_wrapper_ptr->CopyForPush(place,
+                                      grad_values,
+                                      total_grad_values_gpu,
+                                      slot_lengths,
+                                      total_length,
+                                      batch_size,
+                                      grad_value_size,
+                                      slot_vector_,
+                                      slot_mf_dim_vector_);
 
     VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index
             << " len: " << total_length;
     push_gpups_timer.Start();
-    HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu,
+    HeterPs_->push_sparse(devid_2_index,
+                          total_keys,
+                          total_grad_values_gpu,
                           static_cast(total_length));
   }
   push_gpups_timer.Pause();
@@ -1248,14 +1385,21 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
   uint64_t* total_keys =
       reinterpret_cast(total_keys_tensor.data());
   VLOG(3) << "Begin copy grad tensor to xpups struct";
-  accessor_wrapper_ptr->CopyForPush(place, grad_values, total_grad_values_gpu,
-                                    slot_lengths, hidden_size, total_length,
-                                    batch_size, slot_vector_);
+  accessor_wrapper_ptr->CopyForPush(place,
+                                    grad_values,
+                                    total_grad_values_gpu,
+                                    slot_lengths,
+                                    hidden_size,
+                                    total_length,
+                                    batch_size,
+                                    slot_vector_);
 
   VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index
           << " len: " << total_length;
   push_gpups_timer.Start();
-  HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu,
+  HeterPs_->push_sparse(devid_2_index,
+                        total_keys,
+                        total_grad_values_gpu,
                         static_cast(total_length));
   push_gpups_timer.Pause();
 #endif
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
index 2a1f08293cd614..b69cbccd0c1c9a 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
@@ -363,7 +363,7 @@ class PSGPUWrapper {
     config["nodeid_slot"] =
         sparse_table_accessor.graph_sgd_param().nodeid_slot();
     config["feature_learning_rate"] =
         sparse_table_accessor.graph_sgd_param().feature_learning_rate();
-    if (accessor_class == "CtrDymfAccessor") {
+    if (accessor_class_ == "CtrDymfAccessor") {
       // optimizer config for embed_w and embedx
       add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param());
       add_sparse_optimizer(config, sparse_table_accessor.embedx_sgd_param(),