Skip to content

Commit

Permalink
[GPUPS]Config fleet optimize 2 (#39783)
Browse files Browse the repository at this point in the history
* update. test=develop

* update. test=develop

* fix. test=develop

* update. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* update. test=develop

* update. test=develop
  • Loading branch information
zmxdream authored Feb 22, 2022
1 parent 85a11c4 commit 0efa64c
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 0 deletions.
160 changes: 160 additions & 0 deletions paddle/fluid/framework/ps_gpu_trainer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <google/protobuf/text_format.h>
#include <cstdlib>
#include <string>
#include <vector>
Expand All @@ -20,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
Expand All @@ -44,6 +46,164 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
dense_grad_names_[table_id][j] = table.dense_grad_name(j);
}
}
// add for hbmps optimizer config
auto fleet_desc_str = trainer_desc.fleet_desc();
google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param);
auto sparse_table =
_ps_param.server_param().downpour_server_param().downpour_table_param(0);
auto sparse_table_accessor = sparse_table.accessor();
auto sparse_table_accessor_parameter =
sparse_table_accessor.downpour_accessor_param();
auto accessor_class = sparse_table_accessor.accessor_class();
// gpups' sparse table optimizer config
// now only support single sparse table
// auto sparse_table = param_.sparse_table(0);
std::unordered_map<std::string, float> config;
if (accessor_class == "DownpourFeatureValueAccessor" ||
accessor_class == "DownpourCtrAccessor" ||
accessor_class == "DownpourCtrDoubleAccessor") {
config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
config["learning_rate"] =
sparse_table_accessor.sparse_sgd_param().learning_rate();
config["initial_g2sum"] =
sparse_table_accessor.sparse_sgd_param().initial_g2sum();
config["initial_range"] =
sparse_table_accessor.sparse_sgd_param().initial_range();
if (sparse_table_accessor.sparse_sgd_param().weight_bounds_size() == 2) {
config["min_bound"] =
sparse_table_accessor.sparse_sgd_param().weight_bounds()[0];
config["max_bound"] =
sparse_table_accessor.sparse_sgd_param().weight_bounds()[1];
}
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
} else if (accessor_class == "DownpourSparseValueAccessor") {
auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name();
if (optimizer_name == "naive") {
config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param()
.naive()
.learning_rate();
config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param()
.naive()
.initial_range();
if (sparse_table_accessor.sparse_commonsgd_param()
.naive()
.weight_bounds_size() == 2) {
config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.naive()
.weight_bounds()[0];
config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.naive()
.weight_bounds()[1];
}
} else if (optimizer_name == "adagrad") {
config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.learning_rate();
config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.initial_range();
config["initial_g2sum"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.initial_g2sum();
if (sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.weight_bounds_size() == 2) {
config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.weight_bounds()[0];
config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.weight_bounds()[1];
}
} else if (optimizer_name == "adam") {
config["learning_rate"] =
sparse_table_accessor.sparse_commonsgd_param().adam().learning_rate();
config["initial_range"] =
sparse_table_accessor.sparse_commonsgd_param().adam().initial_range();
if (sparse_table_accessor.sparse_commonsgd_param()
.adam()
.weight_bounds_size() == 2) {
config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.adam()
.weight_bounds()[0];
config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.adam()
.weight_bounds()[1];
}
}
} else if (accessor_class == "DownpourUnitAccessor" ||
accessor_class == "DownpourDoubleUnitAccessor") {
config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name();
if (optimizer_name == "naive") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().naive().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().naive().initial_range();
if (sparse_table_accessor.embedx_sgd_param()
.naive()
.weight_bounds_size() == 2) {
config["mf_min_bound"] =
sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0];
config["mf_max_bound"] =
sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1];
}
} else if (optimizer_name == "adagrad") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_range();
config["mf_initial_g2sum"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum();
if (sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds_size() == 2) {
config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[0];
config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[1];
}
} else if (optimizer_name == "std_adagrad") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_range();
config["mf_initial_g2sum"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum();
if (sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds_size() == 2) {
config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[0];
config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[1];
}
} else if (optimizer_name == "adam") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().adam().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().adam().initial_range();
if (sparse_table_accessor.embedx_sgd_param()
.adam()
.weight_bounds_size() == 2) {
config["mf_min_bound"] =
sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0];
config["mf_max_bound"] =
sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1];
}
}
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
}

auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance();
ps_gpu_wrapper->InitializeGPUServer(config);

scale_datanorm_ = trainer_desc.scale_datanorm();
int place_num = trainer_desc.worker_places_size();
const std::vector<paddle::framework::DataFeed*> readers =
Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/framework/trainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ limitations under the License. */
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/phi/backends/dynload/port.h"

#ifdef PADDLE_WITH_PSLIB
#include <pslib.h>
#endif

namespace paddle {
namespace framework {

Expand Down Expand Up @@ -287,6 +291,9 @@ class PSGPUTrainer : public TrainerBase {
int mpi_rank_;
int mpi_size_;
int dump_file_num_;

// Fleet descriptor parsed from TrainerDesc.fleet_desc; source of the GPUPS
// sparse-table optimizer config
::paddle::PSParameter _ps_param;
};
#endif

Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/framework/trainer_desc.proto
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ message TrainerDesc {
repeated int32 trainers = 35;
optional int32 trainer_id = 36;

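// text-format serialized fleet descriptor; parsed by PSGPUTrainer to
// configure the GPUPS sparse-table optimizer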
optional string fleet_desc = 37;

// device worker parameters
optional HogwildWorkerParameter hogwild_param = 101;
optional DownpourWorkerParameter downpour_param = 103;
Expand Down
4 changes: 4 additions & 0 deletions python/paddle/fluid/trainer_desc.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ def _set_infer(self, infer):

def _set_fleet_desc(self, fleet_desc):
    """Remember the fleet descriptor and mirror it into the trainer proto.

    Args:
        fleet_desc: protobuf message describing the fleet; it is kept on
            ``self._fleet_desc`` as-is and also rendered to protobuf text
            format into ``self.proto_desc.fleet_desc``.
    """
    self._fleet_desc = fleet_desc
    # Imported lazily, after the attribute is stored, exactly where the
    # text rendering is needed.
    from google.protobuf import text_format
    # proto_desc.fleet_desc is a string field, so the message is stored in
    # its text-format representation.
    self.proto_desc.fleet_desc = text_format.MessageToString(fleet_desc)

def _gen_trainer_desc(self):
pass
Expand Down

0 comments on commit 0efa64c

Please sign in to comment.