From 17b36687017bbd34fdd50f51a93c00aae1fbed5c Mon Sep 17 00:00:00 2001
From: Fan Yu
Date: Tue, 5 Jan 2021 22:34:48 +0800
Subject: [PATCH] Fix building issue #2

---
 HugeCTR/include/parser.hpp                 |  226 +--
 HugeCTR/src/inference/embedding_cache.cu   |    2 +
 HugeCTR/src/inference/parameter_server.cpp |    2 +
 HugeCTR/src/parser.cpp                     | 1432 ++++++++++++++++++--
 4 files changed, 1402 insertions(+), 260 deletions(-)

diff --git a/HugeCTR/include/parser.hpp b/HugeCTR/include/parser.hpp
index 006cc874d7..5a28f6a5d0 100644
--- a/HugeCTR/include/parser.hpp
+++ b/HugeCTR/include/parser.hpp
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -29,45 +28,6 @@

 namespace HugeCTR {

-nlohmann::json read_json_file(const std::string& filename);
-
-struct SolverParser {
-  // std::string configure_file;
-  unsigned long long seed;     /**< seed of data simulator */
-  LrPolicy_t lr_policy;        /**< the only fixed lr is supported now. */
-  int display;                 /**< the interval of loss display. */
-  int max_iter;                /**< the number of iterations for training */
-  int num_epochs;              /**< the number of epochs for training */
-  int snapshot;                /**< the number of iterations for a snapshot */
-  std::string snapshot_prefix; /**< naming prefix of snapshot file */
-  int eval_interval;           /**< the interval of evaluations */
-  int eval_batches;            /**< the number of batches for evaluations */
-  int batchsize_eval;          /**< batchsize for eval */
-  int batchsize;               /**< batchsize */
-  std::string model_file;      /**< name of model file */
-  std::vector<std::string> embedding_files; /**< name of embedding file */
-  std::vector<std::vector<int>> vvgpu;      /**< device map */
-  bool use_mixed_precision;
-  float scaler;
-  std::map<metrics::Type, float> metrics_spec;
-  bool i64_input_key;
-  bool use_algorithm_search;
-  bool use_cuda_graph;
-  SolverParser(const std::string& file);
-  SolverParser() {}
-};
-struct InferenceParser {
-  // std::string configure_file;
-  size_t max_batchsize;         /**< batchsize */
-  std::string dense_model_file; /**< name of model file */
-  std::vector<std::string> sparse_model_files; /**< name of embedding file */
-  bool use_mixed_precision;
-  float scaler;
-  bool use_algorithm_search;
-  bool use_cuda_graph;
-  InferenceParser(const nlohmann::json& config);
-};
-
 /**
  * @brief The parser of configure file (in json format).
  *
@@ -91,37 +51,42 @@ class Parser {
   const bool use_algorithm_search_;
   const bool use_cuda_graph_;

-  template <typename TypeKey>
-  void create_pipeline_internal(std::shared_ptr<IDataReader>& data_reader,
-                                std::shared_ptr<IDataReader>& data_reader_eval,
-                                std::vector<std::shared_ptr<IEmbedding>>& embedding,
-                                std::vector<std::unique_ptr<Network>>& network,
-                                const std::shared_ptr<ResourceManager>& resource_manager);
-
-  template
-  void create_pipeline_inference(const InferenceParser& inference_parser,
-                                 Tensor2<float>& dense_input,
-                                 std::vector<std::shared_ptr<Tensor2<int>>>& rows,
-                                 std::vector<std::shared_ptr<Tensor2<float>>>& embeddingvecs,
-                                 std::vector<size_t>& embedding_table_slot_size,
-                                 std::vector<std::shared_ptr<IEmbedding>>* embedding,
-                                 Network** network,
-                                 const std::shared_ptr<ResourceManager> resource_manager);
-
  public:
-  std::vector<TensorEntry> tensor_entries;
   /**
    * Ctor.
    * Ctor only verifies the configure file; it doesn't create the pipeline.
   */
-  Parser(const std::string& configure_file, size_t batch_size, size_t batch_size_eval,
-         bool repeat_dataset, bool i64_input_key = false, bool use_mixed_precision = false,
-         float scaler = 1.0f, bool use_algorithm_search = true, bool use_cuda_graph = true);
-  /**
-   * Ctor.
- * Ctor used in inference stage - */ - Parser(const nlohmann::json& config); + Parser(const std::string& configure_file, + size_t batch_size, + size_t batch_size_eval, + bool repeat_dataset, + bool i64_input_key = false, + bool use_mixed_precision = false, + float scaler = 1.0f, + bool use_algorithm_search = true, + bool use_cuda_graph = true) + : batch_size_(batch_size), + batch_size_eval_(batch_size_eval), + repeat_dataset_(repeat_dataset), + i64_input_key_(i64_input_key), + use_mixed_precision_(use_mixed_precision), + scaler_(scaler), + use_algorithm_search_(use_algorithm_search), + use_cuda_graph_(use_cuda_graph) { + try { + std::ifstream file(configure_file); + if (!file.is_open()) { + CK_THROW_(Error_t::FileCannotOpen, "file.is_open() failed: " + configure_file); + } + file >> config_; + file.close(); + } catch (const std::runtime_error& rt_err) { + std::cerr << rt_err.what() << std::endl; + throw; + } + return; + } /** * Create the pipeline, which includes data reader, embedding. @@ -132,15 +97,14 @@ class Parser { std::vector>& network, const std::shared_ptr& resource_manager); - /** - * Create inference pipeline, which only creates network and embedding - */ - void create_pipeline(const InferenceParser& inference_parser, Tensor2& dense_input, - std::vector>>& row, - std::vector>>& embeddingvec, - std::vector& embedding_table_slot_size, - std::vector>* embedding, Network** network, - const std::shared_ptr resource_manager); + template + friend void create_pipeline_internal(std::shared_ptr& data_reader, + std::shared_ptr& data_reader_eval, + std::vector>& embedding, + std::vector>& network, + const std::shared_ptr& resource_manager, + Parser& parser); + }; std::unique_ptr get_learning_rate_scheduler( @@ -150,6 +114,32 @@ std::unique_ptr get_learning_rate_scheduler( * Solver Parser. * This class is designed to parse the solver clause of the configure file. */ +struct SolverParser { + // std::string configure_file; + unsigned long long seed; /**< seed of data simulator */ + LrPolicy_t lr_policy; /**< the only fixed lr is supported now. */ + int display; /**< the interval of loss display. 
*/ + int max_iter; /**< the number of iterations for training */ + int num_epochs; /**< the number of epochs for training */ + int snapshot; /**< the number of iterations for a snapshot */ + std::string snapshot_prefix; /**< naming prefix of snapshot file */ + int eval_interval; /**< the interval of evaluations */ + int eval_batches; /**< the number of batches for evaluations */ + int batchsize_eval; /**< batchsize for eval */ + int batchsize; /**< batchsize */ + std::string model_file; /**< name of model file */ + std::vector embedding_files; /**< name of embedding file */ + std::vector> vvgpu; /**< device map */ + bool use_mixed_precision; + float scaler; + std::map metrics_spec; + bool i64_input_key; + bool use_algorithm_search; + bool use_cuda_graph; + SolverParser(const std::string& file); + SolverParser(){} +}; + template struct SparseInput { @@ -196,49 +186,6 @@ struct SparseInput { } \ } while (0) -const std::map LAYER_TYPE_MAP = { - {"BatchNorm", Layer_t::BatchNorm}, - {"BinaryCrossEntropyLoss", Layer_t::BinaryCrossEntropyLoss}, - {"Concat", Layer_t::Concat}, - {"CrossEntropyLoss", Layer_t::CrossEntropyLoss}, - {"Dropout", Layer_t::Dropout}, - {"ELU", Layer_t::ELU}, - {"InnerProduct", Layer_t::InnerProduct}, - {"Interaction", Layer_t::Interaction}, - {"MultiCrossEntropyLoss", Layer_t::MultiCrossEntropyLoss}, - {"ReLU", Layer_t::ReLU}, - {"Reshape", Layer_t::Reshape}, - {"Sigmoid", Layer_t::Sigmoid}, - {"Slice", Layer_t::Slice}, - {"Multiply", Layer_t::Multiply}, - {"FmOrder2", Layer_t::FmOrder2}, - {"Add", Layer_t::Add}, - {"ReduceSum", Layer_t::ReduceSum}, - {"MultiCross", Layer_t::MultiCross}, - {"DotProduct", Layer_t::DotProduct}}; -const std::map LAYER_TYPE_MAP_MP = { - {"BinaryCrossEntropyLoss", Layer_t::BinaryCrossEntropyLoss}, - {"Concat", Layer_t::Concat}, - {"Cast", Layer_t::Cast}, - {"InnerProduct", Layer_t::InnerProduct}, - {"FusedInnerProduct", Layer_t::FusedInnerProduct}, - {"Interaction", Layer_t::Interaction}, - {"Reshape", Layer_t::Reshape}, - {"Sigmoid", Layer_t::Sigmoid}, - {"Slice", Layer_t::Slice}, - {"ReLU", Layer_t::ReLU}, - {"Dropout", Layer_t::Dropout}, - {"Add", Layer_t::Add}}; -const std::map EMBEDDING_TYPE_MAP = { - {"DistributedSlotSparseEmbeddingHash", Embedding_t::DistributedSlotSparseEmbeddingHash}, - {"LocalizedSlotSparseEmbeddingHash", Embedding_t::LocalizedSlotSparseEmbeddingHash}, - {"LocalizedSlotSparseEmbeddingOneHot", Embedding_t::LocalizedSlotSparseEmbeddingOneHot}}; -const std::map INITIALIZER_TYPE_MAP = { - {"Uniform", Initializer_t::Uniform}, - {"XavierNorm", Initializer_t::XavierNorm}, - {"XavierUniform", Initializer_t::XavierUniform}, - {"Zero", Initializer_t::Zero}}; - static const std::map OPTIMIZER_TYPE_MAP = { {"Adam", Optimizer_t::Adam}, {"MomentumSGD", Optimizer_t::MomentumSGD}, @@ -246,7 +193,9 @@ static const std::map OPTIMIZER_TYPE_MAP = { {"SGD", Optimizer_t::SGD}}; static const std::map UPDATE_TYPE_MAP = { - {"Local", Update_t::Local}, {"Global", Update_t::Global}, {"LazyGlobal", Update_t::LazyGlobal}}; + {"Local", Update_t::Local}, + {"Global", Update_t::Global}, + {"LazyGlobal", Update_t::LazyGlobal}}; static const std::map REGULARIZER_TYPE_MAP = { {"L1", Regularizer_t::L1}, @@ -286,40 +235,11 @@ inline T get_value_from_json_soft(const nlohmann::json& json, const std::string } } -template -struct get_optimizer_param { - OptParams operator()(const nlohmann::json& j_optimizer); -}; - -template -struct create_embedding { - void operator()(std::map>& sparse_input_map, - std::vector* tensor_entries_list, - std::vector>& 
embedding, Embedding_t embedding_type,
-                  const nlohmann::json& config,
-                  const std::shared_ptr<ResourceManager>& resource_manager, size_t batch_size,
-                  size_t batch_size_eval, bool use_mixed_precision, float scaler,
-                  const nlohmann::json& j_layers);
-
-  void operator()(const InferenceParser& inference_parser, const nlohmann::json& j_layers_array,
-                  std::vector<std::shared_ptr<Tensor2<int>>>& rows,
-                  std::vector<std::shared_ptr<Tensor2<float>>>& embeddingvecs,
-                  std::vector<size_t>& embedding_table_slot_size,
-                  std::vector<TensorEntry>* tensor_entries,
-                  std::vector<std::shared_ptr<IEmbedding>>* embeddings,
-                  const std::shared_ptr<GPUResource> gpu_resource,
-                  std::shared_ptr<GeneralBuffer2<CudaAllocator>>& blobs_buff);
-};
-
-template <typename TypeKey>
-struct create_datareader {
-  void operator()(const nlohmann::json& j,
-                  std::map<std::string, SparseInput<TypeKey>>& sparse_input_map,
-                  std::vector<TensorEntry>* tensor_entries_list,
-                  std::shared_ptr<IDataReader>& data_reader,
-                  std::shared_ptr<IDataReader>& data_reader_eval, size_t batch_size,
-                  size_t batch_size_eval, bool use_mixed_precision, bool repeat_dataset,
-                  const std::shared_ptr<ResourceManager> resource_manager);
-};
+void parse_data_layer_helper(const nlohmann::json& j, int& label_dim, int& dense_dim,
+                             Check_t& check_type, std::string& source_data,
+                             std::vector<DataReaderSparseParam>& data_reader_sparse_param_array,
+                             std::string& eval_source, std::string& top_strs_label,
+                             std::string& top_strs_dense, std::vector<std::string>& sparse_names,
+                             std::map<std::string, SparseInput<long long>>& sparse_input_map);

 }  // namespace HugeCTR
diff --git a/HugeCTR/src/inference/embedding_cache.cu b/HugeCTR/src/inference/embedding_cache.cu
index 08a6590e7b..7c3b84c2e1 100644
--- a/HugeCTR/src/inference/embedding_cache.cu
+++ b/HugeCTR/src/inference/embedding_cache.cu
@@ -17,6 +17,8 @@
 #include

 namespace HugeCTR {
+// Temporary interface; should be deleted later.
+nlohmann::json read_json_file(const std::string& filename);

 // Kernels to combine the value buffer
 __global__ void merge_emb_vec(
diff --git a/HugeCTR/src/inference/parameter_server.cpp b/HugeCTR/src/inference/parameter_server.cpp
index af2d97b935..aa6cddc19c 100644
--- a/HugeCTR/src/inference/parameter_server.cpp
+++ b/HugeCTR/src/inference/parameter_server.cpp
@@ -17,6 +17,8 @@
 #include

 namespace HugeCTR {
+// Temporary interface; should be deleted later.
+nlohmann::json read_json_file(const std::string& filename);

 template <typename TypeHashKey>
 parameter_server<TypeHashKey>::parameter_server(const std::string& framework_name,
diff --git a/HugeCTR/src/parser.cpp b/HugeCTR/src/parser.cpp
index 6be1b4dc9c..aa5015bcc7 100644
--- a/HugeCTR/src/parser.cpp
+++ b/HugeCTR/src/parser.cpp
@@ -37,7 +37,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -53,55 +52,1152 @@

 namespace HugeCTR {

-nlohmann::json read_json_file(const std::string& filename) {
-  nlohmann::json config;
-  std::ifstream file_stream(filename);
-  if (!file_stream.is_open()) {
-    CK_THROW_(Error_t::FileCannotOpen, "file_stream.is_open() failed: " + filename);
+struct InputOutputInfo {
+  std::vector<TensorBag2> train_input;
+  std::vector<TensorBag2> evaluate_input;
+  std::vector<std::string> output;
+};
+
+static bool get_tensor_from_entries(const std::vector<TensorEntry> tensor_entries,
+                                    const std::string& name, TensorUse use, TensorBag2* bag) {
+  if (use == TensorUse::General) {
+    CK_THROW_(Error_t::WrongInput, "Type should not be general");
+  }
+  for (const TensorEntry& entry : tensor_entries) {
+    if (entry.name == name && (entry.use == TensorUse::General || entry.use == use)) {
+      *bag = entry.bag;
+      return true;
+    }
+  }
+  return false;
+}
+
+static std::vector<std::string> get_layer_names(const nlohmann::json& json) {
+  std::vector<std::string> layer_names;
+  if (json.is_array()) {
+    for (auto j : json) {
+      layer_names.push_back(j.get<std::string>());
+    }
+  } else {
+    layer_names.push_back(json.get<std::string>());
+  }
+
+  return layer_names;
+}
+
+static InputOutputInfo
get_input_tensor_and_output_name( + const nlohmann::json& json, const std::vector& tensor_entries) { + auto bottom = get_json(json, "bottom"); + std::vector bottom_strs = get_layer_names(bottom); + + auto top = get_json(json, "top"); + std::vector top_strs = get_layer_names(top); + + std::vector bottom_train_tensors; + std::vector bottom_evaluate_tensors; + + for (auto& bstr : bottom_strs) { + for (auto& tstr : top_strs) { + if (bstr == tstr) { + CK_THROW_(Error_t::WrongInput, "bottom and top include a same layer name"); + } + } + TensorBag2 tensor; + if (!get_tensor_from_entries(tensor_entries, bstr, TensorUse::Train, &tensor)) { + CK_THROW_(Error_t::WrongInput, "No such bottom: " + bstr); + } + bottom_train_tensors.push_back(tensor); + if (!get_tensor_from_entries(tensor_entries, bstr, TensorUse::Evaluate, &tensor)) { + CK_THROW_(Error_t::WrongInput, "No such bottom: " + bstr); + } + bottom_evaluate_tensors.push_back(tensor); } - file_stream >> config; - file_stream.close(); - return config; + return {bottom_train_tensors, bottom_evaluate_tensors, top_strs}; +} + +struct TensorPair { + TensorBag2 tensor; + std::string name; +}; + +static void add_tensor_to_network(TensorPair& output_tensor_pair, + std::vector& tensor_entries) { + tensor_entries.push_back( + {output_tensor_pair.name, TensorUse::General, output_tensor_pair.tensor}); } -Parser::Parser(const std::string& configure_file, size_t batch_size, size_t batch_size_eval, - bool repeat_dataset, bool i64_input_key, bool use_mixed_precision, float scaler, - bool use_algorithm_search, bool use_cuda_graph) - : config_(read_json_file(configure_file)), - batch_size_(batch_size), - batch_size_eval_(batch_size_eval), - repeat_dataset_(repeat_dataset), - i64_input_key_(i64_input_key), - use_mixed_precision_(use_mixed_precision), - scaler_(scaler), - use_algorithm_search_(use_algorithm_search), - use_cuda_graph_(use_cuda_graph) {} - -Parser::Parser(const nlohmann::json& config) - : config_(config), - batch_size_(1), - batch_size_eval_(1), - repeat_dataset_(false), - i64_input_key_(false), - use_mixed_precision_(false), - scaler_(1.0f), - use_algorithm_search_(true), - use_cuda_graph_(true) {} +template +static OptParams get_optimizer_param(const nlohmann::json& j_optimizer) { + // create optimizer + auto optimizer_name = get_value_from_json(j_optimizer, "type"); + Optimizer_t optimizer_type; + if (!find_item_in_map(optimizer_type, optimizer_name, OPTIMIZER_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such optimizer: " + optimizer_name); + } + + OptHyperParams opt_hyper_params; + memset(&opt_hyper_params, 0, sizeof(opt_hyper_params)); + OptParams opt_params; + + Update_t update_type = Update_t::Local; + if (has_key_(j_optimizer, "update_type")) { + std::string update_name = get_value_from_json(j_optimizer, "update_type"); + if (!find_item_in_map(update_type, update_name, UPDATE_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such update type: " + update_name); + } + } else if (has_key_(j_optimizer, "global_update")) { + bool global_update = get_value_from_json(j_optimizer, "global_update"); + if (global_update) update_type = Update_t::Global; + } else { + MESSAGE_("update_type is not specified, using default: local"); + } + + switch (optimizer_type) { + case Optimizer_t::Adam: { + auto j_hparam = get_json(j_optimizer, "adam_hparam"); + float learning_rate = get_value_from_json(j_hparam, "learning_rate"); + float beta1 = get_value_from_json(j_hparam, "beta1"); + float beta2 = get_value_from_json(j_hparam, "beta2"); + float epsilon = 
get_value_from_json(j_hparam, "epsilon"); + opt_hyper_params.adam.beta1 = beta1; + opt_hyper_params.adam.beta2 = beta2; + opt_hyper_params.adam.epsilon = epsilon; + opt_params = {Optimizer_t::Adam, learning_rate, opt_hyper_params, update_type}; + break; + } + case Optimizer_t::MomentumSGD: { + auto j_hparam = get_json(j_optimizer, "momentum_sgd_hparam"); + float learning_rate = get_value_from_json(j_hparam, "learning_rate"); + float momentum_factor = get_value_from_json(j_hparam, "momentum_factor"); + opt_hyper_params.momentum.factor = momentum_factor; + opt_params = {Optimizer_t::MomentumSGD, learning_rate, opt_hyper_params, update_type}; + break; + } + case Optimizer_t::Nesterov: { + auto j_hparam = get_json(j_optimizer, "nesterov_hparam"); + float learning_rate = get_value_from_json(j_hparam, "learning_rate"); + float momentum_factor = get_value_from_json(j_hparam, "momentum_factor"); + opt_hyper_params.nesterov.mu = momentum_factor; + opt_params = {Optimizer_t::Nesterov, learning_rate, opt_hyper_params, update_type}; + break; + } + case Optimizer_t::SGD: { + auto j_hparam = get_json(j_optimizer, "sgd_hparam"); + auto learning_rate = get_value_from_json(j_hparam, "learning_rate"); + if (has_key_(j_hparam, "atomic_update")) { + opt_hyper_params.sgd.atomic_update = get_value_from_json(j_hparam, "atomic_update"); + } + opt_params = {Optimizer_t::SGD, learning_rate, opt_hyper_params, update_type}; + break; + } + default: + assert(!"Error: no such optimizer && should never get here!"); + } + return opt_params; +} + +template +static std::shared_ptr> create_regularizer( + const nlohmann::json& j, const Tensor2& weight_buff, const Tensor2& wgrad_buff, + const int batch_size, const std::shared_ptr& gpu_resource) { + std::shared_ptr> reg( + new NoRegularizer(weight_buff, wgrad_buff, batch_size, gpu_resource)); + auto reg_it = j.find("regularizer"); + if (reg_it != j.end()) { + Regularizer_t reg_type; + auto reg_name = reg_it->get(); + if (!find_item_in_map(reg_type, reg_name, REGULARIZER_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such regularizer: " + reg_name); + } + switch (reg_type) { + case Regularizer_t::L1: { + const auto lambda = get_value_from_json(j, "lambda"); + reg.reset(new L1Regularizer(weight_buff, wgrad_buff, batch_size, lambda, gpu_resource)); + break; + } + case Regularizer_t::L2: { + const auto lambda = get_value_from_json(j, "lambda"); + reg.reset(new L2Regularizer(weight_buff, wgrad_buff, batch_size, lambda, gpu_resource)); + break; + } + default: { assert(!"Error: no such regularizer!"); } + } + } + return reg; +} + +const std::map LAYER_TYPE_MAP = { + {"BatchNorm", Layer_t::BatchNorm}, + {"BinaryCrossEntropyLoss", Layer_t::BinaryCrossEntropyLoss}, + {"Concat", Layer_t::Concat}, + {"CrossEntropyLoss", Layer_t::CrossEntropyLoss}, + {"Dropout", Layer_t::Dropout}, + {"ELU", Layer_t::ELU}, + {"InnerProduct", Layer_t::InnerProduct}, + {"Interaction", Layer_t::Interaction}, + {"MultiCrossEntropyLoss", Layer_t::MultiCrossEntropyLoss}, + {"ReLU", Layer_t::ReLU}, + {"Reshape", Layer_t::Reshape}, + {"Slice", Layer_t::Slice}, + {"Multiply", Layer_t::Multiply}, + {"FmOrder2", Layer_t::FmOrder2}, + {"Add", Layer_t::Add}, + {"ReduceSum", Layer_t::ReduceSum}, + {"MultiCross", Layer_t::MultiCross}, + {"DotProduct", Layer_t::DotProduct}}; +const std::map LAYER_TYPE_MAP_MP = { + {"BinaryCrossEntropyLoss", Layer_t::BinaryCrossEntropyLoss}, + {"Concat", Layer_t::Concat}, + {"Cast", Layer_t::Cast}, + {"InnerProduct", Layer_t::InnerProduct}, + {"FusedInnerProduct", 
Layer_t::FusedInnerProduct}, + {"Interaction", Layer_t::Interaction}, + {"Reshape", Layer_t::Reshape}, + {"Slice", Layer_t::Slice}, + {"ReLU", Layer_t::ReLU}, + {"Dropout", Layer_t::Dropout}, + {"Add", Layer_t::Add}}; +const std::map EMBEDDING_TYPE_MAP = { + {"DistributedSlotSparseEmbeddingHash", Embedding_t::DistributedSlotSparseEmbeddingHash}, + {"LocalizedSlotSparseEmbeddingHash", Embedding_t::LocalizedSlotSparseEmbeddingHash}, + {"LocalizedSlotSparseEmbeddingOneHot", Embedding_t::LocalizedSlotSparseEmbeddingOneHot}}; +const std::map INITIALIZER_TYPE_MAP = { + {"Uniform", Initializer_t::Uniform}, + {"XavierNorm", Initializer_t::XavierNorm}, + {"XavierUniform", Initializer_t::XavierUniform}, + {"Zero", Initializer_t::Zero}}; + +/* + * Create single network + * + */ +Network* create_network(const nlohmann::json& j_array, const nlohmann::json& j_optimizer, + std::vector& tensor_entries, int num_networks_in_global, + const std::shared_ptr& cpu_resource, + const std::shared_ptr& gpu_resource, bool use_mixed_precision, + float scaler, bool use_algorithm_search, bool use_cuda_graph) { + std::unique_ptr network( + new Network(cpu_resource, gpu_resource, use_mixed_precision, use_cuda_graph)); + + auto& layers = network->layers_; + auto& loss_tensor = network->loss_tensor_; + auto& loss = network->loss_; + + std::shared_ptr> blobs_buff = + GeneralBuffer2::create(); + + std::shared_ptr> weight_buff = blobs_buff->create_block(); + std::shared_ptr> weight_buff_half = blobs_buff->create_block<__half>(); + std::shared_ptr> wgrad_buff = blobs_buff->create_block(); + std::shared_ptr> wgrad_buff_half = blobs_buff->create_block<__half>(); + + assert(layers.empty()); + + for (unsigned int i = 1; i < j_array.size(); i++) { + const nlohmann::json& j = j_array[i]; + const auto layer_type_name = get_value_from_json(j, "type"); + Layer_t layer_type; + + const auto& layer_map = use_mixed_precision ? 
LAYER_TYPE_MAP_MP : LAYER_TYPE_MAP; + + if (!find_item_in_map(layer_type, layer_type_name, layer_map)) { + Embedding_t embedding_type; + if (!find_item_in_map(embedding_type, layer_type_name, EMBEDDING_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such layer: " + layer_type_name); + } + continue; + } + + std::vector output_tensor_pairs; + auto input_output_info = get_input_tensor_and_output_name(j, tensor_entries); + switch (layer_type) { + case Layer_t::BatchNorm: { + Tensor2 bn_in_tensor = + Tensor2::stretch_from(input_output_info.train_input[0]); + // establish out tensor + Tensor2 bn_out_tensor; + blobs_buff->reserve(bn_in_tensor.get_dimensions(), &bn_out_tensor); + output_tensor_pairs.push_back({bn_out_tensor.shrink(), input_output_info.output[0]}); + + // get BN params + auto j_bn_hparam = get_json(j, "bn_param"); + auto factor = get_value_from_json(j_bn_hparam, "factor"); + auto eps = get_value_from_json(j_bn_hparam, "eps"); + // establish initializer + std::vector initializer_types(2, Initializer_t::Default); + if (has_key_(j_bn_hparam, "gamma_init")) { + const auto gamma_init_name = get_value_from_json(j_bn_hparam, "gamma_init"); + Initializer_t gamma_init_type; + if (!find_item_in_map(gamma_init_type, gamma_init_name, INITIALIZER_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such initializer: " + gamma_init_name); + } else { + initializer_types[0] = gamma_init_type; + } + } + if (has_key_(j_bn_hparam, "beta_init")) { + const auto beta_init_name = get_value_from_json(j_bn_hparam, "beta_init"); + Initializer_t beta_init_type; + if (!find_item_in_map(beta_init_type, beta_init_name, INITIALIZER_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such initializer: " + beta_init_name); + } else { + initializer_types[1] = beta_init_type; + } + } + + BatchNormLayer::Params params = {factor, eps}; + layers.emplace_back(new BatchNormLayer(weight_buff, wgrad_buff, blobs_buff, bn_in_tensor, + bn_out_tensor, params, gpu_resource, + initializer_types)); + break; + } + case Layer_t::BinaryCrossEntropyLoss: { + if (input_output_info.train_input.size() != 2 || + input_output_info.evaluate_input.size() != 2) { + CK_THROW_(Error_t::WrongInput, "bottom of BinaryCrossEntropyLoss must be two dim"); + } + Tensor2 train_label_tensor = + Tensor2::stretch_from(input_output_info.train_input[1]); + Tensor2 evaluate_label_tensor = + Tensor2::stretch_from(input_output_info.evaluate_input[1]); + blobs_buff->reserve({1, 1}, &loss_tensor); + if (use_mixed_precision) { + Tensor2<__half> train_in_tensor = + Tensor2<__half>::stretch_from(input_output_info.train_input[0]); + Tensor2<__half> evaluate_in_tensor = + Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); + + loss.reset(new BinaryCrossEntropyLoss<__half>( + train_label_tensor, train_in_tensor, evaluate_label_tensor, evaluate_in_tensor, + loss_tensor, + create_regularizer(j, weight_buff->as_tensor(), wgrad_buff_half->as_tensor(), + train_in_tensor.get_dimensions()[0], gpu_resource), + gpu_resource, num_networks_in_global, scaler)); + } else { + Tensor2 train_in_tensor = + Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 evaluate_in_tensor = + Tensor2::stretch_from(input_output_info.evaluate_input[0]); + + loss.reset(new BinaryCrossEntropyLoss( + train_label_tensor, train_in_tensor, evaluate_label_tensor, evaluate_in_tensor, + loss_tensor, + create_regularizer(j, weight_buff->as_tensor(), wgrad_buff->as_tensor(), + train_in_tensor.get_dimensions()[0], gpu_resource), + gpu_resource, num_networks_in_global, scaler)); + } 
+        break;
+      }
+      case Layer_t::Concat: {
+        if (use_mixed_precision) {
+          Tensors2<__half> train_in_tensors;
+          for (const TensorBag2& t : input_output_info.train_input) {
+            train_in_tensors.push_back(Tensor2<__half>::stretch_from(t));
+          }
+          Tensors2<__half> evaluate_in_tensors;
+          for (const TensorBag2& t : input_output_info.evaluate_input) {
+            evaluate_in_tensors.push_back(Tensor2<__half>::stretch_from(t));
+          }
+          Tensor2<__half> out_tensor;
+          layers.emplace_back(new ConcatLayer<__half>(train_in_tensors, evaluate_in_tensors,
+                                                      out_tensor, blobs_buff, gpu_resource));
+          output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]});
+        } else {
+          Tensors2<float> train_in_tensors;
+          for (const TensorBag2& t : input_output_info.train_input) {
+            train_in_tensors.push_back(Tensor2<float>::stretch_from(t));
+          }
+          Tensors2<float> evaluate_in_tensors;
+          for (const TensorBag2& t : input_output_info.evaluate_input) {
+            evaluate_in_tensors.push_back(Tensor2<float>::stretch_from(t));
+          }
+          Tensor2<float> out_tensor;
+          layers.emplace_back(new ConcatLayer<float>(train_in_tensors, evaluate_in_tensors,
+                                                     out_tensor, blobs_buff, gpu_resource));
+          output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]});
+        }
+        break;
+      }
+      case Layer_t::CrossEntropyLoss: {
+        if (input_output_info.train_input.size() != 2) {
+          CK_THROW_(Error_t::WrongInput, "bottom of CrossEntropyLoss must be two dim");
+        }
+        Tensor2<float> label_tensor =
+            Tensor2<float>::stretch_from(input_output_info.train_input[1]);
+        blobs_buff->reserve({1, 1}, &loss_tensor);
+        if (use_mixed_precision) {
+          Tensor2<__half> cross_entropy_loss_in_tensor =
+              Tensor2<__half>::stretch_from(input_output_info.train_input[0]);
+
+          loss.reset(new CrossEntropyLoss<__half>(
+              label_tensor, cross_entropy_loss_in_tensor, loss_tensor,
+              create_regularizer(j, weight_buff->as_tensor(), wgrad_buff_half->as_tensor(),
+                                 cross_entropy_loss_in_tensor.get_dimensions()[0], gpu_resource),
+              gpu_resource, num_networks_in_global, scaler));
+        } else {
+          Tensor2<float> cross_entropy_loss_in_tensor =
+              Tensor2<float>::stretch_from(input_output_info.train_input[0]);
+
+          loss.reset(new CrossEntropyLoss<float>(
+              label_tensor, cross_entropy_loss_in_tensor, loss_tensor,
+              create_regularizer(j, weight_buff->as_tensor(), wgrad_buff->as_tensor(),
+                                 cross_entropy_loss_in_tensor.get_dimensions()[0], gpu_resource),
+              gpu_resource, num_networks_in_global, scaler));
+        }
+        break;
+      }
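+      // Dropout is built out-of-place: the output blob has the same shape as the
+      // input. Depending on PREFER_CUDNN it is backed either by the handwritten
+      // DropoutLayer or by the cuDNN-based DropoutCudnnLayer. CUDA-graph capture
+      // is disabled for this network below, likely because the per-iteration
+      // dropout mask does not replay correctly under a captured graph.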
+      case Layer_t::Dropout: {
+        if (use_mixed_precision) {
+          Tensor2<__half> do_in_tensor =
+              Tensor2<__half>::stretch_from(input_output_info.train_input[0]);
+          // establish out tensor
+          Tensor2<__half> do_out_tensor;
+          blobs_buff->reserve(do_in_tensor.get_dimensions(), &do_out_tensor);
+          output_tensor_pairs.push_back({do_out_tensor.shrink(), input_output_info.output[0]});
+          // get dropout params
+          auto rate_it = j.find("rate");
+          auto rate = (rate_it != j.end()) ? rate_it->get<float>() : 0.5f;
+#ifndef PREFER_CUDNN
+          layers.emplace_back(new DropoutLayer<__half>(do_in_tensor, do_out_tensor, blobs_buff,
+                                                       rate, gpu_resource));
+#else
+          layers.emplace_back(new DropoutCudnnLayer<__half>(do_in_tensor, do_out_tensor,
+                                                            blobs_buff, rate, gpu_resource));
+#endif
+        } else {
+          // establish out tensor
+          Tensor2<float> do_in_tensor =
+              Tensor2<float>::stretch_from(input_output_info.train_input[0]);
+          Tensor2<float> do_out_tensor;
+          blobs_buff->reserve(do_in_tensor.get_dimensions(), &do_out_tensor);
+          output_tensor_pairs.push_back({do_out_tensor.shrink(), input_output_info.output[0]});
+          // get dropout params
+          auto rate_it = j.find("rate");
+          auto rate = (rate_it != j.end()) ? rate_it->get<float>() : 0.5f;
+#ifndef PREFER_CUDNN
+          layers.emplace_back(
+              new DropoutLayer<float>(do_in_tensor, do_out_tensor, blobs_buff, rate, gpu_resource));
+#else
+          layers.emplace_back(new DropoutCudnnLayer<float>(do_in_tensor, do_out_tensor, blobs_buff,
+                                                           rate, gpu_resource));
+#endif
+        }
+        network->enable_cuda_graph_ = false;
+
+        break;
+      }
+      case Layer_t::ELU: {
+        Tensor2<float> elu_in_tensor =
+            Tensor2<float>::stretch_from(input_output_info.train_input[0]);
+
+        // establish out tensor
+        Tensor2<float> elu_out_tensor;
+        blobs_buff->reserve(elu_in_tensor.get_dimensions(), &elu_out_tensor);
+        output_tensor_pairs.push_back({elu_out_tensor.shrink(), input_output_info.output[0]});
+        // get ELU params
+        auto j_elu_hparam = get_json(j, "elu_param");
+        auto alpha = get_value_from_json<float>(j_elu_hparam, "alpha");
+        layers.emplace_back(new EluLayer(elu_in_tensor, elu_out_tensor, alpha, gpu_resource));
+
+        break;
+      }
+
+      case Layer_t::FusedInnerProduct: {
+        auto j_fc_param = get_json(j, "fc_param");
+        // establish initializer
+        std::vector<Initializer_t> initializer_types(2, Initializer_t::Default);
+        if (has_key_(j_fc_param, "weight_init")) {
+          const auto weight_init_name = get_value_from_json<std::string>(j_fc_param, "weight_init");
+          Initializer_t weight_init_type;
+          if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) {
+            CK_THROW_(Error_t::WrongInput, "No such initializer: " + weight_init_name);
+          } else {
+            initializer_types[0] = weight_init_type;
+          }
+        }
+        if (has_key_(j_fc_param, "bias_init")) {
+          const auto bias_init_name = get_value_from_json<std::string>(j_fc_param, "bias_init");
+          Initializer_t bias_init_type;
+          if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) {
+            CK_THROW_(Error_t::WrongInput, "No such initializer: " + bias_init_name);
+          } else {
+            initializer_types[1] = bias_init_type;
+          }
+        }
+        // establish out tensor
+        auto output = get_value_from_json<size_t>(j_fc_param, "num_output");
+        if (use_mixed_precision) {
+          Tensor2<__half> train_in_tensor =
+              Tensor2<__half>::stretch_from(input_output_info.train_input[0]);
+          Tensor2<__half> evaluate_in_tensor =
+              Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]);
+          Tensor2<__half> fc_out_tensor;
+          blobs_buff->reserve({(train_in_tensor.get_dimensions())[0], output}, &fc_out_tensor);
+          output_tensor_pairs.push_back({fc_out_tensor.shrink(), input_output_info.output[0]});
+
+          // establish layer
+          layers.emplace_back(new FusedFullyConnectedLayer(
+              weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, train_in_tensor,
+              evaluate_in_tensor, fc_out_tensor, gpu_resource, initializer_types));
+        } else {
+          CK_THROW_(Error_t::WrongInput, "FusedInnerProduct supports half only");
+        }
+        break;
+      }
+
+      case Layer_t::Cast: {
+        if (use_mixed_precision) {
+          Tensor2<float> in_tensor = Tensor2<float>::stretch_from(input_output_info.train_input[0]);
+          Tensor2<__half> out_tensor;
+          blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor);
+          output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]});
+          layers.emplace_back(new CastLayer(in_tensor, out_tensor, gpu_resource));
+        } else {
+          CK_THROW_(Error_t::WrongInput, "Cast supports half only");
+        }
+        break;
+      }
+
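+      // InnerProduct is the plain fully connected layer. In the fp16 path the
+      // FP32 master weights stay in weight_buff while weight_buff_half and
+      // wgrad_buff_half hold the half-precision copies consumed by
+      // FullyConnectedLayerHalf; the fp32 path uses FullyConnectedLayer directly.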
+      case Layer_t::InnerProduct: {
+        auto j_fc_param = get_json(j, "fc_param");
+        // establish initializer
+        std::vector<Initializer_t> initializer_types(2, Initializer_t::Default);
+        if (has_key_(j_fc_param, "weight_init")) {
+          const auto weight_init_name = get_value_from_json<std::string>(j_fc_param, "weight_init");
+          Initializer_t weight_init_type;
+          if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) {
+            CK_THROW_(Error_t::WrongInput, "No such initializer: " + weight_init_name);
+          } else {
+            initializer_types[0] = weight_init_type;
+          }
+        }
+        if (has_key_(j_fc_param, "bias_init")) {
+          const auto bias_init_name = get_value_from_json<std::string>(j_fc_param, "bias_init");
+          Initializer_t bias_init_type;
+          if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) {
+            CK_THROW_(Error_t::WrongInput, "No such initializer: " + bias_init_name);
+          } else {
+            initializer_types[1] = bias_init_type;
+          }
+        }
+
+        // establish out tensor
+        auto output = get_value_from_json<size_t>(j_fc_param, "num_output");
+
+        if (use_mixed_precision) {
+          Tensor2<__half> train_in_tensor =
+              Tensor2<__half>::stretch_from(input_output_info.train_input[0]);
+          Tensor2<__half> evaluate_in_tensor =
+              Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]);
+          Tensor2<__half> fc_out_tensor;
+          blobs_buff->reserve({train_in_tensor.get_dimensions()[0], output}, &fc_out_tensor);
+
+          // establish layer
+          layers.emplace_back(new FullyConnectedLayerHalf(
+              weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, train_in_tensor,
+              evaluate_in_tensor, fc_out_tensor, gpu_resource, initializer_types));
+          output_tensor_pairs.push_back({fc_out_tensor.shrink(), input_output_info.output[0]});
+        } else {
+          Tensor2<float> train_in_tensor =
+              Tensor2<float>::stretch_from(input_output_info.train_input[0]);
+          Tensor2<float> evaluate_in_tensor =
+              Tensor2<float>::stretch_from(input_output_info.evaluate_input[0]);
+          Tensor2<float> fc_out_tensor;
+          blobs_buff->reserve({train_in_tensor.get_dimensions()[0], output}, &fc_out_tensor);
+          // establish layer
+          layers.emplace_back(new FullyConnectedLayer(
+              weight_buff, wgrad_buff, train_in_tensor, evaluate_in_tensor, fc_out_tensor,
+              gpu_resource, use_mixed_precision, initializer_types));
+          output_tensor_pairs.push_back({fc_out_tensor.shrink(), input_output_info.output[0]});
+        }
+        break;
+      }
+
+      case Layer_t::Interaction: {
+        // a lambda template could be a better solution here, but that's not supported in C++11
+        if (use_mixed_precision) {
+          if (gpu_resource->get_cc_major() < 7) {
+            CK_THROW_(Error_t::WrongInput, "InteractionLayer<__half> is not supported in SM " +
+                                               std::to_string(gpu_resource->get_cc_major()) + "."
+ + std::to_string(gpu_resource->get_cc_minor())); + } + + Tensor2<__half> train_in_mlp_tensor = + Tensor2<__half>::stretch_from(input_output_info.train_input[0]); + Tensor2<__half> evaluate_in_mlp_tensor = + Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); + Tensor2<__half> train_in_emb_tensor = + Tensor2<__half>::stretch_from(input_output_info.train_input[1]); + Tensor2<__half> evaluate_in_emb_tensor = + Tensor2<__half>::stretch_from(input_output_info.evaluate_input[1]); + Tensor2<__half> out_tensor; + + layers.emplace_back(new InteractionLayer<__half>( + train_in_mlp_tensor, evaluate_in_mlp_tensor, train_in_emb_tensor, + evaluate_in_emb_tensor, out_tensor, + blobs_buff, // todo cannot use this blobs_buff here need half + gpu_resource, use_mixed_precision)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + + } else { + Tensor2 train_in_mlp_tensor = + Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 evaluate_in_mlp_tensor = + Tensor2::stretch_from(input_output_info.evaluate_input[0]); + Tensor2 train_emb_tensor = + Tensor2::stretch_from(input_output_info.train_input[1]); + Tensor2 evaluate_emb_tensor = + Tensor2::stretch_from(input_output_info.evaluate_input[1]); + Tensor2 out_tensor; + layers.emplace_back(new InteractionLayer( + train_in_mlp_tensor, evaluate_in_mlp_tensor, train_emb_tensor, evaluate_emb_tensor, + out_tensor, blobs_buff, gpu_resource, use_mixed_precision)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + } + + break; + } + case Layer_t::MultiCross: { + auto j_mc_param = get_json(j, "mc_param"); + // establish initializer + std::vector initializer_types(2, Initializer_t::Default); + if (has_key_(j_mc_param, "weight_init")) { + const auto weight_init_name = get_value_from_json(j_mc_param, "weight_init"); + Initializer_t weight_init_type; + if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such initializer: " + weight_init_name); + } else { + initializer_types[0] = weight_init_type; + } + } + if (has_key_(j_mc_param, "bias_init")) { + const auto bias_init_name = get_value_from_json(j_mc_param, "bias_init"); + Initializer_t bias_init_type; + if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such initializer: " + bias_init_name); + } else { + initializer_types[1] = bias_init_type; + } + } + + // establish out tensor + auto num_layers = get_value_from_json(j_mc_param, "num_layers"); + Tensor2 mc_in_tensor = + Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 out_tensor; + blobs_buff->reserve(mc_in_tensor.get_dimensions(), &out_tensor); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + // establish layer + layers.emplace_back(new MultiCrossLayer(weight_buff, wgrad_buff, blobs_buff, mc_in_tensor, + out_tensor, gpu_resource, num_layers, + initializer_types)); + break; + } + + case Layer_t::MultiCrossEntropyLoss: { + if (input_output_info.train_input.size() != 2) { + CK_THROW_(Error_t::WrongInput, "bottom of MultiCrossEntropyLoss must be two dim"); + } + + auto tweight = get_json(j, "target_weight"); + std::vector target_weight_vec; + for (auto tweight_tmp : tweight) { + float tweight_val = tweight_tmp.get(); + target_weight_vec.push_back(tweight_val); + } + + Tensor2 label_tensor = + Tensor2::stretch_from(input_output_info.train_input[1]); + blobs_buff->reserve({1, 1}, 
&loss_tensor); + + if (use_mixed_precision) { + Tensor2<__half> multi_cross_entropy_loss_in_tensor = + Tensor2<__half>::stretch_from(input_output_info.train_input[0]); + loss.reset(new MultiCrossEntropyLoss<__half>( + label_tensor, multi_cross_entropy_loss_in_tensor, loss_tensor, + create_regularizer(j, weight_buff->as_tensor(), wgrad_buff_half->as_tensor(), + multi_cross_entropy_loss_in_tensor.get_dimensions()[0], + gpu_resource), + target_weight_vec, gpu_resource, num_networks_in_global, scaler)); + } else { + Tensor2 multi_cross_entropy_loss_in_tensor = + Tensor2::stretch_from(input_output_info.train_input[0]); + loss.reset(new MultiCrossEntropyLoss( + label_tensor, multi_cross_entropy_loss_in_tensor, loss_tensor, + create_regularizer(j, weight_buff->as_tensor(), wgrad_buff->as_tensor(), + multi_cross_entropy_loss_in_tensor.get_dimensions()[0], + gpu_resource), + target_weight_vec, gpu_resource, num_networks_in_global, scaler)); + } + break; + } + case Layer_t::ReLU: { + if (use_mixed_precision) { + Tensor2<__half> relu_in_tensor = + Tensor2<__half>::stretch_from(input_output_info.train_input[0]); + Tensor2<__half> relu_out_tensor; + blobs_buff->reserve(relu_in_tensor.get_dimensions(), &relu_out_tensor); + layers.emplace_back(new ReluLayer<__half>(relu_in_tensor, relu_out_tensor, gpu_resource)); + output_tensor_pairs.push_back({relu_out_tensor.shrink(), input_output_info.output[0]}); + } else { + // establish out tensor + Tensor2 relu_in_tensor = + Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 relu_out_tensor; + blobs_buff->reserve(relu_in_tensor.get_dimensions(), &relu_out_tensor); + layers.emplace_back(new ReluLayer(relu_in_tensor, relu_out_tensor, gpu_resource)); + output_tensor_pairs.push_back({relu_out_tensor.shrink(), input_output_info.output[0]}); + } + + break; + } + case Layer_t::Reshape: { + auto selected_it = j.find("selected"); + // selective reshape + if (selected_it != j.end()) { + std::vector selected; + nlohmann::json j_selected = (selected_it.value()); + for (auto slot_obj : j_selected) { + int slot_id = slot_obj.get(); + if (slot_id < 0) CK_THROW_(Error_t::WrongInput, "slot_id < 0"); + selected.push_back(slot_id); + } + + if (use_mixed_precision) { + Tensor2<__half> in_tensor = + Tensor2<__half>::stretch_from(input_output_info.train_input[0]); + Tensor2<__half> out_tensor; + layers.emplace_back(new ReshapeLayer<__half>(in_tensor, out_tensor, blobs_buff, + selected, gpu_resource)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + } else { + Tensor2 in_tensor = + Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 out_tensor; + layers.emplace_back( + new ReshapeLayer(in_tensor, out_tensor, blobs_buff, selected, gpu_resource)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + } + } + // general purpose reshape + else { + auto leading_dim_it = j.find("leading_dim"); + + // if leading_dim is not specified, default leading_dim = n_slots * vector_length + + if (use_mixed_precision) { + Tensor2<__half> train_in_tensor = + Tensor2<__half>::stretch_from(input_output_info.train_input[0]); + Tensor2<__half> evaluate_in_tensor = + Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); + Tensor2<__half> out_tensor; + const auto& in_dims = train_in_tensor.get_dimensions(); + size_t leading_dim = (leading_dim_it != j.end()) + ? 
(*leading_dim_it).get() + : train_in_tensor.get_num_elements() / in_dims[0]; + layers.emplace_back(new ReshapeLayer<__half>(train_in_tensor, evaluate_in_tensor, + out_tensor, blobs_buff, leading_dim, + gpu_resource)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + } else { + Tensor2 train_in_tensor = + Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 evaluate_in_tensor = + Tensor2::stretch_from(input_output_info.evaluate_input[0]); + Tensor2 out_tensor; + const auto& in_dims = train_in_tensor.get_dimensions(); + size_t leading_dim = (leading_dim_it != j.end()) + ? (*leading_dim_it).get() + : train_in_tensor.get_num_elements() / in_dims[0]; + layers.emplace_back(new ReshapeLayer(train_in_tensor, evaluate_in_tensor, + out_tensor, blobs_buff, leading_dim, + gpu_resource)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + } + } + break; + } + case Layer_t::Slice: { + std::vector> ranges; + auto j_ranges = get_json(j, "ranges"); + assert(j_ranges.is_array()); + for (auto j_range : j_ranges) { + assert(j_range.is_array()); + ranges.emplace_back(std::make_pair(j_range[0].get(), j_range[1].get())); + } + + if (use_mixed_precision) { + Tensor2<__half> train_in_tensor = + Tensor2<__half>::stretch_from(input_output_info.train_input[0]); + Tensor2<__half> evaluate_in_tensor = + Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); + Tensors2<__half> out_tensors; + layers.emplace_back(new SliceLayer<__half>( + train_in_tensor, evaluate_in_tensor, out_tensors, blobs_buff, ranges, gpu_resource)); + for (size_t i = 0; i < out_tensors.size(); i++) { + output_tensor_pairs.push_back({out_tensors[i].shrink(), input_output_info.output[i]}); + } + } else { + Tensor2 train_in_tensor = + Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 evaluate_in_tensor = + Tensor2::stretch_from(input_output_info.evaluate_input[0]); + Tensors2 out_tensors; + layers.emplace_back(new SliceLayer(train_in_tensor, evaluate_in_tensor, + out_tensors, blobs_buff, ranges, gpu_resource)); + for (size_t i = 0; i < out_tensors.size(); i++) { + output_tensor_pairs.push_back({out_tensors[i].shrink(), input_output_info.output[i]}); + } + } + break; + } + case Layer_t::Multiply: { + std::vector weight_dims; + auto dims = get_json(j, "weight_dims"); + assert(dims.is_array()); + for (auto dim : dims) { + weight_dims.emplace_back(dim.get()); + } + + // establish initializer + std::vector initializer_types(1, Initializer_t::Default); + if (has_key_(j, "weight_init")) { + const auto weight_init_name = get_value_from_json(j, "weight_init"); + Initializer_t weight_init_type; + if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such initializer: " + weight_init_name); + } else { + initializer_types[0] = weight_init_type; + } + } + + Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 out_tensor; + layers.emplace_back(new MultiplyLayer(weight_buff, wgrad_buff, blobs_buff, in_tensor, + out_tensor, weight_dims, gpu_resource, + initializer_types)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + break; + } + case Layer_t::FmOrder2: { + auto out_dim = get_json(j, "out_dim").get(); + + Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 out_tensor; + blobs_buff->reserve({in_tensor.get_dimensions()[0], out_dim}, &out_tensor); + + layers.emplace_back(new 
FmOrder2Layer(in_tensor, out_tensor, gpu_resource)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + break; + } + case Layer_t::Add: { + if (use_mixed_precision) { + Tensors2<__half> in_tensors; + for (const auto& t : input_output_info.train_input) { + in_tensors.push_back(Tensor2<__half>::stretch_from(t)); + } + Tensor2<__half> out_tensor; + blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); + layers.emplace_back( + new AddLayer<__half>(in_tensors, out_tensor, blobs_buff, gpu_resource)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + } else { + Tensors2 in_tensors; + for (const auto& t : input_output_info.train_input) { + in_tensors.push_back(Tensor2::stretch_from(t)); + } + Tensor2 out_tensor; + blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); + layers.emplace_back( + new AddLayer(in_tensors, out_tensor, blobs_buff, gpu_resource)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + } + break; + } + case Layer_t::ReduceSum: { + int axis = get_json(j, "axis").get(); + + Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.train_input[0]); + Tensor2 out_tensor; + layers.emplace_back( + new ReduceSumLayer(in_tensor, out_tensor, blobs_buff, axis, gpu_resource)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + break; + } + case Layer_t::DotProduct: { + Tensors2 in_tensors; + for (const auto& t : input_output_info.train_input) { + in_tensors.push_back(Tensor2::stretch_from(t)); + } + Tensor2 out_tensor; + blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); + layers.emplace_back(new DotProductLayer(in_tensors, out_tensor, blobs_buff, gpu_resource)); + output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); + break; + } + default: + assert(!"Error: no such layer && should never get here!"); + } // end of switch + + if (!(layer_type == Layer_t::CrossEntropyLoss || + layer_type == Layer_t::BinaryCrossEntropyLoss || + layer_type == Layer_t::MultiCrossEntropyLoss)) { + for (auto& output_tensor_pair : output_tensor_pairs) { + add_tensor_to_network(output_tensor_pair, tensor_entries); + } + } else { + network->raw_metrics_[metrics::RawType::Loss] = loss_tensor.shrink(); + network->raw_metrics_[metrics::RawType::Pred] = input_output_info.evaluate_input[0]; + network->raw_metrics_[metrics::RawType::Label] = input_output_info.evaluate_input[1]; + } + } // for layers + + // create optimizer + auto opt_param = get_optimizer_param(j_optimizer); + + network->optimizer_ = std::move(Optimizer::Create( + opt_param, weight_buff->as_tensor(), wgrad_buff->as_tensor(), wgrad_buff_half->as_tensor(), + use_mixed_precision, scaler, blobs_buff, gpu_resource)); + + network->weight_tensor_ = weight_buff->as_tensor(); + network->wgrad_tensor_ = wgrad_buff->as_tensor(); + network->weight_tensor_half_ = weight_buff_half->as_tensor(); + network->wgrad_tensor_half_ = wgrad_buff_half->as_tensor(); + + CudaDeviceContext context(gpu_resource->get_device_id()); + blobs_buff->allocate(); + + return network.release(); +} template -void Parser::create_pipeline_internal(std::shared_ptr& data_reader, - std::shared_ptr& data_reader_eval, - std::vector>& embedding, - std::vector>& network, - const std::shared_ptr& resource_manager) { +static void parse_data_layer(const nlohmann::json& j, int& label_dim, int& dense_dim, + Check_t& check_type, std::string& source_data, + std::vector& 
data_reader_sparse_param_array, + std::string& eval_source, std::string& top_strs_label, + std::string& top_strs_dense, std::vector& sparse_names, + std::map>& sparse_input_map) { + source_data = get_value_from_json(j, "source"); + + auto j_label = get_json(j, "label"); + top_strs_label = get_value_from_json(j_label, "top"); + label_dim = get_value_from_json(j_label, "label_dim"); + + auto j_dense = get_json(j, "dense"); + top_strs_dense = get_value_from_json(j_dense, "top"); + dense_dim = get_value_from_json(j_dense, "dense_dim"); + + const std::map CHECK_TYPE_MAP = {{"Sum", Check_t::Sum}, + {"None", Check_t::None}}; + + const auto check_str = get_value_from_json(j, "check"); + if (!find_item_in_map(check_type, check_str, CHECK_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "Not supported check type: " + check_str); + } + + const std::map DATA_TYPE_MAP = { + {"DistributedSlot", DataReaderSparse_t::Distributed}, + {"LocalizedSlot", DataReaderSparse_t::Localized}, + }; + + auto j_sparse = get_json(j, "sparse"); + for (unsigned int i = 0; i < j_sparse.size(); i++) { + DataReaderSparseParam param; + + const nlohmann::json& js = j_sparse[i]; + const auto sparse_name = get_value_from_json(js, "top"); + const auto data_type_name = get_value_from_json(js, "type"); + if (!find_item_in_map(param.type, data_type_name, DATA_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "Not supported data type: " + data_type_name); + } + param.max_feature_num = get_value_from_json(js, "max_feature_num_per_sample"); + param.max_nnz = get_value_from_json_soft(js, "max_nnz", param.max_feature_num); + param.slot_num = get_value_from_json(js, "slot_num"); + data_reader_sparse_param_array.push_back(param); + SparseInput sparse_input(param.slot_num, param.max_feature_num); + sparse_input_map.emplace(sparse_name, sparse_input); + sparse_names.push_back(sparse_name); + } + FIND_AND_ASSIGN_STRING_KEY(eval_source, j); +} + +void parse_data_layer_helper(const nlohmann::json& j, int& label_dim, int& dense_dim, + Check_t& check_type, std::string& source_data, + std::vector& data_reader_sparse_param_array, + std::string& eval_source, std::string& top_strs_label, + std::string& top_strs_dense, std::vector& sparse_names, + std::map>& sparse_input_map) { + parse_data_layer(j, label_dim, dense_dim, check_type, source_data, data_reader_sparse_param_array, + eval_source, top_strs_label, top_strs_dense, sparse_names, sparse_input_map); +} + +template +static void create_embedding(std::map>& sparse_input_map, + std::vector* tensor_entries_list, + std::vector>& embedding, + Embedding_t embedding_type, const nlohmann::json& config, + const std::shared_ptr& resource_manager, + size_t batch_size, size_t batch_size_eval, bool use_mixed_precision, + float scaler, const nlohmann::json& j_layers) { + + auto j_optimizer = get_json(config, "optimizer"); + auto embedding_name = get_value_from_json(j_layers, "type"); + + auto bottom_name = get_value_from_json(j_layers, "bottom"); + auto top_name = get_value_from_json(j_layers, "top"); + + auto j_hparam = get_json(j_layers, "sparse_embedding_hparam"); + size_t max_vocabulary_size_per_gpu = 0; + if (embedding_type == Embedding_t::DistributedSlotSparseEmbeddingHash) { + max_vocabulary_size_per_gpu = + get_value_from_json(j_hparam, "max_vocabulary_size_per_gpu"); + } else if (embedding_type == Embedding_t::LocalizedSlotSparseEmbeddingHash) { + if (has_key_(j_hparam, "max_vocabulary_size_per_gpu")) { + max_vocabulary_size_per_gpu = + get_value_from_json(j_hparam, "max_vocabulary_size_per_gpu"); + } else if 
(!has_key_(j_hparam, "slot_size_array")) {
+      CK_THROW_(Error_t::WrongInput,
+                "No max_vocabulary_size_per_gpu or slot_size_array in: " + embedding_name);
+    }
+  }
+  auto embedding_vec_size = get_value_from_json<size_t>(j_hparam, "embedding_vec_size");
+  auto combiner = get_value_from_json<int>(j_hparam, "combiner");
+
+  SparseInput<TypeKey> sparse_input;
+  if (!find_item_in_map(sparse_input, bottom_name, sparse_input_map)) {
+    CK_THROW_(Error_t::WrongInput, "Cannot find bottom");
+  }
+
+  OptParams<TypeFP> embedding_opt_params;
+  if (has_key_(j_layers, "optimizer")) {
+    embedding_opt_params = get_optimizer_param<TypeFP>(get_json(j_layers, "optimizer"));
+  } else {
+    embedding_opt_params = get_optimizer_param<TypeFP>(j_optimizer);
+  }
+  embedding_opt_params.scaler = scaler;
+
+  switch (embedding_type) {
+    case Embedding_t::DistributedSlotSparseEmbeddingHash: {
+      const SparseEmbeddingHashParams<TypeFP> embedding_params = {
+          batch_size,
+          batch_size_eval,
+          max_vocabulary_size_per_gpu,
+          {},
+          embedding_vec_size,
+          sparse_input.max_feature_num_per_sample,
+          sparse_input.slot_num,
+          combiner,  // combiner: 0-sum, 1-mean
+          embedding_opt_params};
+
+      embedding.emplace_back(new DistributedSlotSparseEmbeddingHash<TypeKey, TypeFP>(
+          sparse_input.train_row_offsets, sparse_input.train_values, sparse_input.train_nnz,
+          sparse_input.evaluate_row_offsets, sparse_input.evaluate_values,
+          sparse_input.evaluate_nnz, embedding_params, resource_manager));
+      break;
+    }
+    case Embedding_t::LocalizedSlotSparseEmbeddingHash: {
+#ifndef NCCL_A2A
+
+      auto j_plan = get_json(j_layers, "plan_file");
+      std::string plan_file;
+      if (j_plan.is_array()) {
+        int num_nodes = j_plan.size();
+        if (num_nodes != resource_manager->get_num_process()) {
+          CK_THROW_(Error_t::WrongInput, "num_nodes != num_procs");
+        }
+        plan_file = j_plan[resource_manager->get_process_id()].get<std::string>();
+      } else {
+        if (resource_manager->get_num_process() > 1) {
+          CK_THROW_(Error_t::WrongInput, "num_procs > 1");
+        }
+        plan_file = get_value_from_json<std::string>(j_layers, "plan_file");
+      }
+
+      std::ifstream ifs(plan_file);
+      if (!ifs) {
+        CK_THROW_(Error_t::WrongInput, "plan file " + plan_file + " cannot be opened");
+      }
+#else
+      std::string plan_file = "";
+#endif
+      std::vector<size_t> slot_size_array;
+      if (has_key_(j_hparam, "slot_size_array")) {
+        auto slots = get_json(j_hparam, "slot_size_array");
+        assert(slots.is_array());
+        for (auto slot : slots) {
+          slot_size_array.emplace_back(slot.get<size_t>());
+        }
+      }
+
+      const SparseEmbeddingHashParams<TypeFP> embedding_params = {
+          batch_size,
+          batch_size_eval,
+          max_vocabulary_size_per_gpu,
+          slot_size_array,
+          embedding_vec_size,
+          sparse_input.max_feature_num_per_sample,
+          sparse_input.slot_num,
+          combiner,  // combiner: 0-sum, 1-mean
+          embedding_opt_params};
+
+      embedding.emplace_back(new LocalizedSlotSparseEmbeddingHash<TypeKey, TypeFP>(
+          sparse_input.train_row_offsets, sparse_input.train_values, sparse_input.train_nnz,
+          sparse_input.evaluate_row_offsets, sparse_input.evaluate_values,
+          sparse_input.evaluate_nnz, embedding_params, plan_file, resource_manager));
+
+      break;
+    }
+    case Embedding_t::LocalizedSlotSparseEmbeddingOneHot: {
+      std::string plan_file = "";
+      std::vector<size_t> slot_size_array;
+      auto slots = get_json(j_hparam, "slot_size_array");
+      assert(slots.is_array());
+      for (auto slot : slots) {
+        slot_size_array.emplace_back(slot.get<size_t>());
+      }
+
+      const SparseEmbeddingHashParams<TypeFP> embedding_params = {
+          batch_size,
+          batch_size_eval,
+          0,
+          slot_size_array,
+          embedding_vec_size,
+          sparse_input.max_feature_num_per_sample,
+          sparse_input.slot_num,
+          combiner,  // combiner: 0-sum, 1-mean
+          embedding_opt_params};
+
+
embedding.emplace_back(new LocalizedSlotSparseEmbeddingOneHot( + sparse_input.train_row_offsets, sparse_input.train_values, sparse_input.train_nnz, + sparse_input.evaluate_row_offsets, sparse_input.evaluate_values, + sparse_input.evaluate_nnz, embedding_params, plan_file, resource_manager)); + + break; + } + } // switch + for (size_t i = 0; i < resource_manager->get_local_gpu_count(); i++) { + tensor_entries_list[i].push_back( + {top_name, TensorUse::Train, (embedding.back()->get_train_output_tensors())[i]}); + tensor_entries_list[i].push_back( + {top_name, TensorUse::Evaluate, (embedding.back()->get_evaluate_output_tensors())[i]}); + } +} + + +template +static void create_pipeline_internal(std::shared_ptr& data_reader, + std::shared_ptr& data_reader_eval, + std::vector>& embedding, + std::vector>& network, + const std::shared_ptr& resource_manager, + Parser& parser) { try { - const nlohmann::json& config = config_; - size_t batch_size = batch_size_; - size_t batch_size_eval = batch_size_eval_; - bool use_mixed_precision = use_mixed_precision_; - float scaler = scaler_; - bool use_algorithm_search = use_algorithm_search_; - bool use_cuda_graph = use_cuda_graph_; + nlohmann::json config = parser.config_; + size_t batch_size = parser.batch_size_; + size_t batch_size_eval = parser.batch_size_eval_; + bool use_mixed_precision = parser.use_mixed_precision_; + float scaler = parser.scaler_; + bool use_algorithm_search = parser.use_algorithm_search_; + bool use_cuda_graph = parser.use_cuda_graph_; std::map> sparse_input_map; std::vector tensor_entries_list[resource_manager->get_local_gpu_count()]; @@ -116,9 +1212,177 @@ void Parser::create_pipeline_internal(std::shared_ptr& data_reader, // Create Data Reader { const nlohmann::json& j = j_layers_array[0]; - create_datareader()(j, sparse_input_map, tensor_entries_list, data_reader, - data_reader_eval, batch_size, batch_size_, use_mixed_precision, - repeat_dataset_, resource_manager); + const auto layer_type_name = get_value_from_json(j, "type"); + if (layer_type_name.compare("Data") != 0) { + CK_THROW_(Error_t::WrongInput, "the first layer is not Data layer:" + layer_type_name); + } + + const std::map DATA_READER_MAP = { + {"Norm", DataReaderType_t::Norm}, + {"Raw", DataReaderType_t::Raw}, + {"Parquet", DataReaderType_t::Parquet}}; + + DataReaderType_t format = DataReaderType_t::Norm; + if (has_key_(j, "format")) { + const auto data_format_name = get_value_from_json(j, "format"); + if (!find_item_in_map(format, data_format_name, DATA_READER_MAP)) { + CK_THROW_(Error_t::WrongInput, "No such data format: " + data_format_name); + } + } + + auto cache_eval_data = get_value_from_json_soft(j, "cache_eval_data", 0); + + std::string source_data = get_value_from_json(j, "source"); + + auto j_label = get_json(j, "label"); + auto top_strs_label = get_value_from_json(j_label, "top"); + auto label_dim = get_value_from_json(j_label, "label_dim"); + + auto j_dense = get_json(j, "dense"); + auto top_strs_dense = get_value_from_json(j_dense, "top"); + auto dense_dim = get_value_from_json(j_dense, "dense_dim"); + + const std::map CHECK_TYPE_MAP = {{"Sum", Check_t::Sum}, + {"None", Check_t::None}}; + + Check_t check_type; + const auto check_str = get_value_from_json(j, "check"); + if (!find_item_in_map(check_type, check_str, CHECK_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "Not supported check type: " + check_str); + } + + std::vector data_reader_sparse_param_array; + + const std::map DATA_TYPE_MAP = { + {"DistributedSlot", DataReaderSparse_t::Distributed}, + 
{"LocalizedSlot", DataReaderSparse_t::Localized}, + }; + + auto j_sparse = get_json(j, "sparse"); + std::vector sparse_names; + + for (unsigned int i = 0; i < j_sparse.size(); i++) { + DataReaderSparseParam param; + + const nlohmann::json& js = j_sparse[i]; + const auto sparse_name = get_value_from_json(js, "top"); + const auto data_type_name = get_value_from_json(js, "type"); + if (!find_item_in_map(param.type, data_type_name, DATA_TYPE_MAP)) { + CK_THROW_(Error_t::WrongInput, "Not supported data type: " + data_type_name); + } + param.max_feature_num = get_value_from_json(js, "max_feature_num_per_sample"); + param.max_nnz = get_value_from_json_soft(js, "max_nnz", param.max_feature_num); + param.slot_num = get_value_from_json(js, "slot_num"); + data_reader_sparse_param_array.push_back(param); + SparseInput sparse_input(param.slot_num, param.max_feature_num); + sparse_input_map.emplace(sparse_name, sparse_input); + sparse_names.push_back(sparse_name); + } + + data_reader_eval = nullptr; + std::string eval_source; + FIND_AND_ASSIGN_STRING_KEY(eval_source, j); + +#ifdef VAL + const int NUM_THREADS = 1; +#else + const int NUM_THREADS = + format == DataReaderType_t::Parquet ? resource_manager->get_local_gpu_count() : 12; +#endif + + DataReader *data_reader_tk = new DataReader( + batch_size, label_dim, dense_dim, + data_reader_sparse_param_array, + resource_manager, + parser.repeat_dataset_, + NUM_THREADS, use_mixed_precision, false); + data_reader.reset(data_reader_tk); + DataReader *data_reader_eval_tk = new DataReader( + batch_size_eval, label_dim, dense_dim, + data_reader_sparse_param_array, + resource_manager, + parser.repeat_dataset_, + NUM_THREADS, use_mixed_precision, cache_eval_data); + data_reader_eval.reset(data_reader_eval_tk); + + auto f = [&j]() -> std::vector { + std::vector slot_offset; + if (has_key_(j, "slot_size_array")) { + auto slot_size_array = get_json(j, "slot_size_array"); + if (!slot_size_array.is_array()) { + CK_THROW_(Error_t::WrongInput, "!slot_size_array.is_array()"); + } + long long slot_sum = 0; + for (auto j_slot_size : slot_size_array) { + slot_offset.push_back(slot_sum); + long long slot_size = j_slot_size.get(); + slot_sum += slot_size; + } + MESSAGE_("Vocabulary size: " + std::to_string(slot_sum)); + } + return slot_offset; + }; + + switch (format) { + case DataReaderType_t::Norm: { + bool start_right_now = parser.repeat_dataset_; + data_reader->create_drwg_norm( + source_data, check_type, start_right_now); + data_reader_eval->create_drwg_norm( + eval_source, check_type, start_right_now); + break; + } + case DataReaderType_t::Raw: { + const auto num_samples = get_value_from_json(j, "num_samples"); + const auto eval_num_samples = get_value_from_json(j, "eval_num_samples"); + std::vector slot_offset = f(); + bool float_label_dense = get_value_from_json_soft(j, "float_label_dense", false); + data_reader->create_drwg_raw(source_data, num_samples, slot_offset, float_label_dense, + true, false); + data_reader_eval->create_drwg_raw(eval_source, eval_num_samples, slot_offset, + float_label_dense, false, false); + + break; + } + case DataReaderType_t::Parquet: { + // @Future: Should be slot_offset here and data_reader ctor should + // be TypeKey not long long + std::vector slot_offset = f(); + data_reader->create_drwg_parquet(source_data, slot_offset, true); + data_reader_eval->create_drwg_parquet(eval_source, slot_offset, true); + break; + } + default: { assert(!"Error: no such option && should never get here!"); } + } + + for (size_t i = 0; i < 
+      for (size_t i = 0; i < resource_manager->get_local_gpu_count(); i++) {
+        tensor_entries_list[i].push_back(
+            {top_strs_label, TensorUse::Train, data_reader_tk->get_label_tensors()[i].shrink()});
+        tensor_entries_list[i].push_back({top_strs_label, TensorUse::Evaluate,
+                                          data_reader_eval_tk->get_label_tensors()[i].shrink()});
+
+        // Dense tensors are registered identically for fp16 and fp32 runs; the
+        // reader hands back the same tensor list either way, so no
+        // use_mixed_precision branch is needed here.
+        tensor_entries_list[i].push_back(
+            {top_strs_dense, TensorUse::Train, data_reader_tk->get_dense_tensors()[i]});
+        tensor_entries_list[i].push_back(
+            {top_strs_dense, TensorUse::Evaluate, data_reader_eval_tk->get_dense_tensors()[i]});
+      }
+
+      for (unsigned int i = 0; i < j_sparse.size(); i++) {
+        const auto& sparse_input = sparse_input_map.find(sparse_names[i]);
+        sparse_input->second.train_row_offsets = data_reader_tk->get_row_offsets_tensors(i);
+        sparse_input->second.train_values = data_reader_tk->get_value_tensors(i);
+        sparse_input->second.train_nnz = data_reader_tk->get_nnz_array(i);
+        sparse_input->second.evaluate_row_offsets = data_reader_eval_tk->get_row_offsets_tensors(i);
+        sparse_input->second.evaluate_values = data_reader_eval_tk->get_value_tensors(i);
+        sparse_input->second.evaluate_nnz = data_reader_eval_tk->get_nnz_array(i);
+      }
     }
 
     // Create Embedding
@@ -138,13 +1402,13 @@ void Parser::create_pipeline_internal(std::shared_ptr<IDataReader>& data_reader,
       }
 
       if (use_mixed_precision) {
-        create_embedding<TypeKey, __half>()(
-            sparse_input_map, tensor_entries_list, embedding, embedding_type, config,
-            resource_manager, batch_size, batch_size_eval, use_mixed_precision, scaler, j);
+        create_embedding<TypeKey, __half>(sparse_input_map, tensor_entries_list, embedding,
+                                          embedding_type, config, resource_manager, batch_size,
+                                          batch_size_eval, use_mixed_precision, scaler, j);
       } else {
-        create_embedding<TypeKey, float>()(sparse_input_map, tensor_entries_list, embedding,
-                                           embedding_type, config, resource_manager, batch_size,
-                                           batch_size_eval, use_mixed_precision, scaler, j);
+        create_embedding<TypeKey, float>(sparse_input_map, tensor_entries_list, embedding,
+                                         embedding_type, config, resource_manager, batch_size,
+                                         batch_size_eval, use_mixed_precision, scaler, j);
       }
     }  // for ()
   }  // Create Embedding
@@ -155,10 +1419,10 @@ void Parser::create_pipeline_internal(std::shared_ptr<IDataReader>& data_reader,
       CK_THROW_(Error_t::WrongInput, "0 != batch_size\%total_gpu_count");
     }
     for (size_t i = 0; i < resource_manager->get_local_gpu_count(); i++) {
-      network.emplace_back(Network::create_network(
-          j_layers_array, j_optimizer, tensor_entries_list[i], total_gpu_count,
-          resource_manager->get_local_cpu(), resource_manager->get_local_gpu(i),
-          use_mixed_precision, scaler, use_algorithm_search, use_cuda_graph, false));
+      network.emplace_back(create_network(j_layers_array, j_optimizer, tensor_entries_list[i],
+                                          total_gpu_count, resource_manager->get_local_cpu(),
+                                          resource_manager->get_local_gpu(i), use_mixed_precision,
+                                          scaler, use_algorithm_search, use_cuda_graph));
     }
   }
 
@@ -166,8 +1430,10 @@ void Parser::create_pipeline_internal(std::shared_ptr<IDataReader>& data_reader,
     std::cerr << rt_err.what() << std::endl;
     throw;
   }
+
 }
 
+
 void Parser::create_pipeline(std::shared_ptr<IDataReader>& data_reader,
                              std::shared_ptr<IDataReader>& data_reader_eval,
                              std::vector<std::shared_ptr<IEmbedding>>& embedding,
@@ -175,60 +1441,12 @@ void Parser::create_pipeline(std::shared_ptr<IDataReader>& data_reader,
                              const std::shared_ptr<ResourceManager>& resource_manager) {
   if (i64_input_key_) {
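+    // Runtime flag -> compile-time key type: each branch instantiates
+    // create_pipeline_internal for one key width (long long vs. unsigned int),
+    // so the readers and embeddings it builds never branch on key type again.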
     create_pipeline_internal<long long>(data_reader, data_reader_eval, embedding, network,
-                                        resource_manager);
+                                        resource_manager, *this);
   } else {
     create_pipeline_internal<unsigned int>(data_reader, data_reader_eval, embedding, network,
-                                           resource_manager);
+                                           resource_manager, *this);
   }
 }
 
-template <typename TypeEmbeddingComp>
-void Parser::create_pipeline_inference(const InferenceParser& inference_parser, Tensor2<float>& dense_input,
-                                       std::vector<std::shared_ptr<Tensor2<int>>>& rows,
-                                       std::vector<std::shared_ptr<Tensor2<float>>>& embeddingvecs,
-                                       std::vector<size_t>& embedding_table_slot_size,
-                                       std::vector<std::shared_ptr<IEmbedding>>* embeddings,
-                                       Network** network,
-                                       const std::shared_ptr<ResourceManager> resource_manager) {
-  //std::vector<TensorEntry> tensor_entries;
-
-  auto j_layers_array = get_json(config_, "layers");
-
-  auto input_buffer = GeneralBuffer2<CudaAllocator>::create();
-
-  {
-    const nlohmann::json& j_data = j_layers_array[0];
-    auto j_dense = get_json(j_data, "dense");
-    auto top_strs_dense = get_value_from_json<std::string>(j_dense, "top");
-    auto dense_dim = get_value_from_json<size_t>(j_dense, "dense_dim");
-
-    input_buffer->reserve({inference_parser.max_batchsize, dense_dim}, &dense_input);
-    tensor_entries.push_back({top_strs_dense, TensorUse::General, dense_input.shrink()});
-  }
-
-  create_embedding<TypeEmbeddingComp>()(inference_parser, j_layers_array, rows, embeddingvecs,
-                                        embedding_table_slot_size, &tensor_entries,
-                                        embeddings, resource_manager->get_local_gpu(0), input_buffer);
-  input_buffer->allocate();
-
-  //create network
-  *network = Network::create_network(
-      j_layers_array, "", tensor_entries, 1, resource_manager->get_local_cpu(),
-      resource_manager->get_local_gpu(0), inference_parser.use_mixed_precision,
-      inference_parser.scaler, false, inference_parser.use_cuda_graph, true);
-}
-
-void Parser::create_pipeline(const InferenceParser& inference_parser, Tensor2<float>& dense_input,
-                             std::vector<std::shared_ptr<Tensor2<int>>>& rows,
-                             std::vector<std::shared_ptr<Tensor2<float>>>& embeddingvecs,
-                             std::vector<size_t>& embedding_table_slot_size,
-                             std::vector<std::shared_ptr<IEmbedding>>* embeddings, Network** network,
-                             const std::shared_ptr<ResourceManager> resource_manager) {
-  if (inference_parser.use_mixed_precision) {
-    create_pipeline_inference<__half>(inference_parser, dense_input, rows, embeddingvecs,
-                                      embedding_table_slot_size, embeddings, network,
-                                      resource_manager);
-  } else {
-    create_pipeline_inference<float>(inference_parser, dense_input, rows, embeddingvecs,
-                                     embedding_table_slot_size, embeddings, network,
-                                     resource_manager);
-  }
-}
 
 }  // namespace HugeCTR
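
For reviewers, a minimal call sequence for the surviving train-only entry point.
This is a hypothetical driver sketch: only the create_pipeline signature is
taken from this patch, and the `parser` / `resource_manager` objects are assumed
to have been constructed elsewhere.

    // Out-parameters filled in by the parser.
    std::shared_ptr<IDataReader> data_reader;
    std::shared_ptr<IDataReader> data_reader_eval;
    std::vector<std::shared_ptr<IEmbedding>> embeddings;
    std::vector<std::unique_ptr<Network>> networks;

    // Builds the data readers, embeddings, and one Network per local GPU.
    parser.create_pipeline(data_reader, data_reader_eval, embeddings, networks,
                           resource_manager);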