diff --git a/HugeCTR/include/inference/embedding_interface.hpp b/HugeCTR/include/inference/embedding_interface.hpp index c1dda191f8..8f42cfd262 100644 --- a/HugeCTR/include/inference/embedding_interface.hpp +++ b/HugeCTR/include/inference/embedding_interface.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include namespace HugeCTR { diff --git a/HugeCTR/include/parser.hpp b/HugeCTR/include/parser.hpp index 5a28f6a5d0..006cc874d7 100644 --- a/HugeCTR/include/parser.hpp +++ b/HugeCTR/include/parser.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,45 @@ namespace HugeCTR { +nlohmann::json read_json_file(const std::string& filename); + +struct SolverParser { + // std::string configure_file; + unsigned long long seed; /**< seed of data simulator */ + LrPolicy_t lr_policy; /**< the only fixed lr is supported now. */ + int display; /**< the interval of loss display. */ + int max_iter; /**< the number of iterations for training */ + int num_epochs; /**< the number of epochs for training */ + int snapshot; /**< the number of iterations for a snapshot */ + std::string snapshot_prefix; /**< naming prefix of snapshot file */ + int eval_interval; /**< the interval of evaluations */ + int eval_batches; /**< the number of batches for evaluations */ + int batchsize_eval; /**< batchsize for eval */ + int batchsize; /**< batchsize */ + std::string model_file; /**< name of model file */ + std::vector embedding_files; /**< name of embedding file */ + std::vector> vvgpu; /**< device map */ + bool use_mixed_precision; + float scaler; + std::map metrics_spec; + bool i64_input_key; + bool use_algorithm_search; + bool use_cuda_graph; + SolverParser(const std::string& file); + SolverParser() {} +}; +struct InferenceParser { + // std::string configure_file; + size_t max_batchsize; /**< batchsize */ + std::string dense_model_file; /**< name of model file */ + std::vector sparse_model_files; /**< name of embedding file */ + bool use_mixed_precision; + float scaler; + bool use_algorithm_search; + bool use_cuda_graph; + InferenceParser(const nlohmann::json& config); +}; + /** * @brief The parser of configure file (in json format). * @@ -51,42 +91,37 @@ class Parser { const bool use_algorithm_search_; const bool use_cuda_graph_; + template + void create_pipeline_internal(std::shared_ptr& data_reader, + std::shared_ptr& data_reader_eval, + std::vector>& embedding, + std::vector>& network, + const std::shared_ptr& resource_manager); + + template + void create_pipeline_inference(const InferenceParser& inference_parser, + Tensor2& dense_input, + std::vector>>& rows, + std::vector>>& embeddingvecs, + std::vector& embedding_table_slot_size, + std::vector>* embedding, Network** network, + const std::shared_ptr resource_manager); + public: + std::vector tensor_entries; /** * Ctor. * Ctor only verify the configure file, doesn't create pipeline. */ + Parser(const std::string& configure_file, size_t batch_size, size_t batch_size_eval, + bool repeat_dataset, bool i64_input_key = false, bool use_mixed_precision = false, + float scaler = 1.0f, bool use_algorithm_search = true, bool use_cuda_graph = true); - Parser(const std::string& configure_file, - size_t batch_size, - size_t batch_size_eval, - bool repeat_dataset, - bool i64_input_key = false, - bool use_mixed_precision = false, - float scaler = 1.0f, - bool use_algorithm_search = true, - bool use_cuda_graph = true) - : batch_size_(batch_size), - batch_size_eval_(batch_size_eval), - repeat_dataset_(repeat_dataset), - i64_input_key_(i64_input_key), - use_mixed_precision_(use_mixed_precision), - scaler_(scaler), - use_algorithm_search_(use_algorithm_search), - use_cuda_graph_(use_cuda_graph) { - try { - std::ifstream file(configure_file); - if (!file.is_open()) { - CK_THROW_(Error_t::FileCannotOpen, "file.is_open() failed: " + configure_file); - } - file >> config_; - file.close(); - } catch (const std::runtime_error& rt_err) { - std::cerr << rt_err.what() << std::endl; - throw; - } - return; - } + /** + * Ctor. + * Ctor used in inference stage + */ + Parser(const nlohmann::json& config); /** * Create the pipeline, which includes data reader, embedding. @@ -97,14 +132,15 @@ class Parser { std::vector>& network, const std::shared_ptr& resource_manager); - template - friend void create_pipeline_internal(std::shared_ptr& data_reader, - std::shared_ptr& data_reader_eval, - std::vector>& embedding, - std::vector>& network, - const std::shared_ptr& resource_manager, - Parser& parser); - + /** + * Create inference pipeline, which only creates network and embedding + */ + void create_pipeline(const InferenceParser& inference_parser, Tensor2& dense_input, + std::vector>>& row, + std::vector>>& embeddingvec, + std::vector& embedding_table_slot_size, + std::vector>* embedding, Network** network, + const std::shared_ptr resource_manager); }; std::unique_ptr get_learning_rate_scheduler( @@ -114,32 +150,6 @@ std::unique_ptr get_learning_rate_scheduler( * Solver Parser. * This class is designed to parse the solver clause of the configure file. */ -struct SolverParser { - // std::string configure_file; - unsigned long long seed; /**< seed of data simulator */ - LrPolicy_t lr_policy; /**< the only fixed lr is supported now. */ - int display; /**< the interval of loss display. */ - int max_iter; /**< the number of iterations for training */ - int num_epochs; /**< the number of epochs for training */ - int snapshot; /**< the number of iterations for a snapshot */ - std::string snapshot_prefix; /**< naming prefix of snapshot file */ - int eval_interval; /**< the interval of evaluations */ - int eval_batches; /**< the number of batches for evaluations */ - int batchsize_eval; /**< batchsize for eval */ - int batchsize; /**< batchsize */ - std::string model_file; /**< name of model file */ - std::vector embedding_files; /**< name of embedding file */ - std::vector> vvgpu; /**< device map */ - bool use_mixed_precision; - float scaler; - std::map metrics_spec; - bool i64_input_key; - bool use_algorithm_search; - bool use_cuda_graph; - SolverParser(const std::string& file); - SolverParser(){} -}; - template struct SparseInput { @@ -186,6 +196,49 @@ struct SparseInput { } \ } while (0) +const std::map LAYER_TYPE_MAP = { + {"BatchNorm", Layer_t::BatchNorm}, + {"BinaryCrossEntropyLoss", Layer_t::BinaryCrossEntropyLoss}, + {"Concat", Layer_t::Concat}, + {"CrossEntropyLoss", Layer_t::CrossEntropyLoss}, + {"Dropout", Layer_t::Dropout}, + {"ELU", Layer_t::ELU}, + {"InnerProduct", Layer_t::InnerProduct}, + {"Interaction", Layer_t::Interaction}, + {"MultiCrossEntropyLoss", Layer_t::MultiCrossEntropyLoss}, + {"ReLU", Layer_t::ReLU}, + {"Reshape", Layer_t::Reshape}, + {"Sigmoid", Layer_t::Sigmoid}, + {"Slice", Layer_t::Slice}, + {"Multiply", Layer_t::Multiply}, + {"FmOrder2", Layer_t::FmOrder2}, + {"Add", Layer_t::Add}, + {"ReduceSum", Layer_t::ReduceSum}, + {"MultiCross", Layer_t::MultiCross}, + {"DotProduct", Layer_t::DotProduct}}; +const std::map LAYER_TYPE_MAP_MP = { + {"BinaryCrossEntropyLoss", Layer_t::BinaryCrossEntropyLoss}, + {"Concat", Layer_t::Concat}, + {"Cast", Layer_t::Cast}, + {"InnerProduct", Layer_t::InnerProduct}, + {"FusedInnerProduct", Layer_t::FusedInnerProduct}, + {"Interaction", Layer_t::Interaction}, + {"Reshape", Layer_t::Reshape}, + {"Sigmoid", Layer_t::Sigmoid}, + {"Slice", Layer_t::Slice}, + {"ReLU", Layer_t::ReLU}, + {"Dropout", Layer_t::Dropout}, + {"Add", Layer_t::Add}}; +const std::map EMBEDDING_TYPE_MAP = { + {"DistributedSlotSparseEmbeddingHash", Embedding_t::DistributedSlotSparseEmbeddingHash}, + {"LocalizedSlotSparseEmbeddingHash", Embedding_t::LocalizedSlotSparseEmbeddingHash}, + {"LocalizedSlotSparseEmbeddingOneHot", Embedding_t::LocalizedSlotSparseEmbeddingOneHot}}; +const std::map INITIALIZER_TYPE_MAP = { + {"Uniform", Initializer_t::Uniform}, + {"XavierNorm", Initializer_t::XavierNorm}, + {"XavierUniform", Initializer_t::XavierUniform}, + {"Zero", Initializer_t::Zero}}; + static const std::map OPTIMIZER_TYPE_MAP = { {"Adam", Optimizer_t::Adam}, {"MomentumSGD", Optimizer_t::MomentumSGD}, @@ -193,9 +246,7 @@ static const std::map OPTIMIZER_TYPE_MAP = { {"SGD", Optimizer_t::SGD}}; static const std::map UPDATE_TYPE_MAP = { - {"Local", Update_t::Local}, - {"Global", Update_t::Global}, - {"LazyGlobal", Update_t::LazyGlobal}}; + {"Local", Update_t::Local}, {"Global", Update_t::Global}, {"LazyGlobal", Update_t::LazyGlobal}}; static const std::map REGULARIZER_TYPE_MAP = { {"L1", Regularizer_t::L1}, @@ -235,11 +286,40 @@ inline T get_value_from_json_soft(const nlohmann::json& json, const std::string } } -void parse_data_layer_helper(const nlohmann::json& j, int& label_dim, int& dense_dim, - Check_t& check_type, std::string& source_data, - std::vector& data_reader_sparse_param_array, - std::string& eval_source, std::string& top_strs_label, - std::string& top_strs_dense, std::vector& sparse_names, - std::map>& sparse_input_map); +template +struct get_optimizer_param { + OptParams operator()(const nlohmann::json& j_optimizer); +}; + +template +struct create_embedding { + void operator()(std::map>& sparse_input_map, + std::vector* tensor_entries_list, + std::vector>& embedding, Embedding_t embedding_type, + const nlohmann::json& config, + const std::shared_ptr& resource_manager, size_t batch_size, + size_t batch_size_eval, bool use_mixed_precision, float scaler, + const nlohmann::json& j_layers); + + void operator()(const InferenceParser& inference_parser, const nlohmann::json& j_layers_array, + std::vector>>& rows, + std::vector>>& embeddingvecs, + std::vector& embedding_table_slot_size, + std::vector* tensor_entries, + std::vector>* embeddings, + const std::shared_ptr gpu_resource, + std::shared_ptr>& blobs_buff); +}; + +template +struct create_datareader { + void operator()(const nlohmann::json& j, + std::map>& sparse_input_map, + std::vector* tensor_entries_list, + std::shared_ptr& data_reader, + std::shared_ptr& data_reader_eval, size_t batch_size, + size_t batch_size_eval, bool use_mixed_precision, bool repeat_dataset, + const std::shared_ptr resource_manager); +}; } // namespace HugeCTR diff --git a/HugeCTR/src/inference/embedding_interface.cpp b/HugeCTR/src/inference/embedding_interface.cpp index b500ed6d7b..2bc3fe0d97 100644 --- a/HugeCTR/src/inference/embedding_interface.cpp +++ b/HugeCTR/src/inference/embedding_interface.cpp @@ -15,6 +15,7 @@ */ #include +#include namespace HugeCTR{ @@ -39,13 +40,13 @@ embedding_interface* embedding_interface::Create_Embedding_Cache(HugectrUtility< return embedding_cache; } -template embedding_interface* embedding_interface::Create_Embedding_Cache(HugectrUtility* +template embedding_interface* embedding_interface::Create_Embedding_Cache(HugectrUtility*, int, bool, float, const std::string&, const std::string&); -template embedding_interface* embedding_interface::Create_Embedding_Cache(HugectrUtility* +template embedding_interface* embedding_interface::Create_Embedding_Cache(HugectrUtility*, int, bool, float, diff --git a/HugeCTR/src/inference/gpu_cache/nv_gpu_cache.cu b/HugeCTR/src/inference/gpu_cache/nv_gpu_cache.cu index 5feb7872da..4a57398a24 100644 --- a/HugeCTR/src/inference/gpu_cache/nv_gpu_cache.cu +++ b/HugeCTR/src/inference/gpu_cache/nv_gpu_cache.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "" +#include // Overload CUDA atomic for other 64bit unsinged/signed integer type __forceinline__ diff --git a/HugeCTR/src/inference/inference_utilis.cpp b/HugeCTR/src/inference/inference_utilis.cpp index 0f5e5c41e9..03f1b98175 100644 --- a/HugeCTR/src/inference/inference_utilis.cpp +++ b/HugeCTR/src/inference/inference_utilis.cpp @@ -15,6 +15,7 @@ */ #include +#include namespace HugeCTR { template diff --git a/HugeCTR/src/inference/parameter_server.cpp b/HugeCTR/src/inference/parameter_server.cpp index 71dc289c32..af2d97b935 100644 --- a/HugeCTR/src/inference/parameter_server.cpp +++ b/HugeCTR/src/inference/parameter_server.cpp @@ -84,7 +84,7 @@ parameter_server::parameter_server(const std::string& framework_nam if(ps_config_.distributed_emb_.size() != model_config_path.size() || ps_config_.embedding_vec_size_.size() != model_config_path.size() || - ps_config_.default_emb_vec_value_.size() != model_config_path.size())){ + ps_config_.default_emb_vec_value_.size() != model_config_path.size()){ CK_THROW_(Error_t::WrongInput, "Wrong input: The size of parameter server parameters are not correct."); } @@ -139,7 +139,7 @@ parameter_server::parameter_server(const std::string& framework_nam for(size_t pair = 0; pair < row_num; pair++){ // Read out the emb_id, slot_id and emb_vec emb_file.read(reinterpret_cast(&read_key), sizeof(TypeHashKey)); - emb_file.read(reinterpret_cast(&slod_id), sizeof(size_t)); + emb_file.read(reinterpret_cast(&read_slod_id), sizeof(size_t)); emb_file.read(reinterpret_cast(read_emb_vec.data()), sizeof(float) * ps_config_.embedding_vec_size_[i][j]); diff --git a/HugeCTR/src/parser.cpp b/HugeCTR/src/parser.cpp index aa5015bcc7..6be1b4dc9c 100644 --- a/HugeCTR/src/parser.cpp +++ b/HugeCTR/src/parser.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -52,1152 +53,55 @@ namespace HugeCTR { -struct InputOutputInfo { - std::vector train_input; - std::vector evaluate_input; - std::vector output; -}; - -static bool get_tensor_from_entries(const std::vector tensor_entries, - const std::string& name, TensorUse use, TensorBag2* bag) { - if (use == TensorUse::General) { - CK_THROW_(Error_t::WrongInput, "Type should not be general"); - } - for (const TensorEntry& entry : tensor_entries) { - if (entry.name == name && (entry.use == TensorUse::General || entry.use == use)) { - *bag = entry.bag; - return true; - } - } - return false; -} - -static std::vector get_layer_names(const nlohmann::json& json) { - std::vector layer_names; - if (json.is_array()) { - for (auto j : json) { - layer_names.push_back(j.get()); - } - } else { - layer_names.push_back(json.get()); - } - - return layer_names; -} - -static InputOutputInfo get_input_tensor_and_output_name( - const nlohmann::json& json, const std::vector& tensor_entries) { - auto bottom = get_json(json, "bottom"); - std::vector bottom_strs = get_layer_names(bottom); - - auto top = get_json(json, "top"); - std::vector top_strs = get_layer_names(top); - - std::vector bottom_train_tensors; - std::vector bottom_evaluate_tensors; - - for (auto& bstr : bottom_strs) { - for (auto& tstr : top_strs) { - if (bstr == tstr) { - CK_THROW_(Error_t::WrongInput, "bottom and top include a same layer name"); - } - } - TensorBag2 tensor; - if (!get_tensor_from_entries(tensor_entries, bstr, TensorUse::Train, &tensor)) { - CK_THROW_(Error_t::WrongInput, "No such bottom: " + bstr); - } - bottom_train_tensors.push_back(tensor); - if (!get_tensor_from_entries(tensor_entries, bstr, TensorUse::Evaluate, &tensor)) { - CK_THROW_(Error_t::WrongInput, "No such bottom: " + bstr); - } - bottom_evaluate_tensors.push_back(tensor); +nlohmann::json read_json_file(const std::string& filename) { + nlohmann::json config; + std::ifstream file_stream(filename); + if (!file_stream.is_open()) { + CK_THROW_(Error_t::FileCannotOpen, "file_stream.is_open() failed: " + filename); } - return {bottom_train_tensors, bottom_evaluate_tensors, top_strs}; -} - -struct TensorPair { - TensorBag2 tensor; - std::string name; -}; - -static void add_tensor_to_network(TensorPair& output_tensor_pair, - std::vector& tensor_entries) { - tensor_entries.push_back( - {output_tensor_pair.name, TensorUse::General, output_tensor_pair.tensor}); + file_stream >> config; + file_stream.close(); + return config; } -template -static OptParams get_optimizer_param(const nlohmann::json& j_optimizer) { - // create optimizer - auto optimizer_name = get_value_from_json(j_optimizer, "type"); - Optimizer_t optimizer_type; - if (!find_item_in_map(optimizer_type, optimizer_name, OPTIMIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such optimizer: " + optimizer_name); - } - - OptHyperParams opt_hyper_params; - memset(&opt_hyper_params, 0, sizeof(opt_hyper_params)); - OptParams opt_params; - - Update_t update_type = Update_t::Local; - if (has_key_(j_optimizer, "update_type")) { - std::string update_name = get_value_from_json(j_optimizer, "update_type"); - if (!find_item_in_map(update_type, update_name, UPDATE_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such update type: " + update_name); - } - } else if (has_key_(j_optimizer, "global_update")) { - bool global_update = get_value_from_json(j_optimizer, "global_update"); - if (global_update) update_type = Update_t::Global; - } else { - MESSAGE_("update_type is not specified, using default: local"); - } - - switch (optimizer_type) { - case Optimizer_t::Adam: { - auto j_hparam = get_json(j_optimizer, "adam_hparam"); - float learning_rate = get_value_from_json(j_hparam, "learning_rate"); - float beta1 = get_value_from_json(j_hparam, "beta1"); - float beta2 = get_value_from_json(j_hparam, "beta2"); - float epsilon = get_value_from_json(j_hparam, "epsilon"); - opt_hyper_params.adam.beta1 = beta1; - opt_hyper_params.adam.beta2 = beta2; - opt_hyper_params.adam.epsilon = epsilon; - opt_params = {Optimizer_t::Adam, learning_rate, opt_hyper_params, update_type}; - break; - } - case Optimizer_t::MomentumSGD: { - auto j_hparam = get_json(j_optimizer, "momentum_sgd_hparam"); - float learning_rate = get_value_from_json(j_hparam, "learning_rate"); - float momentum_factor = get_value_from_json(j_hparam, "momentum_factor"); - opt_hyper_params.momentum.factor = momentum_factor; - opt_params = {Optimizer_t::MomentumSGD, learning_rate, opt_hyper_params, update_type}; - break; - } - case Optimizer_t::Nesterov: { - auto j_hparam = get_json(j_optimizer, "nesterov_hparam"); - float learning_rate = get_value_from_json(j_hparam, "learning_rate"); - float momentum_factor = get_value_from_json(j_hparam, "momentum_factor"); - opt_hyper_params.nesterov.mu = momentum_factor; - opt_params = {Optimizer_t::Nesterov, learning_rate, opt_hyper_params, update_type}; - break; - } - case Optimizer_t::SGD: { - auto j_hparam = get_json(j_optimizer, "sgd_hparam"); - auto learning_rate = get_value_from_json(j_hparam, "learning_rate"); - if (has_key_(j_hparam, "atomic_update")) { - opt_hyper_params.sgd.atomic_update = get_value_from_json(j_hparam, "atomic_update"); - } - opt_params = {Optimizer_t::SGD, learning_rate, opt_hyper_params, update_type}; - break; - } - default: - assert(!"Error: no such optimizer && should never get here!"); - } - return opt_params; -} - -template -static std::shared_ptr> create_regularizer( - const nlohmann::json& j, const Tensor2& weight_buff, const Tensor2& wgrad_buff, - const int batch_size, const std::shared_ptr& gpu_resource) { - std::shared_ptr> reg( - new NoRegularizer(weight_buff, wgrad_buff, batch_size, gpu_resource)); - auto reg_it = j.find("regularizer"); - if (reg_it != j.end()) { - Regularizer_t reg_type; - auto reg_name = reg_it->get(); - if (!find_item_in_map(reg_type, reg_name, REGULARIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such regularizer: " + reg_name); - } - switch (reg_type) { - case Regularizer_t::L1: { - const auto lambda = get_value_from_json(j, "lambda"); - reg.reset(new L1Regularizer(weight_buff, wgrad_buff, batch_size, lambda, gpu_resource)); - break; - } - case Regularizer_t::L2: { - const auto lambda = get_value_from_json(j, "lambda"); - reg.reset(new L2Regularizer(weight_buff, wgrad_buff, batch_size, lambda, gpu_resource)); - break; - } - default: { assert(!"Error: no such regularizer!"); } - } - } - return reg; -} - -const std::map LAYER_TYPE_MAP = { - {"BatchNorm", Layer_t::BatchNorm}, - {"BinaryCrossEntropyLoss", Layer_t::BinaryCrossEntropyLoss}, - {"Concat", Layer_t::Concat}, - {"CrossEntropyLoss", Layer_t::CrossEntropyLoss}, - {"Dropout", Layer_t::Dropout}, - {"ELU", Layer_t::ELU}, - {"InnerProduct", Layer_t::InnerProduct}, - {"Interaction", Layer_t::Interaction}, - {"MultiCrossEntropyLoss", Layer_t::MultiCrossEntropyLoss}, - {"ReLU", Layer_t::ReLU}, - {"Reshape", Layer_t::Reshape}, - {"Slice", Layer_t::Slice}, - {"Multiply", Layer_t::Multiply}, - {"FmOrder2", Layer_t::FmOrder2}, - {"Add", Layer_t::Add}, - {"ReduceSum", Layer_t::ReduceSum}, - {"MultiCross", Layer_t::MultiCross}, - {"DotProduct", Layer_t::DotProduct}}; -const std::map LAYER_TYPE_MAP_MP = { - {"BinaryCrossEntropyLoss", Layer_t::BinaryCrossEntropyLoss}, - {"Concat", Layer_t::Concat}, - {"Cast", Layer_t::Cast}, - {"InnerProduct", Layer_t::InnerProduct}, - {"FusedInnerProduct", Layer_t::FusedInnerProduct}, - {"Interaction", Layer_t::Interaction}, - {"Reshape", Layer_t::Reshape}, - {"Slice", Layer_t::Slice}, - {"ReLU", Layer_t::ReLU}, - {"Dropout", Layer_t::Dropout}, - {"Add", Layer_t::Add}}; -const std::map EMBEDDING_TYPE_MAP = { - {"DistributedSlotSparseEmbeddingHash", Embedding_t::DistributedSlotSparseEmbeddingHash}, - {"LocalizedSlotSparseEmbeddingHash", Embedding_t::LocalizedSlotSparseEmbeddingHash}, - {"LocalizedSlotSparseEmbeddingOneHot", Embedding_t::LocalizedSlotSparseEmbeddingOneHot}}; -const std::map INITIALIZER_TYPE_MAP = { - {"Uniform", Initializer_t::Uniform}, - {"XavierNorm", Initializer_t::XavierNorm}, - {"XavierUniform", Initializer_t::XavierUniform}, - {"Zero", Initializer_t::Zero}}; - -/* - * Create single network - * - */ -Network* create_network(const nlohmann::json& j_array, const nlohmann::json& j_optimizer, - std::vector& tensor_entries, int num_networks_in_global, - const std::shared_ptr& cpu_resource, - const std::shared_ptr& gpu_resource, bool use_mixed_precision, - float scaler, bool use_algorithm_search, bool use_cuda_graph) { - std::unique_ptr network( - new Network(cpu_resource, gpu_resource, use_mixed_precision, use_cuda_graph)); - - auto& layers = network->layers_; - auto& loss_tensor = network->loss_tensor_; - auto& loss = network->loss_; - - std::shared_ptr> blobs_buff = - GeneralBuffer2::create(); - - std::shared_ptr> weight_buff = blobs_buff->create_block(); - std::shared_ptr> weight_buff_half = blobs_buff->create_block<__half>(); - std::shared_ptr> wgrad_buff = blobs_buff->create_block(); - std::shared_ptr> wgrad_buff_half = blobs_buff->create_block<__half>(); - - assert(layers.empty()); - - for (unsigned int i = 1; i < j_array.size(); i++) { - const nlohmann::json& j = j_array[i]; - const auto layer_type_name = get_value_from_json(j, "type"); - Layer_t layer_type; - - const auto& layer_map = use_mixed_precision ? LAYER_TYPE_MAP_MP : LAYER_TYPE_MAP; - - if (!find_item_in_map(layer_type, layer_type_name, layer_map)) { - Embedding_t embedding_type; - if (!find_item_in_map(embedding_type, layer_type_name, EMBEDDING_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such layer: " + layer_type_name); - } - continue; - } - - std::vector output_tensor_pairs; - auto input_output_info = get_input_tensor_and_output_name(j, tensor_entries); - switch (layer_type) { - case Layer_t::BatchNorm: { - Tensor2 bn_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - // establish out tensor - Tensor2 bn_out_tensor; - blobs_buff->reserve(bn_in_tensor.get_dimensions(), &bn_out_tensor); - output_tensor_pairs.push_back({bn_out_tensor.shrink(), input_output_info.output[0]}); - - // get BN params - auto j_bn_hparam = get_json(j, "bn_param"); - auto factor = get_value_from_json(j_bn_hparam, "factor"); - auto eps = get_value_from_json(j_bn_hparam, "eps"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_bn_hparam, "gamma_init")) { - const auto gamma_init_name = get_value_from_json(j_bn_hparam, "gamma_init"); - Initializer_t gamma_init_type; - if (!find_item_in_map(gamma_init_type, gamma_init_name, INITIALIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such initializer: " + gamma_init_name); - } else { - initializer_types[0] = gamma_init_type; - } - } - if (has_key_(j_bn_hparam, "beta_init")) { - const auto beta_init_name = get_value_from_json(j_bn_hparam, "beta_init"); - Initializer_t beta_init_type; - if (!find_item_in_map(beta_init_type, beta_init_name, INITIALIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such initializer: " + beta_init_name); - } else { - initializer_types[1] = beta_init_type; - } - } - - BatchNormLayer::Params params = {factor, eps}; - layers.emplace_back(new BatchNormLayer(weight_buff, wgrad_buff, blobs_buff, bn_in_tensor, - bn_out_tensor, params, gpu_resource, - initializer_types)); - break; - } - case Layer_t::BinaryCrossEntropyLoss: { - if (input_output_info.train_input.size() != 2 || - input_output_info.evaluate_input.size() != 2) { - CK_THROW_(Error_t::WrongInput, "bottom of BinaryCrossEntropyLoss must be two dim"); - } - Tensor2 train_label_tensor = - Tensor2::stretch_from(input_output_info.train_input[1]); - Tensor2 evaluate_label_tensor = - Tensor2::stretch_from(input_output_info.evaluate_input[1]); - blobs_buff->reserve({1, 1}, &loss_tensor); - if (use_mixed_precision) { - Tensor2<__half> train_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - Tensor2<__half> evaluate_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); - - loss.reset(new BinaryCrossEntropyLoss<__half>( - train_label_tensor, train_in_tensor, evaluate_label_tensor, evaluate_in_tensor, - loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff_half->as_tensor(), - train_in_tensor.get_dimensions()[0], gpu_resource), - gpu_resource, num_networks_in_global, scaler)); - } else { - Tensor2 train_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 evaluate_in_tensor = - Tensor2::stretch_from(input_output_info.evaluate_input[0]); - - loss.reset(new BinaryCrossEntropyLoss( - train_label_tensor, train_in_tensor, evaluate_label_tensor, evaluate_in_tensor, - loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff->as_tensor(), - train_in_tensor.get_dimensions()[0], gpu_resource), - gpu_resource, num_networks_in_global, scaler)); - } - break; - } - case Layer_t::Concat: { - if (use_mixed_precision) { - Tensors2<__half> train_in_tensors; - for (const TensorBag2& t : input_output_info.train_input) { - train_in_tensors.push_back(Tensor2<__half>::stretch_from(t)); - } - Tensors2<__half> evaluate_in_tensors; - for (const TensorBag2& t : input_output_info.evaluate_input) { - evaluate_in_tensors.push_back(Tensor2<__half>::stretch_from(t)); - } - Tensor2<__half> out_tensor; - layers.emplace_back(new ConcatLayer<__half>(train_in_tensors, evaluate_in_tensors, - out_tensor, blobs_buff, gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - } else { - Tensors2 train_in_tensors; - for (const TensorBag2& t : input_output_info.train_input) { - train_in_tensors.push_back(Tensor2::stretch_from(t)); - } - Tensors2 evaluate_in_tensors; - for (const TensorBag2& t : input_output_info.evaluate_input) { - evaluate_in_tensors.push_back(Tensor2::stretch_from(t)); - } - Tensor2 out_tensor; - layers.emplace_back(new ConcatLayer(train_in_tensors, evaluate_in_tensors, - out_tensor, blobs_buff, gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - } - break; - } - case Layer_t::CrossEntropyLoss: { - if (input_output_info.train_input.size() != 2) { - CK_THROW_(Error_t::WrongInput, "bottom of CrossEntropyLoss must be two dim"); - } - Tensor2 label_tensor = - Tensor2::stretch_from(input_output_info.train_input[1]); - blobs_buff->reserve({1, 1}, &loss_tensor); - if (use_mixed_precision) { - Tensor2<__half> cross_entropy_loss_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - - loss.reset(new CrossEntropyLoss<__half>( - label_tensor, cross_entropy_loss_in_tensor, loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff_half->as_tensor(), - cross_entropy_loss_in_tensor.get_dimensions()[0], gpu_resource), - gpu_resource, num_networks_in_global, scaler)); - } else { - Tensor2 cross_entropy_loss_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - - loss.reset(new CrossEntropyLoss( - label_tensor, cross_entropy_loss_in_tensor, loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff->as_tensor(), - cross_entropy_loss_in_tensor.get_dimensions()[0], gpu_resource), - gpu_resource, num_networks_in_global, scaler)); - } - break; - } - case Layer_t::Dropout: { - if (use_mixed_precision) { - Tensor2<__half> do_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - // establish out tensor - Tensor2<__half> do_out_tensor; - blobs_buff->reserve(do_in_tensor.get_dimensions(), &do_out_tensor); - output_tensor_pairs.push_back({do_out_tensor.shrink(), input_output_info.output[0]}); - // get ELU params - auto rate_it = j.find("rate"); - auto rate = (rate_it != j.end()) ? rate_it->get() : 0.5f; -#ifndef PREFER_CUDNN - layers.emplace_back(new DropoutLayer<__half>(do_in_tensor, do_out_tensor, blobs_buff, - rate, gpu_resource)); -#else - layers.emplace_back(new DropoutCudnnLayer<__half>(do_in_tensor, do_out_tensor, blobs_buff, - rate, gpu_resource)); -#endif - } else { - // establish out tensor - Tensor2 do_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 do_out_tensor; - blobs_buff->reserve(do_in_tensor.get_dimensions(), &do_out_tensor); - output_tensor_pairs.push_back({do_out_tensor.shrink(), input_output_info.output[0]}); - // get ELU params - auto rate_it = j.find("rate"); - auto rate = (rate_it != j.end()) ? rate_it->get() : 0.5f; -#ifndef PREFER_CUDNN - layers.emplace_back( - new DropoutLayer(do_in_tensor, do_out_tensor, blobs_buff, rate, gpu_resource)); -#else - layers.emplace_back(new DropoutCudnnLayer(do_in_tensor, do_out_tensor, blobs_buff, - rate, gpu_resource)); -#endif - } - network->enable_cuda_graph_ = false; - - break; - } - case Layer_t::ELU: { - Tensor2 elu_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - - // establish out tensor - Tensor2 elu_out_tensor; - blobs_buff->reserve(elu_in_tensor.get_dimensions(), &elu_out_tensor); - output_tensor_pairs.push_back({elu_out_tensor.shrink(), input_output_info.output[0]}); - // get ELU params - auto j_elu_hparam = get_json(j, "elu_param"); - auto alpha = get_value_from_json(j_elu_hparam, "alpha"); - layers.emplace_back(new EluLayer(elu_in_tensor, elu_out_tensor, alpha, gpu_resource)); - - break; - } - - case Layer_t::FusedInnerProduct: { - auto j_fc_param = get_json(j, "fc_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_fc_param, "weight_init")) { - const auto weight_init_name = get_value_from_json(j_fc_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_fc_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_fc_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - // establish out tensor - auto output = get_value_from_json(j_fc_param, "num_output"); - if (use_mixed_precision) { - Tensor2<__half> train_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - Tensor2<__half> evaluate_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); - Tensor2<__half> fc_out_tensor; - blobs_buff->reserve({(train_in_tensor.get_dimensions())[0], output}, &fc_out_tensor); - output_tensor_pairs.push_back({fc_out_tensor.shrink(), input_output_info.output[0]}); - - // establish layer - layers.emplace_back(new FusedFullyConnectedLayer( - weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, train_in_tensor, - evaluate_in_tensor, fc_out_tensor, gpu_resource, initializer_types)); - } else { - CK_THROW_(Error_t::WrongInput, "FusedInnerProduct support half only"); - } - break; - } - - case Layer_t::Cast: { - if (use_mixed_precision) { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2<__half> out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - layers.emplace_back(new CastLayer(in_tensor, out_tensor, gpu_resource)); - } else { - CK_THROW_(Error_t::WrongInput, "Cast supports half only"); - } - break; - } - - case Layer_t::InnerProduct: { - auto j_fc_param = get_json(j, "fc_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_fc_param, "weight_init")) { - const auto weight_init_name = get_value_from_json(j_fc_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_fc_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_fc_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - - // establish out tensor - auto output = get_value_from_json(j_fc_param, "num_output"); - - if (use_mixed_precision) { - Tensor2<__half> train_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - Tensor2<__half> evaluate_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); - Tensor2<__half> fc_out_tensor; - blobs_buff->reserve({train_in_tensor.get_dimensions()[0], output}, &fc_out_tensor); - - // establish layer - layers.emplace_back(new FullyConnectedLayerHalf( - weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, train_in_tensor, - evaluate_in_tensor, fc_out_tensor, gpu_resource, initializer_types)); - output_tensor_pairs.push_back({fc_out_tensor.shrink(), input_output_info.output[0]}); - } else { - Tensor2 train_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 evaluate_in_tensor = - Tensor2::stretch_from(input_output_info.evaluate_input[0]); - Tensor2 fc_out_tensor; - blobs_buff->reserve({train_in_tensor.get_dimensions()[0], output}, &fc_out_tensor); - // establish layer - layers.emplace_back(new FullyConnectedLayer( - weight_buff, wgrad_buff, train_in_tensor, evaluate_in_tensor, fc_out_tensor, - gpu_resource, use_mixed_precision, initializer_types)); - output_tensor_pairs.push_back({fc_out_tensor.shrink(), input_output_info.output[0]}); - } - break; - } - - case Layer_t::Interaction: { - // lambda template could be a better solution here, but there's not support in c++11 - if (use_mixed_precision) { - if (gpu_resource->get_cc_major() < 7) { - CK_THROW_(Error_t::WrongInput, "InteractionLayer<__half> is not supported in SM " + - std::to_string(gpu_resource->get_cc_major()) + "." + - std::to_string(gpu_resource->get_cc_minor())); - } - - Tensor2<__half> train_in_mlp_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - Tensor2<__half> evaluate_in_mlp_tensor = - Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); - Tensor2<__half> train_in_emb_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[1]); - Tensor2<__half> evaluate_in_emb_tensor = - Tensor2<__half>::stretch_from(input_output_info.evaluate_input[1]); - Tensor2<__half> out_tensor; - - layers.emplace_back(new InteractionLayer<__half>( - train_in_mlp_tensor, evaluate_in_mlp_tensor, train_in_emb_tensor, - evaluate_in_emb_tensor, out_tensor, - blobs_buff, // todo cannot use this blobs_buff here need half - gpu_resource, use_mixed_precision)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - - } else { - Tensor2 train_in_mlp_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 evaluate_in_mlp_tensor = - Tensor2::stretch_from(input_output_info.evaluate_input[0]); - Tensor2 train_emb_tensor = - Tensor2::stretch_from(input_output_info.train_input[1]); - Tensor2 evaluate_emb_tensor = - Tensor2::stretch_from(input_output_info.evaluate_input[1]); - Tensor2 out_tensor; - layers.emplace_back(new InteractionLayer( - train_in_mlp_tensor, evaluate_in_mlp_tensor, train_emb_tensor, evaluate_emb_tensor, - out_tensor, blobs_buff, gpu_resource, use_mixed_precision)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - } - - break; - } - case Layer_t::MultiCross: { - auto j_mc_param = get_json(j, "mc_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_mc_param, "weight_init")) { - const auto weight_init_name = get_value_from_json(j_mc_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_mc_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_mc_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - - // establish out tensor - auto num_layers = get_value_from_json(j_mc_param, "num_layers"); - Tensor2 mc_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 out_tensor; - blobs_buff->reserve(mc_in_tensor.get_dimensions(), &out_tensor); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - // establish layer - layers.emplace_back(new MultiCrossLayer(weight_buff, wgrad_buff, blobs_buff, mc_in_tensor, - out_tensor, gpu_resource, num_layers, - initializer_types)); - break; - } - - case Layer_t::MultiCrossEntropyLoss: { - if (input_output_info.train_input.size() != 2) { - CK_THROW_(Error_t::WrongInput, "bottom of MultiCrossEntropyLoss must be two dim"); - } - - auto tweight = get_json(j, "target_weight"); - std::vector target_weight_vec; - for (auto tweight_tmp : tweight) { - float tweight_val = tweight_tmp.get(); - target_weight_vec.push_back(tweight_val); - } - - Tensor2 label_tensor = - Tensor2::stretch_from(input_output_info.train_input[1]); - blobs_buff->reserve({1, 1}, &loss_tensor); - - if (use_mixed_precision) { - Tensor2<__half> multi_cross_entropy_loss_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - loss.reset(new MultiCrossEntropyLoss<__half>( - label_tensor, multi_cross_entropy_loss_in_tensor, loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff_half->as_tensor(), - multi_cross_entropy_loss_in_tensor.get_dimensions()[0], - gpu_resource), - target_weight_vec, gpu_resource, num_networks_in_global, scaler)); - } else { - Tensor2 multi_cross_entropy_loss_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - loss.reset(new MultiCrossEntropyLoss( - label_tensor, multi_cross_entropy_loss_in_tensor, loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff->as_tensor(), - multi_cross_entropy_loss_in_tensor.get_dimensions()[0], - gpu_resource), - target_weight_vec, gpu_resource, num_networks_in_global, scaler)); - } - break; - } - case Layer_t::ReLU: { - if (use_mixed_precision) { - Tensor2<__half> relu_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - Tensor2<__half> relu_out_tensor; - blobs_buff->reserve(relu_in_tensor.get_dimensions(), &relu_out_tensor); - layers.emplace_back(new ReluLayer<__half>(relu_in_tensor, relu_out_tensor, gpu_resource)); - output_tensor_pairs.push_back({relu_out_tensor.shrink(), input_output_info.output[0]}); - } else { - // establish out tensor - Tensor2 relu_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 relu_out_tensor; - blobs_buff->reserve(relu_in_tensor.get_dimensions(), &relu_out_tensor); - layers.emplace_back(new ReluLayer(relu_in_tensor, relu_out_tensor, gpu_resource)); - output_tensor_pairs.push_back({relu_out_tensor.shrink(), input_output_info.output[0]}); - } - - break; - } - case Layer_t::Reshape: { - auto selected_it = j.find("selected"); - // selective reshape - if (selected_it != j.end()) { - std::vector selected; - nlohmann::json j_selected = (selected_it.value()); - for (auto slot_obj : j_selected) { - int slot_id = slot_obj.get(); - if (slot_id < 0) CK_THROW_(Error_t::WrongInput, "slot_id < 0"); - selected.push_back(slot_id); - } - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - Tensor2<__half> out_tensor; - layers.emplace_back(new ReshapeLayer<__half>(in_tensor, out_tensor, blobs_buff, - selected, gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - } else { - Tensor2 in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 out_tensor; - layers.emplace_back( - new ReshapeLayer(in_tensor, out_tensor, blobs_buff, selected, gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - } - } - // general purpose reshape - else { - auto leading_dim_it = j.find("leading_dim"); - - // if leading_dim is not specified, default leading_dim = n_slots * vector_length - - if (use_mixed_precision) { - Tensor2<__half> train_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - Tensor2<__half> evaluate_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); - Tensor2<__half> out_tensor; - const auto& in_dims = train_in_tensor.get_dimensions(); - size_t leading_dim = (leading_dim_it != j.end()) - ? (*leading_dim_it).get() - : train_in_tensor.get_num_elements() / in_dims[0]; - layers.emplace_back(new ReshapeLayer<__half>(train_in_tensor, evaluate_in_tensor, - out_tensor, blobs_buff, leading_dim, - gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - } else { - Tensor2 train_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 evaluate_in_tensor = - Tensor2::stretch_from(input_output_info.evaluate_input[0]); - Tensor2 out_tensor; - const auto& in_dims = train_in_tensor.get_dimensions(); - size_t leading_dim = (leading_dim_it != j.end()) - ? (*leading_dim_it).get() - : train_in_tensor.get_num_elements() / in_dims[0]; - layers.emplace_back(new ReshapeLayer(train_in_tensor, evaluate_in_tensor, - out_tensor, blobs_buff, leading_dim, - gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - } - } - break; - } - case Layer_t::Slice: { - std::vector> ranges; - auto j_ranges = get_json(j, "ranges"); - assert(j_ranges.is_array()); - for (auto j_range : j_ranges) { - assert(j_range.is_array()); - ranges.emplace_back(std::make_pair(j_range[0].get(), j_range[1].get())); - } - - if (use_mixed_precision) { - Tensor2<__half> train_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.train_input[0]); - Tensor2<__half> evaluate_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.evaluate_input[0]); - Tensors2<__half> out_tensors; - layers.emplace_back(new SliceLayer<__half>( - train_in_tensor, evaluate_in_tensor, out_tensors, blobs_buff, ranges, gpu_resource)); - for (size_t i = 0; i < out_tensors.size(); i++) { - output_tensor_pairs.push_back({out_tensors[i].shrink(), input_output_info.output[i]}); - } - } else { - Tensor2 train_in_tensor = - Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 evaluate_in_tensor = - Tensor2::stretch_from(input_output_info.evaluate_input[0]); - Tensors2 out_tensors; - layers.emplace_back(new SliceLayer(train_in_tensor, evaluate_in_tensor, - out_tensors, blobs_buff, ranges, gpu_resource)); - for (size_t i = 0; i < out_tensors.size(); i++) { - output_tensor_pairs.push_back({out_tensors[i].shrink(), input_output_info.output[i]}); - } - } - break; - } - case Layer_t::Multiply: { - std::vector weight_dims; - auto dims = get_json(j, "weight_dims"); - assert(dims.is_array()); - for (auto dim : dims) { - weight_dims.emplace_back(dim.get()); - } - - // establish initializer - std::vector initializer_types(1, Initializer_t::Default); - if (has_key_(j, "weight_init")) { - const auto weight_init_name = get_value_from_json(j, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 out_tensor; - layers.emplace_back(new MultiplyLayer(weight_buff, wgrad_buff, blobs_buff, in_tensor, - out_tensor, weight_dims, gpu_resource, - initializer_types)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - break; - } - case Layer_t::FmOrder2: { - auto out_dim = get_json(j, "out_dim").get(); - - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 out_tensor; - blobs_buff->reserve({in_tensor.get_dimensions()[0], out_dim}, &out_tensor); - - layers.emplace_back(new FmOrder2Layer(in_tensor, out_tensor, gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - break; - } - case Layer_t::Add: { - if (use_mixed_precision) { - Tensors2<__half> in_tensors; - for (const auto& t : input_output_info.train_input) { - in_tensors.push_back(Tensor2<__half>::stretch_from(t)); - } - Tensor2<__half> out_tensor; - blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); - layers.emplace_back( - new AddLayer<__half>(in_tensors, out_tensor, blobs_buff, gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - } else { - Tensors2 in_tensors; - for (const auto& t : input_output_info.train_input) { - in_tensors.push_back(Tensor2::stretch_from(t)); - } - Tensor2 out_tensor; - blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); - layers.emplace_back( - new AddLayer(in_tensors, out_tensor, blobs_buff, gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - } - break; - } - case Layer_t::ReduceSum: { - int axis = get_json(j, "axis").get(); - - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.train_input[0]); - Tensor2 out_tensor; - layers.emplace_back( - new ReduceSumLayer(in_tensor, out_tensor, blobs_buff, axis, gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - break; - } - case Layer_t::DotProduct: { - Tensors2 in_tensors; - for (const auto& t : input_output_info.train_input) { - in_tensors.push_back(Tensor2::stretch_from(t)); - } - Tensor2 out_tensor; - blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); - layers.emplace_back(new DotProductLayer(in_tensors, out_tensor, blobs_buff, gpu_resource)); - output_tensor_pairs.push_back({out_tensor.shrink(), input_output_info.output[0]}); - break; - } - default: - assert(!"Error: no such layer && should never get here!"); - } // end of switch - - if (!(layer_type == Layer_t::CrossEntropyLoss || - layer_type == Layer_t::BinaryCrossEntropyLoss || - layer_type == Layer_t::MultiCrossEntropyLoss)) { - for (auto& output_tensor_pair : output_tensor_pairs) { - add_tensor_to_network(output_tensor_pair, tensor_entries); - } - } else { - network->raw_metrics_[metrics::RawType::Loss] = loss_tensor.shrink(); - network->raw_metrics_[metrics::RawType::Pred] = input_output_info.evaluate_input[0]; - network->raw_metrics_[metrics::RawType::Label] = input_output_info.evaluate_input[1]; - } - } // for layers - - // create optimizer - auto opt_param = get_optimizer_param(j_optimizer); - - network->optimizer_ = std::move(Optimizer::Create( - opt_param, weight_buff->as_tensor(), wgrad_buff->as_tensor(), wgrad_buff_half->as_tensor(), - use_mixed_precision, scaler, blobs_buff, gpu_resource)); - - network->weight_tensor_ = weight_buff->as_tensor(); - network->wgrad_tensor_ = wgrad_buff->as_tensor(); - network->weight_tensor_half_ = weight_buff_half->as_tensor(); - network->wgrad_tensor_half_ = wgrad_buff_half->as_tensor(); - - CudaDeviceContext context(gpu_resource->get_device_id()); - blobs_buff->allocate(); - - return network.release(); -} +Parser::Parser(const std::string& configure_file, size_t batch_size, size_t batch_size_eval, + bool repeat_dataset, bool i64_input_key, bool use_mixed_precision, float scaler, + bool use_algorithm_search, bool use_cuda_graph) + : config_(read_json_file(configure_file)), + batch_size_(batch_size), + batch_size_eval_(batch_size_eval), + repeat_dataset_(repeat_dataset), + i64_input_key_(i64_input_key), + use_mixed_precision_(use_mixed_precision), + scaler_(scaler), + use_algorithm_search_(use_algorithm_search), + use_cuda_graph_(use_cuda_graph) {} + +Parser::Parser(const nlohmann::json& config) + : config_(config), + batch_size_(1), + batch_size_eval_(1), + repeat_dataset_(false), + i64_input_key_(false), + use_mixed_precision_(false), + scaler_(1.0f), + use_algorithm_search_(true), + use_cuda_graph_(true) {} template -static void parse_data_layer(const nlohmann::json& j, int& label_dim, int& dense_dim, - Check_t& check_type, std::string& source_data, - std::vector& data_reader_sparse_param_array, - std::string& eval_source, std::string& top_strs_label, - std::string& top_strs_dense, std::vector& sparse_names, - std::map>& sparse_input_map) { - source_data = get_value_from_json(j, "source"); - - auto j_label = get_json(j, "label"); - top_strs_label = get_value_from_json(j_label, "top"); - label_dim = get_value_from_json(j_label, "label_dim"); - - auto j_dense = get_json(j, "dense"); - top_strs_dense = get_value_from_json(j_dense, "top"); - dense_dim = get_value_from_json(j_dense, "dense_dim"); - - const std::map CHECK_TYPE_MAP = {{"Sum", Check_t::Sum}, - {"None", Check_t::None}}; - - const auto check_str = get_value_from_json(j, "check"); - if (!find_item_in_map(check_type, check_str, CHECK_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "Not supported check type: " + check_str); - } - - const std::map DATA_TYPE_MAP = { - {"DistributedSlot", DataReaderSparse_t::Distributed}, - {"LocalizedSlot", DataReaderSparse_t::Localized}, - }; - - auto j_sparse = get_json(j, "sparse"); - for (unsigned int i = 0; i < j_sparse.size(); i++) { - DataReaderSparseParam param; - - const nlohmann::json& js = j_sparse[i]; - const auto sparse_name = get_value_from_json(js, "top"); - const auto data_type_name = get_value_from_json(js, "type"); - if (!find_item_in_map(param.type, data_type_name, DATA_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "Not supported data type: " + data_type_name); - } - param.max_feature_num = get_value_from_json(js, "max_feature_num_per_sample"); - param.max_nnz = get_value_from_json_soft(js, "max_nnz", param.max_feature_num); - param.slot_num = get_value_from_json(js, "slot_num"); - data_reader_sparse_param_array.push_back(param); - SparseInput sparse_input(param.slot_num, param.max_feature_num); - sparse_input_map.emplace(sparse_name, sparse_input); - sparse_names.push_back(sparse_name); - } - FIND_AND_ASSIGN_STRING_KEY(eval_source, j); -} - -void parse_data_layer_helper(const nlohmann::json& j, int& label_dim, int& dense_dim, - Check_t& check_type, std::string& source_data, - std::vector& data_reader_sparse_param_array, - std::string& eval_source, std::string& top_strs_label, - std::string& top_strs_dense, std::vector& sparse_names, - std::map>& sparse_input_map) { - parse_data_layer(j, label_dim, dense_dim, check_type, source_data, data_reader_sparse_param_array, - eval_source, top_strs_label, top_strs_dense, sparse_names, sparse_input_map); -} - -template -static void create_embedding(std::map>& sparse_input_map, - std::vector* tensor_entries_list, - std::vector>& embedding, - Embedding_t embedding_type, const nlohmann::json& config, - const std::shared_ptr& resource_manager, - size_t batch_size, size_t batch_size_eval, bool use_mixed_precision, - float scaler, const nlohmann::json& j_layers) { - - auto j_optimizer = get_json(config, "optimizer"); - auto embedding_name = get_value_from_json(j_layers, "type"); - - auto bottom_name = get_value_from_json(j_layers, "bottom"); - auto top_name = get_value_from_json(j_layers, "top"); - - auto j_hparam = get_json(j_layers, "sparse_embedding_hparam"); - size_t max_vocabulary_size_per_gpu = 0; - if (embedding_type == Embedding_t::DistributedSlotSparseEmbeddingHash) { - max_vocabulary_size_per_gpu = - get_value_from_json(j_hparam, "max_vocabulary_size_per_gpu"); - } else if (embedding_type == Embedding_t::LocalizedSlotSparseEmbeddingHash) { - if (has_key_(j_hparam, "max_vocabulary_size_per_gpu")) { - max_vocabulary_size_per_gpu = - get_value_from_json(j_hparam, "max_vocabulary_size_per_gpu"); - } else if (!has_key_(j_hparam, "slot_size_array")) { - CK_THROW_(Error_t::WrongInput, - "No max_vocabulary_size_per_gpu or slot_size_array in: " + embedding_name); - } - } - auto embedding_vec_size = get_value_from_json(j_hparam, "embedding_vec_size"); - auto combiner = get_value_from_json(j_hparam, "combiner"); - - SparseInput sparse_input; - if (!find_item_in_map(sparse_input, bottom_name, sparse_input_map)) { - CK_THROW_(Error_t::WrongInput, "Cannot find bottom"); - } - - OptParams embedding_opt_params; - if (has_key_(j_layers, "optimizer")) { - embedding_opt_params = get_optimizer_param(get_json(j_layers, "optimizer")); - } else { - embedding_opt_params = get_optimizer_param(j_optimizer); - } - embedding_opt_params.scaler = scaler; - - switch (embedding_type) { - case Embedding_t::DistributedSlotSparseEmbeddingHash: { - const SparseEmbeddingHashParams embedding_params = { - batch_size, - batch_size_eval, - max_vocabulary_size_per_gpu, - {}, - embedding_vec_size, - sparse_input.max_feature_num_per_sample, - sparse_input.slot_num, - combiner, // combiner: 0-sum, 1-mean - embedding_opt_params}; - - embedding.emplace_back(new DistributedSlotSparseEmbeddingHash( - sparse_input.train_row_offsets, sparse_input.train_values, sparse_input.train_nnz, - sparse_input.evaluate_row_offsets, sparse_input.evaluate_values, - sparse_input.evaluate_nnz, embedding_params, resource_manager)); - break; - } - case Embedding_t::LocalizedSlotSparseEmbeddingHash: { -#ifndef NCCL_A2A - - auto j_plan = get_json(j_layers, "plan_file"); - std::string plan_file; - if (j_plan.is_array()) { - int num_nodes = j_plan.size(); - if (num_nodes != resource_manager->get_num_process()) { - CK_THROW_(Error_t::WrongInput, "num_nodes != num_procs"); - } - plan_file = j_plan[resource_manager->get_process_id()].get(); - } else { - if (resource_manager->get_num_process() > 1) { - CK_THROW_(Error_t::WrongInput, "num_procs > 1"); - } - plan_file = get_value_from_json(j_layers, "plan_file"); - } - - std::ifstream ifs(plan_file); - if (!ifs) { - CK_THROW_(Error_t::WrongInput, "plan file " + plan_file + " can bot be open"); - } -#else - std::string plan_file = ""; -#endif - std::vector slot_size_array; - if (has_key_(j_hparam, "slot_size_array")) { - auto slots = get_json(j_hparam, "slot_size_array"); - assert(slots.is_array()); - for (auto slot : slots) { - slot_size_array.emplace_back(slot.get()); - } - } - - const SparseEmbeddingHashParams embedding_params = { - batch_size, - batch_size_eval, - max_vocabulary_size_per_gpu, - slot_size_array, - embedding_vec_size, - sparse_input.max_feature_num_per_sample, - sparse_input.slot_num, - combiner, // combiner: 0-sum, 1-mean - embedding_opt_params}; - - embedding.emplace_back(new LocalizedSlotSparseEmbeddingHash( - sparse_input.train_row_offsets, sparse_input.train_values, sparse_input.train_nnz, - sparse_input.evaluate_row_offsets, sparse_input.evaluate_values, - sparse_input.evaluate_nnz, embedding_params, plan_file, resource_manager)); - - break; - } - case Embedding_t::LocalizedSlotSparseEmbeddingOneHot: { - std::string plan_file = ""; - std::vector slot_size_array; - auto slots = get_json(j_hparam, "slot_size_array"); - assert(slots.is_array()); - for (auto slot : slots) { - slot_size_array.emplace_back(slot.get()); - } - - const SparseEmbeddingHashParams embedding_params = { - batch_size, - batch_size_eval, - 0, - slot_size_array, - embedding_vec_size, - sparse_input.max_feature_num_per_sample, - sparse_input.slot_num, - combiner, // combiner: 0-sum, 1-mean - embedding_opt_params}; - - embedding.emplace_back(new LocalizedSlotSparseEmbeddingOneHot( - sparse_input.train_row_offsets, sparse_input.train_values, sparse_input.train_nnz, - sparse_input.evaluate_row_offsets, sparse_input.evaluate_values, - sparse_input.evaluate_nnz, embedding_params, plan_file, resource_manager)); - - break; - } - } // switch - for (size_t i = 0; i < resource_manager->get_local_gpu_count(); i++) { - tensor_entries_list[i].push_back( - {top_name, TensorUse::Train, (embedding.back()->get_train_output_tensors())[i]}); - tensor_entries_list[i].push_back( - {top_name, TensorUse::Evaluate, (embedding.back()->get_evaluate_output_tensors())[i]}); - } -} - - -template -static void create_pipeline_internal(std::shared_ptr& data_reader, - std::shared_ptr& data_reader_eval, - std::vector>& embedding, - std::vector>& network, - const std::shared_ptr& resource_manager, - Parser& parser) { +void Parser::create_pipeline_internal(std::shared_ptr& data_reader, + std::shared_ptr& data_reader_eval, + std::vector>& embedding, + std::vector>& network, + const std::shared_ptr& resource_manager) { try { - nlohmann::json config = parser.config_; - size_t batch_size = parser.batch_size_; - size_t batch_size_eval = parser.batch_size_eval_; - bool use_mixed_precision = parser.use_mixed_precision_; - float scaler = parser.scaler_; - bool use_algorithm_search = parser.use_algorithm_search_; - bool use_cuda_graph = parser.use_cuda_graph_; + const nlohmann::json& config = config_; + size_t batch_size = batch_size_; + size_t batch_size_eval = batch_size_eval_; + bool use_mixed_precision = use_mixed_precision_; + float scaler = scaler_; + bool use_algorithm_search = use_algorithm_search_; + bool use_cuda_graph = use_cuda_graph_; std::map> sparse_input_map; std::vector tensor_entries_list[resource_manager->get_local_gpu_count()]; @@ -1212,177 +116,9 @@ static void create_pipeline_internal(std::shared_ptr& data_reader, // Create Data Reader { const nlohmann::json& j = j_layers_array[0]; - const auto layer_type_name = get_value_from_json(j, "type"); - if (layer_type_name.compare("Data") != 0) { - CK_THROW_(Error_t::WrongInput, "the first layer is not Data layer:" + layer_type_name); - } - - const std::map DATA_READER_MAP = { - {"Norm", DataReaderType_t::Norm}, - {"Raw", DataReaderType_t::Raw}, - {"Parquet", DataReaderType_t::Parquet}}; - - DataReaderType_t format = DataReaderType_t::Norm; - if (has_key_(j, "format")) { - const auto data_format_name = get_value_from_json(j, "format"); - if (!find_item_in_map(format, data_format_name, DATA_READER_MAP)) { - CK_THROW_(Error_t::WrongInput, "No such data format: " + data_format_name); - } - } - - auto cache_eval_data = get_value_from_json_soft(j, "cache_eval_data", 0); - - std::string source_data = get_value_from_json(j, "source"); - - auto j_label = get_json(j, "label"); - auto top_strs_label = get_value_from_json(j_label, "top"); - auto label_dim = get_value_from_json(j_label, "label_dim"); - - auto j_dense = get_json(j, "dense"); - auto top_strs_dense = get_value_from_json(j_dense, "top"); - auto dense_dim = get_value_from_json(j_dense, "dense_dim"); - - const std::map CHECK_TYPE_MAP = {{"Sum", Check_t::Sum}, - {"None", Check_t::None}}; - - Check_t check_type; - const auto check_str = get_value_from_json(j, "check"); - if (!find_item_in_map(check_type, check_str, CHECK_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "Not supported check type: " + check_str); - } - - std::vector data_reader_sparse_param_array; - - const std::map DATA_TYPE_MAP = { - {"DistributedSlot", DataReaderSparse_t::Distributed}, - {"LocalizedSlot", DataReaderSparse_t::Localized}, - }; - - auto j_sparse = get_json(j, "sparse"); - std::vector sparse_names; - - for (unsigned int i = 0; i < j_sparse.size(); i++) { - DataReaderSparseParam param; - - const nlohmann::json& js = j_sparse[i]; - const auto sparse_name = get_value_from_json(js, "top"); - const auto data_type_name = get_value_from_json(js, "type"); - if (!find_item_in_map(param.type, data_type_name, DATA_TYPE_MAP)) { - CK_THROW_(Error_t::WrongInput, "Not supported data type: " + data_type_name); - } - param.max_feature_num = get_value_from_json(js, "max_feature_num_per_sample"); - param.max_nnz = get_value_from_json_soft(js, "max_nnz", param.max_feature_num); - param.slot_num = get_value_from_json(js, "slot_num"); - data_reader_sparse_param_array.push_back(param); - SparseInput sparse_input(param.slot_num, param.max_feature_num); - sparse_input_map.emplace(sparse_name, sparse_input); - sparse_names.push_back(sparse_name); - } - - data_reader_eval = nullptr; - std::string eval_source; - FIND_AND_ASSIGN_STRING_KEY(eval_source, j); - -#ifdef VAL - const int NUM_THREADS = 1; -#else - const int NUM_THREADS = - format == DataReaderType_t::Parquet ? resource_manager->get_local_gpu_count() : 12; -#endif - - DataReader *data_reader_tk = new DataReader( - batch_size, label_dim, dense_dim, - data_reader_sparse_param_array, - resource_manager, - parser.repeat_dataset_, - NUM_THREADS, use_mixed_precision, false); - data_reader.reset(data_reader_tk); - DataReader *data_reader_eval_tk = new DataReader( - batch_size_eval, label_dim, dense_dim, - data_reader_sparse_param_array, - resource_manager, - parser.repeat_dataset_, - NUM_THREADS, use_mixed_precision, cache_eval_data); - data_reader_eval.reset(data_reader_eval_tk); - - auto f = [&j]() -> std::vector { - std::vector slot_offset; - if (has_key_(j, "slot_size_array")) { - auto slot_size_array = get_json(j, "slot_size_array"); - if (!slot_size_array.is_array()) { - CK_THROW_(Error_t::WrongInput, "!slot_size_array.is_array()"); - } - long long slot_sum = 0; - for (auto j_slot_size : slot_size_array) { - slot_offset.push_back(slot_sum); - long long slot_size = j_slot_size.get(); - slot_sum += slot_size; - } - MESSAGE_("Vocabulary size: " + std::to_string(slot_sum)); - } - return slot_offset; - }; - - switch (format) { - case DataReaderType_t::Norm: { - bool start_right_now = parser.repeat_dataset_; - data_reader->create_drwg_norm( - source_data, check_type, start_right_now); - data_reader_eval->create_drwg_norm( - eval_source, check_type, start_right_now); - break; - } - case DataReaderType_t::Raw: { - const auto num_samples = get_value_from_json(j, "num_samples"); - const auto eval_num_samples = get_value_from_json(j, "eval_num_samples"); - std::vector slot_offset = f(); - bool float_label_dense = get_value_from_json_soft(j, "float_label_dense", false); - data_reader->create_drwg_raw(source_data, num_samples, slot_offset, float_label_dense, - true, false); - data_reader_eval->create_drwg_raw(eval_source, eval_num_samples, slot_offset, - float_label_dense, false, false); - - break; - } - case DataReaderType_t::Parquet: { - // @Future: Should be slot_offset here and data_reader ctor should - // be TypeKey not long long - std::vector slot_offset = f(); - data_reader->create_drwg_parquet(source_data, slot_offset, true); - data_reader_eval->create_drwg_parquet(eval_source, slot_offset, true); - break; - } - default: { assert(!"Error: no such option && should never get here!"); } - } - - for (size_t i = 0; i < resource_manager->get_local_gpu_count(); i++) { - tensor_entries_list[i].push_back( - {top_strs_label, TensorUse::Train, data_reader_tk->get_label_tensors()[i].shrink()}); - tensor_entries_list[i].push_back({top_strs_label, TensorUse::Evaluate, - data_reader_eval_tk->get_label_tensors()[i].shrink()}); - - if (use_mixed_precision) { - tensor_entries_list[i].push_back( - {top_strs_dense, TensorUse::Train, data_reader_tk->get_dense_tensors()[i]}); - tensor_entries_list[i].push_back( - {top_strs_dense, TensorUse::Evaluate, data_reader_eval_tk->get_dense_tensors()[i]}); - } else { - tensor_entries_list[i].push_back( - {top_strs_dense, TensorUse::Train, data_reader_tk->get_dense_tensors()[i]}); - tensor_entries_list[i].push_back( - {top_strs_dense, TensorUse::Evaluate, data_reader_eval_tk->get_dense_tensors()[i]}); - } - } - - for (unsigned int i = 0; i < j_sparse.size(); i++) { - const auto& sparse_input = sparse_input_map.find(sparse_names[i]); - sparse_input->second.train_row_offsets = data_reader_tk->get_row_offsets_tensors(i); - sparse_input->second.train_values = data_reader_tk->get_value_tensors(i); - sparse_input->second.train_nnz = data_reader_tk->get_nnz_array(i); - sparse_input->second.evaluate_row_offsets = data_reader_eval_tk->get_row_offsets_tensors(i); - sparse_input->second.evaluate_values = data_reader_eval_tk->get_value_tensors(i); - sparse_input->second.evaluate_nnz = data_reader_eval_tk->get_nnz_array(i); - } + create_datareader()(j, sparse_input_map, tensor_entries_list, data_reader, + data_reader_eval, batch_size, batch_size_, use_mixed_precision, + repeat_dataset_, resource_manager); } // Create Embedding @@ -1402,13 +138,13 @@ static void create_pipeline_internal(std::shared_ptr& data_reader, } if (use_mixed_precision) { - create_embedding(sparse_input_map, tensor_entries_list, embedding, - embedding_type, config, resource_manager, batch_size, - batch_size_eval, use_mixed_precision, scaler, j); + create_embedding()( + sparse_input_map, tensor_entries_list, embedding, embedding_type, config, + resource_manager, batch_size, batch_size_eval, use_mixed_precision, scaler, j); } else { - create_embedding(sparse_input_map, tensor_entries_list, embedding, - embedding_type, config, resource_manager, batch_size, - batch_size_eval, use_mixed_precision, scaler, j); + create_embedding()(sparse_input_map, tensor_entries_list, embedding, + embedding_type, config, resource_manager, batch_size, + batch_size_eval, use_mixed_precision, scaler, j); } } // for () } // Create Embedding @@ -1419,10 +155,10 @@ static void create_pipeline_internal(std::shared_ptr& data_reader, CK_THROW_(Error_t::WrongInput, "0 != batch_size\%total_gpu_count"); } for (size_t i = 0; i < resource_manager->get_local_gpu_count(); i++) { - network.emplace_back(create_network(j_layers_array, j_optimizer, tensor_entries_list[i], - total_gpu_count, resource_manager->get_local_cpu(), - resource_manager->get_local_gpu(i), use_mixed_precision, - scaler, use_algorithm_search, use_cuda_graph)); + network.emplace_back(Network::create_network( + j_layers_array, j_optimizer, tensor_entries_list[i], total_gpu_count, + resource_manager->get_local_cpu(), resource_manager->get_local_gpu(i), + use_mixed_precision, scaler, use_algorithm_search, use_cuda_graph, false)); } } @@ -1430,10 +166,8 @@ static void create_pipeline_internal(std::shared_ptr& data_reader, std::cerr << rt_err.what() << std::endl; throw; } - } - void Parser::create_pipeline(std::shared_ptr& data_reader, std::shared_ptr& data_reader_eval, std::vector>& embedding, @@ -1441,12 +175,60 @@ void Parser::create_pipeline(std::shared_ptr& data_reader, const std::shared_ptr& resource_manager) { if (i64_input_key_) { create_pipeline_internal(data_reader, data_reader_eval, embedding, network, - resource_manager, *this); + resource_manager); } else { create_pipeline_internal(data_reader, data_reader_eval, embedding, network, - resource_manager, *this); + resource_manager); } } +template +void Parser::create_pipeline_inference(const InferenceParser& inference_parser, Tensor2& dense_input, + std::vector>>& rows, + std::vector>>& embeddingvecs, + std::vector& embedding_table_slot_size, + std::vector>* embeddings, + Network** network, + const std::shared_ptr resource_manager) { + //std::vector tensor_entries; + + auto j_layers_array = get_json(config_, "layers"); + + auto input_buffer = GeneralBuffer2::create(); + + { + const nlohmann::json& j_data = j_layers_array[0]; + auto j_dense = get_json(j_data, "dense"); + auto top_strs_dense = get_value_from_json(j_dense, "top"); + auto dense_dim = get_value_from_json(j_dense, "dense_dim"); + + input_buffer->reserve({inference_parser.max_batchsize, dense_dim}, &dense_input); + tensor_entries.push_back({top_strs_dense, TensorUse::General, dense_input.shrink()}); + } + + create_embedding()(inference_parser, j_layers_array, rows, embeddingvecs, embedding_table_slot_size, &tensor_entries, + embeddings, resource_manager->get_local_gpu(0), input_buffer); + input_buffer->allocate(); + + //create network + *network = Network::create_network( + j_layers_array, "", tensor_entries, 1, resource_manager->get_local_cpu(), + resource_manager->get_local_gpu(0), inference_parser.use_mixed_precision, inference_parser.scaler, false, inference_parser.use_cuda_graph, true); +} + +void Parser::create_pipeline(const InferenceParser& inference_parser, Tensor2& dense_input, + std::vector>>& rows, + std::vector>>& embeddingvecs, + std::vector& embedding_table_slot_size, + std::vector>* embeddings, Network** network, + const std::shared_ptr resource_manager) { + if (inference_parser.use_mixed_precision) { + create_pipeline_inference<__half>(inference_parser, dense_input, rows, embeddingvecs, embedding_table_slot_size, embeddings, network, + resource_manager); + } else { + create_pipeline_inference(inference_parser, dense_input, rows, embeddingvecs, embedding_table_slot_size, embeddings, network, + resource_manager); + } +} } // namespace HugeCTR