From 038c53872f20b533ae4259c9307112c232d8801a Mon Sep 17 00:00:00 2001
From: Minseok Lee
Date: Mon, 11 Sep 2023 07:32:47 -0700
Subject: [PATCH] Remove the offline inference code
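This removes the standalone offline inference stack: the GPU InferenceSession
and InferenceModel, the CPU reference pipeline under HugeCTR/src/cpu and
HugeCTR/src/inference, the legacy inference parsers, and the associated tests
and notebooks. The Hierarchical Parameter Server (HPS) remains the supported
embedding-serving path, with its Python bindings consolidated in the new
HugeCTR/include/pybind/hps_wrapper.hpp.

For scripts that drove offline inference through the removed Python APIs, a
rough migration sketch follows. The symbol names below are illustrative
assumptions, not the verbatim bindings; the authoritative interface is
whatever hps_wrapper.hpp exports.

    # Hypothetical sketch -- module and method names are assumptions.
    from hugectr import hps                  # assumed module path
    ps = hps.HPS("ps.json")                  # same ps.json layout as before
    vecs = ps.lookup(keys, "dcn", 0)         # embedding lookup only; the dense
                                             # forward pass moves to the HPS
                                             # backend / ONNX path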
---
 .gitlab-ci.yml | 16 +-
 CMakeLists.txt | 11 +-
 HugeCTR/include/core23_network.hpp | 11 -
 HugeCTR/include/cpu/create_embedding_cpu.hpp | 35 -
 HugeCTR/include/cpu/create_pipeline_cpu.hpp | 43 -
 .../cpu/embedding_feature_combiner_cpu.hpp | 81 -
 HugeCTR/include/cpu/inference_session_cpu.hpp | 64 -
 HugeCTR/include/cpu/layer_cpu.hpp | 60 -
 HugeCTR/include/cpu/layers/add_layer_cpu.hpp | 75 -
 .../cpu/layers/batch_norm_layer_cpu.hpp | 95 -
 HugeCTR/include/cpu/layers/cast_layer_cpu.hpp | 33 -
 .../include/cpu/layers/concat_layer_cpu.hpp | 89 -
 .../include/cpu/layers/dropout_layer_cpu.hpp | 74 -
 .../cpu/layers/element_wise_function_cpu.hpp | 98 -
 .../layers/elementwise_multiply_layer_cpu.hpp | 69 -
 HugeCTR/include/cpu/layers/elu_layer_cpu.hpp | 68 -
 .../cpu/layers/fm_order2_layer_cpu.hpp | 75 -
 .../cpu/layers/fully_connected_layer_cpu.hpp | 85 -
 .../layers/fully_connected_layer_half_cpu.hpp | 89 -
 .../fused_fully_connected_layer_cpu.hpp | 92 -
 .../cpu/layers/interaction_layer_cpu.hpp | 78 -
 .../cpu/layers/multi_cross_layer_cpu.hpp | 68 -
 .../cpu/layers/reduce_sum_layer_cpu.hpp | 74 -
 HugeCTR/include/cpu/layers/relu_layer_cpu.hpp | 57 -
 .../include/cpu/layers/reshape_layer_cpu.hpp | 97 -
 .../include/cpu/layers/sigmoid_layer_cpu.hpp | 57 -
 .../include/cpu/layers/slice_layer_cpu.hpp | 81 -
 .../cpu/layers/weight_multiply_layer_cpu.hpp | 83 -
 HugeCTR/include/cpu/network_cpu.hpp | 111 --
 .../inference/embedding_feature_combiner.hpp | 85 -
 .../include/inference/inference_session.hpp | 94 -
 .../inference/inference_session_base.hpp | 43 -
 HugeCTR/include/network.hpp | 25 -
 HugeCTR/include/parser.hpp | 84 -
 HugeCTR/include/pybind/hps_wrapper.hpp | 128 ++
 HugeCTR/include/pybind/inference_model.hpp | 93 -
 HugeCTR/include/pybind/inference_wrapper.hpp | 661 ------
 HugeCTR/src/CMakeLists.txt | 1 +
 HugeCTR/src/core23_network.cpp | 44 -
 HugeCTR/src/cpu/CMakeLists.txt | 55 -
 HugeCTR/src/cpu/create_embedding_cpu.cpp | 101 -
 HugeCTR/src/cpu/create_network_cpu.cpp | 688 ------
 HugeCTR/src/cpu/create_pipeline_cpu.cpp | 94 -
 .../cpu/embedding_feature_combiner_cpu.cpp | 137 --
 HugeCTR/src/cpu/inference_session_cpu.cpp | 155 --
 HugeCTR/src/cpu/layers/add_layer_cpu.cpp | 126 --
 .../src/cpu/layers/batch_norm_layer_cpu.cpp | 239 ---
 HugeCTR/src/cpu/layers/cast_layer_cpu.cpp | 62 -
 HugeCTR/src/cpu/layers/concat_layer_cpu.cpp | 119 --
 HugeCTR/src/cpu/layers/dropout_layer_cpu.cpp | 84 -
 .../layers/elementwise_multiply_layer_cpu.cpp | 123 --
 HugeCTR/src/cpu/layers/elu_layer_cpu.cpp | 71 -
 .../src/cpu/layers/fm_order2_layer_cpu.cpp | 150 --
 .../cpu/layers/fully_connected_layer_cpu.cpp | 131 --
 .../layers/fully_connected_layer_half_cpu.cpp | 136 --
 .../fused_fully_connected_layer_cpu.cpp | 154 --
 .../src/cpu/layers/interaction_layer_cpu.cpp | 201 --
 .../src/cpu/layers/multi_cross_layer_cpu.cpp | 194 --
 .../src/cpu/layers/reduce_sum_layer_cpu.cpp | 185 --
 HugeCTR/src/cpu/layers/relu_layer_cpu.cpp | 73 -
 HugeCTR/src/cpu/layers/reshape_layer_cpu.cpp | 147 --
 HugeCTR/src/cpu/layers/sigmoid_layer_cpu.cpp | 81 -
 HugeCTR/src/cpu/layers/slice_layer_cpu.cpp | 126 --
 .../cpu/layers/weight_multiply_layer_cpu.cpp | 130 --
 HugeCTR/src/cpu/network_cpu.cpp | 64 -
 HugeCTR/src/inference/CMakeLists.txt | 133 --
 .../inference/embedding_feature_combiner.cu | 254 ---
 HugeCTR/src/inference/inference_session.cpp | 310 ---
 HugeCTR/src/inference_benchmark/metrics.cpp | 1 -
 HugeCTR/src/network.cpp | 43 -
 HugeCTR/src/optimizers/sparse_optimizer.cu | 1 -
 HugeCTR/src/parsers/create_datareader.cpp | 246 ---
 HugeCTR/src/parsers/create_network.cpp | 1696 -----------------
 HugeCTR/src/parsers/create_optimizer.cpp | 121 --
 HugeCTR/src/parsers/inference_parser.cpp | 278 ---
 HugeCTR/src/pybind/inference_model.cpp | 388 ----
 HugeCTR/src/pybind/model.cpp | 8 +-
 HugeCTR/src/pybind/module_main.cpp | 2 -
 ci/benchmark/hps_memory_check/test.sh | 6 +-
 ci/benchmark/inference_benchmark/ci.yml | 8 +-
 ci/benchmark/inference_benchmark/test.sh | 4 +-
 ci/common/config_pbtxt_template.txt | 11 +-
 ci/common/generate_inference_config.py | 2 +-
 ci/common/ps_template.json | 19 +-
 ci/dracorno/ci.yml | 12 -
 .../inference/inference_model.sub | 17 -
 .../inference/inference_session.sub | 7 -
 ci/selene/ci.yml | 35 -
 ci/template.yml | 6 +-
 notebooks/README.md | 5 +-
 notebooks/multi_gpu_offline_inference.ipynb | 633 ------
 .../inference_model/cross_entropy_loss.py | 187 --
 .../inference_model/dcn_multi_hot.py | 203 --
 test/inference/inference_model/dcn_one_hot.py | 203 --
 .../inference_model/dlrm_mlp_one_hot.py | 158 --
 .../inference_model/mmoe_inference.py | 46 -
 .../multi_cross_entropy_loss.py | 191 --
 .../inference_model/synthetic_multi_hot.py | 195 --
 .../inference_model/wdl_multi_hot.py | 202 --
 test/inference/inference_model/wdl_one_hot.py | 202 --
 .../inference_session/dcn_inference.py | 73 -
 .../movielens_nodense_test.py | 110 --
 .../inference_session/wdl_multitable_test.py | 157 --
 test/notebook_test/notebook_hugectr.py | 11 -
 .../hugectr2onnx_dcn_test.py | 23 +-
 .../hugectr2onnx_din_test.py | 49 +-
 .../hugectr2onnx_mmoe_test.py | 26 +-
 .../hugectr2onnx_ncf_test.py | 22 +-
 .../hugectr2onnx_wdl_test.py | 20 +-
 test/onnx_converter_test/train_scripts/dcn.py | 11 +-
 .../train_scripts/deepfm.py | 11 +-
 .../train_scripts/din_parquet.py | 9 +-
 .../train_scripts/din_try.py | 9 +-
 .../onnx_converter_test/train_scripts/dlrm.py | 11 +-
 .../train_scripts/dlrm_mlp.py | 11 +-
 test/onnx_converter_test/train_scripts/gmf.py | 11 +-
 .../train_scripts/mmoe_parquet.py | 13 +-
 test/onnx_converter_test/train_scripts/ncf.py | 11 +-
 .../train_scripts/neumf.py | 11 +-
 test/onnx_converter_test/train_scripts/wdl.py | 11 +-
 test/pybind_test/wdl_fp16_8gpu.py | 2 +-
 test/utest/inference/CMakeLists.txt | 29 -
 test/utest/inference/cpu_inference_test.cpp | 345 ----
 .../inference/cpu_multicross_layer_test.cpp | 226 ---
 .../embedding_feature_combiner_test.cpp | 189 --
 .../inference/preallocated_buffer2_test.cpp | 139 --
 .../inference/session_inference_test.cpp | 383 ----
 127 files changed, 279 insertions(+), 14530 deletions(-)
 delete mode 100644 HugeCTR/include/cpu/create_embedding_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/create_pipeline_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/embedding_feature_combiner_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/inference_session_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/add_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/batch_norm_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/cast_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/concat_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/dropout_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/element_wise_function_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/elementwise_multiply_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/elu_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/fm_order2_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/fully_connected_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/fully_connected_layer_half_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/fused_fully_connected_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/interaction_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/multi_cross_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/reduce_sum_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/relu_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/reshape_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/sigmoid_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/slice_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/layers/weight_multiply_layer_cpu.hpp
 delete mode 100644 HugeCTR/include/cpu/network_cpu.hpp
 delete mode 100644 HugeCTR/include/inference/embedding_feature_combiner.hpp
 delete mode 100644 HugeCTR/include/inference/inference_session.hpp
 delete mode 100644 HugeCTR/include/inference/inference_session_base.hpp
 delete mode 100644 HugeCTR/include/pybind/inference_model.hpp
 delete mode 100644 HugeCTR/include/pybind/inference_wrapper.hpp
 delete mode 100644 HugeCTR/src/cpu/CMakeLists.txt
 delete mode 100644 HugeCTR/src/cpu/create_embedding_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/create_network_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/create_pipeline_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/embedding_feature_combiner_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/inference_session_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/add_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/cast_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/concat_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/dropout_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/elementwise_multiply_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/elu_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/fully_connected_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/multi_cross_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/reduce_sum_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/relu_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/reshape_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/sigmoid_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/slice_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/layers/weight_multiply_layer_cpu.cpp
 delete mode 100644 HugeCTR/src/cpu/network_cpu.cpp
 delete mode 100644 HugeCTR/src/inference/CMakeLists.txt
 delete mode 100644 HugeCTR/src/inference/embedding_feature_combiner.cu
 delete mode 100644 HugeCTR/src/inference/inference_session.cpp
 delete mode 100644 HugeCTR/src/parsers/create_datareader.cpp
 delete mode 100644 HugeCTR/src/parsers/create_network.cpp
 delete mode 100644 HugeCTR/src/parsers/create_optimizer.cpp
 delete mode 100644 HugeCTR/src/parsers/inference_parser.cpp
 delete mode 100644 HugeCTR/src/pybind/inference_model.cpp
 delete mode 100644 ci/integration_test/inference/inference_model.sub
 delete mode 100755 ci/integration_test/inference/inference_session.sub
 delete mode 100755 notebooks/multi_gpu_offline_inference.ipynb
 delete mode 100644 test/inference/inference_model/cross_entropy_loss.py
 delete mode 100644 test/inference/inference_model/dcn_multi_hot.py
 delete mode 100644 test/inference/inference_model/dcn_one_hot.py
 delete mode 100644 test/inference/inference_model/dlrm_mlp_one_hot.py
 delete mode 100644 test/inference/inference_model/mmoe_inference.py
 delete mode 100644 test/inference/inference_model/multi_cross_entropy_loss.py
 delete mode 100644 test/inference/inference_model/synthetic_multi_hot.py
 delete mode 100644 test/inference/inference_model/wdl_multi_hot.py
 delete mode 100644 test/inference/inference_model/wdl_one_hot.py
 delete mode 100644 test/inference/inference_session/dcn_inference.py
 delete mode 100644 test/inference/inference_session/movielens_nodense_test.py
 delete mode 100755 test/inference/inference_session/wdl_multitable_test.py
 delete mode 100755 test/utest/inference/CMakeLists.txt
 delete mode 100644 test/utest/inference/cpu_inference_test.cpp
 delete mode 100644 test/utest/inference/cpu_multicross_layer_test.cpp
 delete mode 100644 test/utest/inference/embedding_feature_combiner_test.cpp
 delete mode 100644 test/utest/inference/preallocated_buffer2_test.cpp
 delete mode 100644 test/utest/inference/session_inference_test.cpp

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 64f2105809..df0f9f0eed 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -157,7 +157,8 @@ build_inference:
     DST_IMAGE: $INFER_IMAGE_VERSIONED
     CMAKE_OPTION: "-DENABLE_INFERENCE=ON -DCMAKE_BUILD_TYPE=Release -DSM=\"70;75;80;90\" -DCLANGFORMAT=OFF"
     BUILD_HUGECTR: 1
-    BUILD_HUGECTR_BACKEND: 1
+    BUILD_HPS_BACKEND: 1
+    HUGECTR_BACKEND_VER: main
     TRITON_BRANCH: ${TARGET_TRITON_BRANCH}
 
 build_sok_tf2:
@@ -530,19 +531,6 @@ e2e_nvt_regression_test:
     DGXNNODES: 1
     TEST_CMD: ./ci/integration_test/nvt/nvt_regression_test.sub
 
-notebook_hugectr:
-  extends: .cluster_test_job_daily
-  needs:
-    - build_train_single_node
-  variables:
-    GPFSFOLDER: $LOGDIR/notebook_hugectr
-    GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE}
-    CONT: $TRAIN_IMAGE_VERSIONED
-    MOUNTS: /lustre/fsw/devtech/hpc-hugectr/criteo_1TB/day_0:/workdir/tools/day_0,/lustre/fsw/devtech/hpc-hugectr/criteo_1TB/day_1:/workdir/tools/day_1
-    WALLTIME: "01:00:00"
-    DGXNNODES: 1
-    TEST_CMD: ./ci/integration_test/notebooks/notebook_hugectr.sub
-
 nb_hps_demo:
   extends: .cluster_test_job_daily
   needs:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c94ca98374..d6b4978d6d 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -117,6 +117,13 @@ find_package(OpenMP REQUIRED)
 find_package(Threads)
 
 option(ENABLE_MULTINODES "Enable multi-nodes training" OFF)
+option(ENABLE_INFERENCE "Enable Inference" OFF)
+
+if(ENABLE_MULTINODES AND ENABLE_INFERENCE)
+  message(WARNING "Inference can only be enabled with the multi-node mode off. Set ENABLE_MULTINODES=OFF")
+  set(ENABLE_MULTINODES OFF)
+endif()
+
 if(ENABLE_MULTINODES)
   message(STATUS "Multi Node Enabled")
   find_package(MPI)
@@ -141,7 +148,6 @@ if (KEY_HIT_RATIO)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DKEY_HIT_RATIO")
 endif()
 
-option(ENABLE_INFERENCE "Enable Inference" OFF)
 if(ENABLE_INFERENCE)
   add_definitions(-DLIBCUDACXX_VERSION)
 endif()
@@ -324,11 +330,8 @@ if(ENABLE_INFERENCE)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DENABLE_INFERENCE")
   add_subdirectory(HugeCTR/core23)
   add_subdirectory(HugeCTR/src/hps)
-  add_subdirectory(HugeCTR/src/inference)
   add_subdirectory(HugeCTR/src/inference_benchmark)
-  add_subdirectory(HugeCTR/src/cpu)
   add_subdirectory(test/utest/hps)
-  add_subdirectory(test/utest/inference)
 else()
   #setting binary files install path
   add_subdirectory(HugeCTR/src)
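For reference, the two build flavors that this new guard keeps mutually
exclusive are configured roughly as follows. The flag values mirror the
CMAKE_OPTION used in the .gitlab-ci.yml hunk above; the SM list and build
directory are illustrative.

    # training build -- multi-node allowed
    cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_MULTINODES=ON ..
    # inference build -- the guard above forces ENABLE_MULTINODES=OFF
    cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON ..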
diff --git a/HugeCTR/include/core23_network.hpp b/HugeCTR/include/core23_network.hpp
index 12d9c73edd..19fe72e886 100644
--- a/HugeCTR/include/core23_network.hpp
+++ b/HugeCTR/include/core23_network.hpp
@@ -111,17 +111,6 @@ class Core23TempNetwork final {
    */
   void upload_params_to_device(const std::string& model_file);
 
-  /**
-   * Read parameters from model_file.
-   */
-  void upload_params_to_device_inference(const std::string& model_file);
-
-  /**
-   * Read non-trainable parameters from model_file, e.g., running mean and running variable for
-   * BatchNorm
-   */
-  void upload_non_trainable_params_to_device_inference(const std::string& model_file);
-
   /**
    * Writing parameters to cpu buffer.
    */
diff --git a/HugeCTR/include/cpu/create_embedding_cpu.hpp b/HugeCTR/include/cpu/create_embedding_cpu.hpp
deleted file mode 100644
index 938663543b..0000000000
--- a/HugeCTR/include/cpu/create_embedding_cpu.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-template
-struct create_embedding_cpu {
-  void operator()(const InferenceParams& inference_params, const nlohmann::json& j_layers_array,
-                  std::vector>>& rows,
-                  std::vector>>& embeddingvecs,
-                  std::vector& embedding_table_slot_size,
-                  std::vector* tensor_entries,
-                  std::vector>* embeddings,
-                  std::shared_ptr>& blobs_buff);
-};
-
-} // namespace HugeCTR
\ No newline at end of file
diff --git a/HugeCTR/include/cpu/create_pipeline_cpu.hpp b/HugeCTR/include/cpu/create_pipeline_cpu.hpp
deleted file mode 100644
index 8f2ea65b28..0000000000
--- a/HugeCTR/include/cpu/create_pipeline_cpu.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -namespace HugeCTR { - -void create_pipeline_cpu(const nlohmann::json& config, std::map tensor_active, - const InferenceParams& inference_params, Tensor2& dense_input, - std::vector>>& rows, - std::vector>>& embeddingvecs, - std::vector& embedding_table_slot_size, - std::vector>* embeddings, NetworkCPU** network, - const std::shared_ptr& cpu_resource); - -template -void create_pipeline_inference_cpu(const nlohmann::json& config, - const InferenceParams& inference_params, - Tensor2& dense_input, - std::vector>>& rows, - std::vector>>& embeddingvecs, - std::vector& embedding_table_slot_size, - std::vector>* embeddings, - NetworkCPU** network, - const std::shared_ptr& cpu_resource); - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/cpu/embedding_feature_combiner_cpu.hpp b/HugeCTR/include/cpu/embedding_feature_combiner_cpu.hpp deleted file mode 100644 index 53e6fe4df5..0000000000 --- a/HugeCTR/include/cpu/embedding_feature_combiner_cpu.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { - -enum class EmbeddingFeatureCombiner_t { Sum, Mean }; - -/** - * Combine the embedding feature vectors by Sum or Mean - * according to slot_num and row_ptrs - */ -template -class EmbeddingFeatureCombinerCPU : public LayerCPU { - /* - * stores the references to the input tensors of this layer. - */ - std::vector>> in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - /* - * stores the references to the row pointers tensors of this layer. - */ - std::vector>> row_ptrs_tensors_; - - public: - /** - * Ctor of EmbeddingFeatureCombiner. 
- * @param in_tensor the embedding feature tensor, must be 2D - * @param row_ptrs_tensor row pointers tensor, should be 1D (batch_size*slot_num+1,), which - * indicate which adjacent vectors belong to the same slot (i.e., feature field) - * @param out_tensor the resulting output tensor, should be 3D (batch_size, slot_num, - * embedding_vec_size) - * @param batch_size batch size - * @param slot_num slot number - * @param combiner_type combiner type for the features in the same slot, Sum or Mean - * @param blobs_buff GeneralBuffer used to create the output tensor - * @param gpu_resource available gpu resource - */ - EmbeddingFeatureCombinerCPU(const std::shared_ptr>& in_tensor, - const std::shared_ptr>& row_ptrs_tensor, - Tensor2& out_tensor, int batch_size, int slot_num, - EmbeddingFeatureCombiner_t combiner_type, - const std::shared_ptr>& blobs_buff); - ~EmbeddingFeatureCombinerCPU(){}; - - /** - * EmbeddingFeatureCombiner's combine operation - */ - void fprop(bool is_train = false) override; - - void bprop() override { - HCTR_OWN_THROW(Error_t::IllegalCall, - "The bprop() of EmbeddingFeatureCombiner is not implemented!"); - } - - private: - int batch_size_; - int slot_num_; - int embedding_vec_size_; - EmbeddingFeatureCombiner_t combiner_type_; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/inference_session_cpu.hpp b/HugeCTR/include/cpu/inference_session_cpu.hpp deleted file mode 100644 index 4cb6a2d2f5..0000000000 --- a/HugeCTR/include/cpu/inference_session_cpu.hpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -template -class InferenceSessionCPU { - private: - nlohmann::json config_; - std::string model_name_; - std::vector embedding_table_slot_size_; - - std::vector>> row_ptrs_tensors_; - std::vector>> embedding_features_tensors_; - Tensor2 dense_input_tensor_; - - std::vector> embedding_feature_combiners_; - std::unique_ptr network_; - std::shared_ptr parameter_server_; - - void* h_keys_; - float* h_embedding_vectors_; - - std::shared_ptr cpu_resource_; - - protected: - InferenceParser inference_parser_; - InferenceParams inference_params_; - - public: - InferenceSessionCPU(const std::string& model_config_path, const InferenceParams& inference_params, - const std::shared_ptr& parameter_server); - virtual ~InferenceSessionCPU(); - void predict(float* h_dense, void* h_embeddingcolumns, int* h_row_ptrs, float* h_output, - int num_samples); -}; - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/cpu/layer_cpu.hpp b/HugeCTR/include/cpu/layer_cpu.hpp deleted file mode 100644 index e27173b14c..0000000000 --- a/HugeCTR/include/cpu/layer_cpu.hpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { -/** - * @brief - * Definition of a basic layer class. - */ -class LayerCPU { - protected: - /* - * stores the weight tensors of this layer. - */ - Tensors2 weights_; - - public: - /* - * Forward pass - * @param stream: the CUDA stream that the forward function will be executed on. - */ - virtual void fprop(bool is_train) = 0; - /* - * Backward pass - * @param stream: the CUDA stream that the forward function will be executed on. - */ - virtual void bprop() = 0; - - LayerCPU() {} - LayerCPU(const LayerCPU&) = delete; - LayerCPU& operator=(const LayerCPU&) = delete; - virtual ~LayerCPU() = default; - - /* - * Some of the layers requires initialize like fully connected layer - */ - virtual void initialize() {} -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/add_layer_cpu.hpp b/HugeCTR/include/cpu/layers/add_layer_cpu.hpp deleted file mode 100644 index cff79da1da..0000000000 --- a/HugeCTR/include/cpu/layers/add_layer_cpu.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace HugeCTR { - -/** - * Layer which does element-wise add by input tensors. - * All the input tensors should have the same shape. - */ -template -class AddLayerCPU : public LayerCPU { - /* - * stores the weight tensors of this layer. - */ - // Tensors weights_; It is inherited from Layer. - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - public: - /** - * Ctor of AddLayer. 
- * @param in_tensor the input tensor - * @param out_tensor the resulting output tensor - * @param device_id the id of GPU where this layer belongs - */ - AddLayerCPU(const Tensors2& in_tensors, const Tensor2& out_tensor, - const std::shared_ptr>& blobs_buff); - - void initialize() override; - - /** - * AddLayer's forward propagation - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - /** - * AddLayer's backward propagation - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - - private: - int size_; - size_t num_; - Tensor2 h_inputs_; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/batch_norm_layer_cpu.hpp b/HugeCTR/include/cpu/layers/batch_norm_layer_cpu.hpp deleted file mode 100644 index 6646d1c0f5..0000000000 --- a/HugeCTR/include/cpu/layers/batch_norm_layer_cpu.hpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { - -/** - * BatchNorm layer based on cuDNN - */ -template -class BatchNormLayerCPU : public LayerCPU { - /* - * stores the weight tensors of this layer. - */ - // Tensors weights_; It is inherited from Layer, and named as weights_; - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - public: - /** - * BatchNorm parameters - */ - struct Params { - double factor; /**< moving average computation factor*/ - double eps; /**< small value to avoid divide-by-zero error*/ - }; - - /** - * Ctor of BatchNormLayer. 
- * @param weight_buff weight buffer for internal gamma/beta tensors - * @param wgrad_buff gradient buffer for internal gamma/beta tensors - * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param params BatchNorm parameters - * @param cudnn_handle cuDNN handle created externally - * @param device_id the id of GPU where this layer belongs - */ - BatchNormLayerCPU(const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, - const Params& params); - ~BatchNormLayerCPU() override; - - void initialize() override; - - /** - * A method of implementing the forward pass of BatchNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - - /** - * A method of implementing the forward pass of BatchNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - - private: - const Params params_; - - // these four pointers are just for convenience - // they are deleted by Layer d'tor through the other pointer aliases: weight_ and wgrad_ - Tensor2 gamma_; - Tensor2 beta_; - Tensor2 gamma_grad_; - Tensor2 beta_grad_; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/cast_layer_cpu.hpp b/HugeCTR/include/cpu/layers/cast_layer_cpu.hpp deleted file mode 100644 index b7b1e0b1e0..0000000000 --- a/HugeCTR/include/cpu/layers/cast_layer_cpu.hpp +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { - -template -class CastLayerCPU : public LayerCPU { - Tensor2 bottom_tensor_; - Tensor2 top_tensor_; - - public: - CastLayerCPU(const Tensor2& bottom_tensor, const Tensor2& top_tensor); - void fprop(bool is_train) override; - void bprop() override; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/concat_layer_cpu.hpp b/HugeCTR/include/cpu/layers/concat_layer_cpu.hpp deleted file mode 100644 index 457e5a495a..0000000000 --- a/HugeCTR/include/cpu/layers/concat_layer_cpu.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace HugeCTR { - -/** - * Layer which merges the multiple 2D input tensors to a single 2D output tensor. 
- * The input tensors and the resulting output tensor must have the same dimensionallity. - * Only the innermost dimension is expanded by concatenating those of the input tensors. - * e.g., 3X(batch_size, n_slots * vector_length) to (batch_size, 3 * n_slots * vector_length), - * e.g., (batch_size, a * vector_length) + (batch_size, b * vector_length) - * to (batch_size, (a + b) * vector_length) - */ -template -class ConcatLayerCPU : public LayerCPU { - /* - * stores the weight tensors of this layer. - */ - Tensors2 weights_; - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensor2 out_tensor_; - - void prop_common(bool forward, Tensors2& in_tensors, cudaStream_t stream, size_t n_sms); - template - void kernel_launch(bool forward, cudaStream_t stream, size_t n_sms, Args&... args); - - Tensors2& get_in_tensors(bool is_train); - - public: - struct InParam { - T* in; - const int in_w; - }; - - /** - * Ctor of ConcatLayer. - * @param in_tensors the vector of the input tensors - * @param out_tensor the resulting output tensor - * @param blobs_buff GeneralBuffer used to create the output tensor - * @param device_id the id of GPU where this layer belongs - */ - ConcatLayerCPU(const Tensors2& in_tensors, Tensor2& out_tensor, - const std::shared_ptr>& blobs_buff); - ~ConcatLayerCPU() override{}; - - /** - * Concat's forward pass to gather data to the output tensor - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - /** - * Concat's backward pass to scatter data to the input tensors - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - - private: - std::vector set_in_params(Tensors2& in_tensors, int n); - Tensor2 h_inputs_; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/dropout_layer_cpu.hpp b/HugeCTR/include/cpu/layers/dropout_layer_cpu.hpp deleted file mode 100644 index 43977307ce..0000000000 --- a/HugeCTR/include/cpu/layers/dropout_layer_cpu.hpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { - -/** - * Dropout layer which selects an arbitrary fraction of inputs to 0 - */ -template -class DropoutLayerCPU : public LayerCPU { - /* - * stores the weight tensors of this layer. - */ - // Tensors weights_; It is inherited from Layer. - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - public: - /** - * Ctor of DropoutLayer. 
- * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param rate fraction of the inputs set to zero., 0 < rate < 1, default = 0.5 - * @param device_id the id of GPU where this layer belongs - */ - DropoutLayerCPU(const Tensor2& in_tensor, const Tensor2& out_tensor, - const std::shared_ptr> blobs_buff, float rate); - - /** - * A method of implementing the forward pass of Dropout - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - /** - * A method of implementing the backward pass of Dropout - * @param stream CUDA stream where the backward propagation is executed - */ - void bprop() override; - - const float* mask() const { return mask_.get_ptr(); } - - private: - float rate_; - float scale_; - Tensor2 mask_; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/element_wise_function_cpu.hpp b/HugeCTR/include/cpu/layers/element_wise_function_cpu.hpp deleted file mode 100644 index 15d1344ee6..0000000000 --- a/HugeCTR/include/cpu/layers/element_wise_function_cpu.hpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -namespace HugeCTR { -namespace internal { - -template -void forward_element_wise_cpu(const float* in, float* out, int len, Fop fop) { - for (int i = 0; i < len; i++) { - out[i] = fop(in[i]); - } -} - -template -void backward_element_wise_cpu(const float* h_out, float* h_in, int len, Bop bop) { - for (int i = 0; i < len; i++) { - h_in[i] = bop(h_out[i], h_in[i]); - } -} - -/** - * Common implementation for the element wise layers such as Relu and Elu. - * Their fprop/brop are just the wrapperw of forward_evaluate/backward_evaluate, - * while passing the simple scalar lambda operations to them. - * All the other element wise layers can be implemented in the similar way. - */ -class ElementWiseFunctorCPU { - public: - /** - * Ctor of ElementWiseFunctor. Copy construction and assignment are disabled. - */ - ElementWiseFunctorCPU() {} - ElementWiseFunctorCPU(const ElementWiseFunctorCPU&) = delete; - ElementWiseFunctorCPU& operator=(const ElementWiseFunctorCPU&) = delete; - - /** - * D'tor of ElementWiseFunctor. 
- */ - ~ElementWiseFunctorCPU() {} - - /** - * A method of implementing the element-wise forward pass - * @tparam Fop the type of simple scalar lambda operation - * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param device_id the id of GPU where this operation is handled - * @param fop Fop lambda object to do the operation per element - */ - template - void forward_evaluate(const Tensor2& in_tensor, Tensor2& out_tensor, int device_id, - Fop fop) { - const float* in = in_tensor.get_ptr(); - float* out = out_tensor.get_ptr(); - - const int len = in_tensor.get_num_elements(); - forward_element_wise_cpu(in, out, len, fop); - } - - /** - * A method of implementing the element-wise backward pass - * @tparam Bop the type of simple scalar lambda operation - * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param device_id the id of GPU where this operation is handled - * @param bop Bop lambda object to do the operation per element - */ - template - void backward_evaluate(Tensor2& in_tensor, const Tensor2& out_tensor, int device_id, - Bop bop) { - float* h_in = in_tensor.get_ptr(); - const float* h_out = out_tensor.get_ptr(); - - const int len = in_tensor.get_num_elements(); - forward_element_wise_cpu(h_out, h_in, len, bop); - } -}; - -} // namespace internal -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/elementwise_multiply_layer_cpu.hpp b/HugeCTR/include/cpu/layers/elementwise_multiply_layer_cpu.hpp deleted file mode 100644 index a2b4a52a0c..0000000000 --- a/HugeCTR/include/cpu/layers/elementwise_multiply_layer_cpu.hpp +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace HugeCTR { - -/** - * Layer which does element-wise dot product by input tensors. - * All the input tensors should have the same shape. - */ -template -class ElementwiseMultiplyLayerCPU : public LayerCPU { - public: - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - /** - * Ctor of ElementwiseMultiplyLayer. 
- * @param in_tensor the input tensor - * @param out_tensor the resulting output tensor - * @param device_id the id of GPU where this layer belongs - */ - ElementwiseMultiplyLayerCPU(const Tensors2& in_tensors, const Tensor2& out_tensor, - const std::shared_ptr>& blobs_buff); - - void initialize() override; - - /** - * ElementwiseMultiplyLayer's forward propagation - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - /** - * ElementwiseMultiplyLayer's backward propagation - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - - private: - int size_; - size_t num_; - Tensor2 h_inputs_; - bool initialized_{false}; - Tensor2 fprop_output_; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/elu_layer_cpu.hpp b/HugeCTR/include/cpu/layers/elu_layer_cpu.hpp deleted file mode 100644 index 57ca367a1e..0000000000 --- a/HugeCTR/include/cpu/layers/elu_layer_cpu.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { - -/** - * Elu activation function as a derived class of Layer - */ -template -class EluLayerCPU : public LayerCPU { - /* - * stores the weight tensors of this layer. - */ - Tensors2 weights_; - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - public: - /** - * Ctor of ReluLayer. - * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param device_id the id of GPU where this layer belongs - */ - EluLayerCPU(const Tensor2& in_tensor, const Tensor2& out_tensor, T alpha); - - /** - * A method of implementing the forward pass of Relu - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - /** - * A method of implementing the backward pass of Relu - * @param stream CUDA stream where the backward propagation is executed - */ - void bprop() override; - - private: - T alpha_; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/fm_order2_layer_cpu.hpp b/HugeCTR/include/cpu/layers/fm_order2_layer_cpu.hpp deleted file mode 100644 index ecc6bd7096..0000000000 --- a/HugeCTR/include/cpu/layers/fm_order2_layer_cpu.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { - -/** - * The order2 expression in FM formular(reference paper: - * https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf). - * The layer will be used in DeepFM model to implement the FM order2 - * computation (reference code implemented in Tensorflow: line 92~104, - * https://github.com/ChenglongChen/tensorflow-DeepFM/blob/master/DeepFM.py). - */ -template -class FmOrder2LayerCPU : public LayerCPU { - /* - * stores the weight tensors of this layer. - */ - Tensors2 weights_; - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - public: - /** - * Ctor of FmOrder2Layer. - * @param in_tensor the input tensor - * @param out_tensor the output tensor - * @param device_id the id of GPU where this layer belongs - */ - FmOrder2LayerCPU(const Tensor2& in_tensor, const Tensor2& out_tensor); - - /** - * A method of implementing the forward pass of FmOrder2 - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train); - - /** - * A method of implementing the backward pass of FmOrder2 - * @param stream CUDA stream where the backward propagation is executed - */ - void bprop(); - - private: - int batch_size_; - int slot_num_; - int embedding_vec_size_; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/fully_connected_layer_cpu.hpp b/HugeCTR/include/cpu/layers/fully_connected_layer_cpu.hpp deleted file mode 100644 index 6bc6bf8798..0000000000 --- a/HugeCTR/include/cpu/layers/fully_connected_layer_cpu.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -namespace HugeCTR { - -template -class FullyConnectedLayerCPU; - -/** - * @brief - * This class implements the fully connected layer. - */ -template <> -class FullyConnectedLayerCPU : public LayerCPU { - private: - const bool use_mixed_precision_{false}; - - /* - * stores the weight tensors of this layer. - */ - // Tensors weights_; It is inherited from Layer, and named as weights_; - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. 
- */ - Tensors2 out_tensors_; - - Tensors2& get_in_tensors(bool is_train) { return in_tensors_; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - - /** - * This is the constructor of the FullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. - * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass - * @param in_tensor: stores the input tensor - * @param out_tensor: stores the output tensor - * @param weight_format: specifies the format of the weight tensor, either HW (row major) or WH - * (col-major) - */ - FullyConnectedLayerCPU(const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, - bool use_mixed_precision); - FullyConnectedLayerCPU(const FullyConnectedLayerCPU& C) = delete; - FullyConnectedLayerCPU& operator=(const FullyConnectedLayerCPU&); -}; -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/fully_connected_layer_half_cpu.hpp b/HugeCTR/include/cpu/layers/fully_connected_layer_half_cpu.hpp deleted file mode 100644 index a264509b93..0000000000 --- a/HugeCTR/include/cpu/layers/fully_connected_layer_half_cpu.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -namespace HugeCTR { -/** - * @brief - * This class implements the fully connected layer. - */ -template <> -class FullyConnectedLayerCPU<__half> : public LayerCPU { - /* - * stores the weight tensors for compute of this layer. - */ - // std::vector> weights_; - Tensors2<__half> weights_half_; - - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2<__half> weights_grad_; - - /* - * stores the references to the input tensors of this layer. - */ - Tensor2<__half> bottom_tensor_; - - /* - * stores the references to the output tensors of this layer. - */ - Tensor2<__half> top_tensor_; - - /* - * stores the references to the output tensors of GEMM. - */ - Tensor2<__half> identity_tensor_; - - Tensor2<__half>& get_bottom_tensor(bool is_train) { return bottom_tensor_; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - - /** - * This is the constructor of the FullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. 
- * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass - * @param bottom_tensor: stores the tensor from bottom layer - * @param top_tensor: stores the tensor to top layer - * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH - * (col-major) - */ - FullyConnectedLayerCPU(const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor); - FullyConnectedLayerCPU(const FullyConnectedLayerCPU&) = delete; - FullyConnectedLayerCPU& operator=(const FullyConnectedLayerCPU&); -}; -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/fused_fully_connected_layer_cpu.hpp b/HugeCTR/include/cpu/layers/fused_fully_connected_layer_cpu.hpp deleted file mode 100644 index d90062f199..0000000000 --- a/HugeCTR/include/cpu/layers/fused_fully_connected_layer_cpu.hpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -namespace HugeCTR { -/** - * @brief - * This class implements the fully connected layer. - */ -class FusedFullyConnectedLayerCPU : public LayerCPU { - /* - * stores the weight tensors for compute of this layer. - */ - Tensors2<__half> weights_half_; - - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2<__half> weights_grad_; - - /* - * stores the references to the bottom tensors of this layer. - */ - Tensor2<__half> bottom_tensor_; - - /* - * stores the references to the top tensors of this layer. - */ - Tensor2<__half> top_tensor_; - - /* - * stores the references to the intermediate top tensors of this layer. - */ - Tensor2<__half> middle_tensor_; - - /* - * stores the references to the intermediate bias grad tensors of this layer. - */ - Tensor2 bias_grad_tensor_; - - Tensor2<__half>& get_bottom_tensor(bool is_train) { return bottom_tensor_; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - - /** - * This is the constructor of the FullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. 
- * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass - * @param bottom_tensor: stores the tensor from bottom layer - * @param top_tensor: stores the tensor to top layer - * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH - * (col-major) - */ - FusedFullyConnectedLayerCPU(const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, - const Tensor2<__half>& top_tensor); - FusedFullyConnectedLayerCPU(const FusedFullyConnectedLayerCPU&) = delete; - FusedFullyConnectedLayerCPU& operator=(const FusedFullyConnectedLayerCPU&); -}; -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/interaction_layer_cpu.hpp b/HugeCTR/include/cpu/layers/interaction_layer_cpu.hpp deleted file mode 100644 index a63dd9ec30..0000000000 --- a/HugeCTR/include/cpu/layers/interaction_layer_cpu.hpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace HugeCTR { - -/** - * Layer which - */ -template -class InteractionLayerCPU : public LayerCPU { - /* - * stores the weight tensors of this layer. - */ - Tensors2 weights_; - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - bool use_mixed_precision_; - - Tensors2 internal_tensors_; - - Tensors2& get_in_tensors(bool is_train) { return in_tensors_; } - - public: - /** - * Ctor of InteractionLayer. 
- * @param in_bottom_mlp_tensor the input bottom MLP tensor (batch_size, width) - * @param in_embeddings the input embeddings (batch_size, n_emb, width) - * @param out_tensor the resulting output tensor - * @param blobs_buff GeneralBuffer used to create the output tensor - * @param device_id the id of GPU where this layer belongs - */ - InteractionLayerCPU(const Tensor2& in_bottom_mlp_tensor, const Tensor2& in_embeddings, - Tensor2& out_tensor, - const std::shared_ptr>& blobs_buff, - bool use_mixed_precision); - ~InteractionLayerCPU() override; - - /** - * Interaction's forward pass to gather data to the output tensor - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - /** - * Interaction's backward pass to scatter data to the input tensors - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/multi_cross_layer_cpu.hpp b/HugeCTR/include/cpu/layers/multi_cross_layer_cpu.hpp deleted file mode 100644 index aaf0e66917..0000000000 --- a/HugeCTR/include/cpu/layers/multi_cross_layer_cpu.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -namespace HugeCTR { - -class MultiCrossLayerCPU : public LayerCPU { - private: - const int num_layers_; - Tensors2 blob_tensors_; /**< vector of internal blobs' tensors */ - Tensors2 vec_tensors_; //[h,1] - - Tensor2 tmp_mat_tensors_[3]; //[h,w] - Tensor2 tmp_vec_tensor_; //[h,1] - - /* - * stores the weight tensors of this layer. - */ - // Tensors weights_; It is inherited from Layer, and named as weights_; - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - - MultiCrossLayerCPU(const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, - int num_layers); - MultiCrossLayerCPU(const MultiCrossLayerCPU&) = delete; - MultiCrossLayerCPU& operator=(const MultiCrossLayerCPU&) = delete; -}; -} // namespace HugeCTR diff --git a/HugeCTR/include/cpu/layers/reduce_sum_layer_cpu.hpp b/HugeCTR/include/cpu/layers/reduce_sum_layer_cpu.hpp deleted file mode 100644 index 9ea014ce41..0000000000 --- a/HugeCTR/include/cpu/layers/reduce_sum_layer_cpu.hpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-#include
-
-namespace HugeCTR {
-
-/**
- * Layer which performs a reduce-sum operation on the input tensor.
- * The axis (dimension) to be reduced can be selected. The output
- * tensor keeps the reduced dimension, whose size becomes 1.
- */
-template <typename T>
-class ReduceSumLayerCPU : public LayerCPU {
- /*
- * stores the weight tensors of this layer.
- */
- Tensors2<T> weights_;
- /*
- * stores the weight gradient tensors of this layer.
- */
- Tensors2<T> wgrad_;
- /*
- * stores the references to the input tensors of this layer.
- */
- Tensors2<T> in_tensors_;
- /*
- * stores the references to the output tensors of this layer.
- */
- Tensors2<T> out_tensors_;
-
- public:
- /**
- * Ctor of ReduceSumLayer.
- * @param in_tensor the input tensor, could be 2D or 3D
- * @param out_tensor the resulting output tensor
- * @param blobs_buff GeneralBuffer used to create the output tensor
- * @param axis the reduced dimension, could be 0,1,2
- */
- ReduceSumLayerCPU(const Tensor2<T>& in_tensor, Tensor2<T>& out_tensor,
- const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blobs_buff, int axis);
- ~ReduceSumLayerCPU() override{};
-
- /**
- * ReduceSumLayer's forward propagation
- */
- void fprop(bool is_train) override;
- /**
- * ReduceSumLayer's backward propagation
- */
- void bprop() override;
-
- private:
- int axis_;
-};
-
-} // namespace HugeCTR
diff --git a/HugeCTR/include/cpu/layers/relu_layer_cpu.hpp b/HugeCTR/include/cpu/layers/relu_layer_cpu.hpp
deleted file mode 100644
index c3b662b0d7..0000000000
--- a/HugeCTR/include/cpu/layers/relu_layer_cpu.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-
-namespace HugeCTR {
-
-/**
- * Relu activation function as a derived class of LayerCPU
- */
-template <typename T>
-class ReluLayerCPU : public LayerCPU {
- /*
- * stores the references to the input tensors of this layer.
- */
- Tensors2<T> in_tensors_;
- /*
- * stores the references to the output tensors of this layer.
- */
- Tensors2<T> out_tensors_;
-
- public:
- /**
- * Ctor of ReluLayer.
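- * For example, fprop() computes out[i] = max(in[i], T(0)) element-wise, and bprop()
- * propagates gradients only where the corresponding input value was positive.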
- * @param in_tensor the input tensor
- * @param out_tensor the output tensor, which has the same dimensions as in_tensor
- */
- ReluLayerCPU(const Tensor2<T>& in_tensor, const Tensor2<T>& out_tensor);
-
- /**
- * A method of implementing the forward pass of Relu
- */
- void fprop(bool is_train) override;
- /**
- * A method of implementing the backward pass of Relu
- */
- void bprop() override;
-};
-
-} // namespace HugeCTR
diff --git a/HugeCTR/include/cpu/layers/reshape_layer_cpu.hpp b/HugeCTR/include/cpu/layers/reshape_layer_cpu.hpp
deleted file mode 100644
index 1331ce057c..0000000000
--- a/HugeCTR/include/cpu/layers/reshape_layer_cpu.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-#include
-
-namespace HugeCTR {
-
-/**
- * Layer which reshapes a 3D/2D input tensor to a 2D output tensor,
- * e.g., (batch_size, n_slots, vector_size) to (batch_size, n_slots * vector_size),
- * e.g., (batch_size * n_slots, vector_size) to (batch_size, n_slots * vector_size).
- * If the input tensor is 3D, you can choose which slots participate by using the
- * specialized constructor below.
- */
-template <typename T>
-class ReshapeLayerCPU : public LayerCPU {
- /*
- * stores the weight tensors of this layer.
- */
- Tensors2<T> weights_;
- /*
- * stores the weight gradient tensors of this layer.
- */
- Tensors2<T> wgrad_;
- /*
- * stores the references to the input tensors of this layer.
- */
- Tensors2<T> in_tensors_;
- /*
- * stores the references to the output tensors of this layer.
- */
- Tensors2<T> out_tensors_;
-
- bool in_place_;
- int batch_size_;
- int n_slot_;
- int vector_length_;
- size_t n_active_slot_;
- Tensor2<int> selected_tensor_;
- std::vector<int> selected_;
-
- Tensors2<T>& get_in_tensors(bool is_train);
-
- public:
- /**
- * General Purpose Ctor of ReshapeLayer.
- * @param in_tensor the input tensor
- * @param out_tensor the resulting output tensor
- * @param blobs_buff GeneralBuffer used to create the output tensor
- * @param leading_dim must be a multiple of the innermost dimension,
- * e.g., leading_dim % vector_size == 0,
- * and it must divide the total number of elements in in_tensor,
- * e.g., batch_size * n_slots * vector_size % leading_dim == 0
- */
- ReshapeLayerCPU(const Tensor2<T>& in_tensor, Tensor2<T>& out_tensor,
- const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blobs_buff,
- size_t leading_dim);
- /**
- * Specialized Ctor of ReshapeLayer which assumes a 3D input tensor
- * @param in_tensor the input tensor
- * @param out_tensor the resulting output tensor
- * @param selected the ID list of the slots which are concatenated.
- * If it is empty, it is just a near-zero-overhead in-place reshape from 3D to 2D.
- * Otherwise, only the selected slots are concatenated into a newly assigned tensor.
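- * Illustrative example (values chosen here for illustration): with a 3D input of
- * shape (batch_size=2, n_slots=4, vector_size=8) and selected = {0, 2}, the output
- * is a newly allocated (2, 2 * 8) tensor holding only slots 0 and 2 of each sample;
- * with an empty selected list, the same buffer is viewed in place as (2, 4 * 8).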
- * @param blobs_buff GeneralBuffer used to create the output tensor
- */
- ReshapeLayerCPU(const Tensor2<T>& in_tensor, Tensor2<T>& out_tensor,
- const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blobs_buff,
- std::vector<int>& selected);
-
- /**
- * A method of implementing the forward pass of Reshape
- */
- void fprop(bool is_train) override;
- /**
- * A method of implementing the backward pass of Reshape
- */
- void bprop() override;
-};
-
-} // namespace HugeCTR
diff --git a/HugeCTR/include/cpu/layers/sigmoid_layer_cpu.hpp b/HugeCTR/include/cpu/layers/sigmoid_layer_cpu.hpp
deleted file mode 100644
index b3b6e01ef7..0000000000
--- a/HugeCTR/include/cpu/layers/sigmoid_layer_cpu.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-
-namespace HugeCTR {
-
-/**
- * Sigmoid activation function as a derived class of LayerCPU
- */
-template <typename T>
-class SigmoidLayerCPU : public LayerCPU {
- /*
- * stores the references to the input tensors of this layer.
- */
- Tensors2<T> in_tensors_;
- /*
- * stores the references to the output tensors of this layer.
- */
- Tensors2<T> out_tensors_;
-
- public:
- /**
- * Ctor of SigmoidLayer.
- * @param in_tensor the input tensor
- * @param out_tensor the output tensor, which has the same dimensions as in_tensor
- */
- SigmoidLayerCPU(const Tensor2<T>& in_tensor, const Tensor2<T>& out_tensor);
-
- /**
- * A method of implementing the forward pass of Sigmoid
- */
- void fprop(bool is_train) override;
- /**
- * A method of implementing the backward pass of Sigmoid
- */
- void bprop() override;
-};
-
-} // namespace HugeCTR
diff --git a/HugeCTR/include/cpu/layers/slice_layer_cpu.hpp b/HugeCTR/include/cpu/layers/slice_layer_cpu.hpp
deleted file mode 100644
index 94ef7c0b79..0000000000
--- a/HugeCTR/include/cpu/layers/slice_layer_cpu.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-/**
- * Layer which splits a single 2D input tensor into multiple 2D output tensors across columns.
- * e.g., (batch_size, 90) to (batch_size, 40) and (batch_size, 40) by choosing the column ranges
- * [0:40) and [50:90). It is possible for those ranges to overlap, e.g., [0:100) and [50:200).
- */
-template <typename T>
-class SliceLayerCPU : public LayerCPU {
- /*
- * stores the weight tensors of this layer.
- */
- Tensors2<T> weights_;
- /*
- * stores the weight gradient tensors of this layer.
- */
- Tensors2<T> wgrad_;
- /*
- * stores the references to the input tensors of this layer.
- */
- Tensors2<T> in_tensors_;
- /*
- * stores the references to the output tensors of this layer.
- */
- Tensors2<T> out_tensors_;
-
- int virt_w_;
- std::vector sts_;
-
- std::vector<std::pair<int, int>> ranges_;
-
- Tensors2<T>& get_in_tensors(bool is_train) { return in_tensors_; }
-
- public:
- /**
- * Ctor of SliceLayer.
- * @param in_tensor input tensor
- * @param out_tensors vector where the pointers to the created output tensors are stored
- * @param blobs_buff GeneralBuffer used to create the output tensors
- * @param ranges set of the slice ranges along columns
- */
- SliceLayerCPU(const Tensor2<T>& in_tensor, Tensors2<T>& out_tensors,
- const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blobs_buff,
- std::vector<std::pair<int, int>>& ranges);
- ~SliceLayerCPU() override{};
-
- /**
- * Slice's forward pass to gather data to the output tensors
- */
- void fprop(bool is_train) override;
- /**
- * Slice's backward pass to scatter data to the input tensor
- */
- void bprop() override;
-};
-
-} // namespace HugeCTR
diff --git a/HugeCTR/include/cpu/layers/weight_multiply_layer_cpu.hpp b/HugeCTR/include/cpu/layers/weight_multiply_layer_cpu.hpp
deleted file mode 100644
index e2275099e2..0000000000
--- a/HugeCTR/include/cpu/layers/weight_multiply_layer_cpu.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-#include
-
-namespace HugeCTR {
-
-/**
- * Layer which computes the element-wise product of the input tensor X and the weight W.
- * The input tensor X has dimension: [batch_size, slot_num], while
- * the input weight W has dimension: [slot_num, embedding_vec_size].
- * The WeightMultiplyLayer broadcasts the value of W to the "batch_size" dim
- * and broadcasts the value of X to the embedding_vec_size dim automatically
- * when doing the element-wise product with X. So, the output tensor has
- * the dimension: [batch_size, slot_num * embedding_vec_size].
- */
-template <typename T>
-class WeightMultiplyLayerCPU : public LayerCPU {
- /*
- * stores the weight tensors of this layer.
- */
- Tensors2<T> weights_;
- /*
- * stores the weight gradient tensors of this layer.
- */
- Tensors2<T> wgrad_;
- /*
- * stores the references to the input tensors of this layer.
- */
- Tensors2<T> in_tensors_;
- /*
- * stores the references to the output tensors of this layer.
- */
- Tensors2<T> out_tensors_;
-
- public:
- /**
- * Ctor of WeightMultiplyLayer.
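- * Illustrative shapes (example values): with batch_size = 2, slot_num = 3 and
- * embedding_vec_size = 4, X is (2, 3), W is (3, 4), and the output is (2, 12)
- * with out[b][s * 4 + e] = X[b][s] * W[s][e].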
- * @param in_tensor the input tensor
- * @param out_tensor the resulting output tensor
- * @param weight_dims dimensions of the weight W, i.e., [slot_num, embedding_vec_size]
- */
- WeightMultiplyLayerCPU(const std::shared_ptr<BufferBlock2<T>>& weight_buff,
- const std::shared_ptr<BufferBlock2<T>>& wgrad_buff,
- const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blob_buff,
- const Tensor2<T>& in_tensor, Tensor2<T>& out_tensor,
- const std::vector<size_t>& weight_dims);
- ~WeightMultiplyLayerCPU() override{};
-
- /**
- * WeightMultiplyLayer's forward propagation to do the element-wise product
- */
- void fprop(bool is_train) override;
- /**
- * WeightMultiplyLayer's backward propagation
- */
- void bprop() override;
-
- private:
- size_t batch_size_;
- size_t slot_num_;
- size_t embedding_vec_size_;
- Tensor2<T> wgrad_tmp_trans_;
-};
-
-} // namespace HugeCTR
diff --git a/HugeCTR/include/cpu/network_cpu.hpp b/HugeCTR/include/cpu/network_cpu.hpp
deleted file mode 100644
index 142b50f59a..0000000000
--- a/HugeCTR/include/cpu/network_cpu.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-/**
- * @brief Dense network (embedding is not included)
- *
- * NetworkCPU holds the dense layers of a model and runs their
- * forward pass on the CPU for inference.
- */
-class NetworkCPU {
- private:
- std::vector<std::unique_ptr<LayerCPU>> layers_; /**< vector of layers */
-
- Tensor2<float> weight_tensor_;
- Tensor2<float> wgrad_tensor_;
- Tensor2<__half> weight_tensor_half_;
- Tensor2<__half> wgrad_tensor_half_;
-
- Tensor2<float> pred_tensor_;
-
- std::shared_ptr<CPUResource> cpu_resource_;
- // std::shared_ptr<GPUResource> gpu_resource_; /**< gpu resource */
-
- bool use_mixed_precision_;
- // bool enable_cuda_graph_;
-
- // bool predict_graph_created_;
- // bool eval_graph_created_;
- // bool train_fprop_graph_created_;
- // bool train_bprop_graph_created_;
- // cudaGraph_t predict_graph_;
- // cudaGraph_t eval_graph_;
- // cudaGraph_t train_fprop_graph_;
- // cudaGraph_t train_bprop_graph_;
- // cudaGraphExec_t predict_instance_;
- // cudaGraphExec_t eval_instance_;
- // cudaGraphExec_t train_fprop_instance_;
- // cudaGraphExec_t train_bprop_instance_;
-
- void conv_weight_(Tensor2<__half>& target, const Tensor2<float>& source);
-
- public:
- /**
- * Ctor.
- * @param cpu_resource cpu resource for the local host.
- * @param use_mixed_precision whether FP16 is used for the dense layers.
- */
- NetworkCPU(const std::shared_ptr<CPUResource>& cpu_resource, bool use_mixed_precision = false);
- NetworkCPU(const NetworkCPU&) = delete;
- NetworkCPU& operator=(const NetworkCPU&) = delete;
-
- /**
- * Forward only for inference.
- */
- void predict();
-
- /**
- * Get the pred tensor for inference.
- */
- Tensor2<float> get_pred_tensor() { return pred_tensor_; }
-
- /**
- * Get the number of parameters in this network.
- */ - size_t get_params_num() const { return weight_tensor_.get_num_elements(); } - - /** - * Read parameters from model_file. - */ - void load_params_from_model(const std::string& model_file); - - /** - * initialize layer by layer - */ - void initialize(); - - /** - * factory method to create network - */ - static NetworkCPU* create_network(const nlohmann::json& j_array, - std::vector& tensor_entries, - const std::shared_ptr& cpu_resource, - bool use_mixed_precision); -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/inference/embedding_feature_combiner.hpp b/HugeCTR/include/inference/embedding_feature_combiner.hpp deleted file mode 100644 index b51a888767..0000000000 --- a/HugeCTR/include/inference/embedding_feature_combiner.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include - -namespace HugeCTR { - -enum class EmbeddingFeatureCombiner_t { Sum, Mean }; - -/** - * Combine the embedding feature vectors by Sum or Mean - * according to slot_num and row_ptrs - */ -template -class EmbeddingFeatureCombiner : public Layer { - /* - * stores the input tensors of this layer. - */ - std::vector> in_tensors_; - /* - * stores the row pointers tensors of this layer. - */ - std::vector> row_ptrs_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - public: - /** - * Ctor of EmbeddingFeatureCombiner. 
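- * Illustrative example (values chosen here for illustration): with batch_size = 1,
- * slot_num = 2 and row_ptrs = [0, 2, 3], input vectors 0..1 are combined into slot 0
- * and vector 2 becomes slot 1; Sum adds the vectors element-wise, while Mean also
- * divides that sum by the number of vectors in the slot.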
- * @param in_tensor the embedding feature tensor, must be 2D - * @param row_ptrs_tensor row pointers tensor, should be 1D (batch_size*slot_num+1,), which - * indicate which adjacent vectors belong to the same slot (i.e., feature field) - * @param out_tensor the resulting output tensor, should be 3D (batch_size, slot_num, - * embedding_vec_size) - * @param batch_size batch size - * @param slot_num slot number - * @param combiner_type combiner type for the features in the same slot, Sum or Mean - * @param blobs_buff GeneralBuffer used to create the output tensor - * @param gpu_resource available gpu resource - */ - EmbeddingFeatureCombiner(const std::shared_ptr& in_tensor, - const std::shared_ptr& row_ptrs_tensor, - Tensor2& out_tensor, int batch_size, int slot_num, - EmbeddingFeatureCombiner_t combiner_type, - const std::shared_ptr>& blobs_buff, - const std::shared_ptr& gpu_resource); - ~EmbeddingFeatureCombiner(){}; - - /** - * EmbeddingFeatureCombiner's combine operation - */ - void fprop(bool is_train = false) override; - - void bprop() override { - HCTR_OWN_THROW(Error_t::IllegalCall, - "The bprop() of EmbeddingFeatureCombiner is not implemented!"); - } - - private: - int batch_size_; - int slot_num_; - int embedding_vec_size_; - EmbeddingFeatureCombiner_t combiner_type_; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/inference/inference_session.hpp b/HugeCTR/include/inference/inference_session.hpp deleted file mode 100644 index adfa1558c9..0000000000 --- a/HugeCTR/include/inference/inference_session.hpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -class InferenceSession : public InferenceSessionBase { - private: - nlohmann::json config_; // should be declared before parser_ and inference_parser_ - std::vector embedding_table_slot_size_; - std::vector streams_; - - // std::vector>> row_ptrs_tensors_; // embedding input row - // std::vector>> embedding_features_tensors_; // embedding input - // value vector - core23::Tensor dense_input_tensorbag_; // dense input vector - std::vector> row_ptrs_tensors_; // embedding input row - std::vector> - embedding_features_tensors_; // embedding input value vector - core23::Tensor dense_input_tensor_; // dense input vector - std::vector inference_tensor_entries_; // tensor entries in the inference pipeline - - std::vector> embedding_feature_combiners_; - std::unique_ptr network_; - std::shared_ptr embedding_cache_; - - int* h_row_ptrs_; - void* h_keys_; - - int* d_row_ptrs_; - void* d_keys_; - float* d_embedding_vectors_; - - Pipeline predict_network_pipeline_; - - void predict_impl(float* d_dense, void* keys, bool key_on_device, int* d_row_ptrs, - float* d_output, int num_samples, int num_embedding_tables, - bool table_major_key_layout); - - protected: - InferenceParser inference_parser_; - InferenceParams inference_params_; - std::shared_ptr data_reader_; - std::shared_ptr resource_manager_; - - public: - virtual ~InferenceSession(); - InferenceSession(const std::string& model_config_path, const InferenceParams& inference_params, - const std::shared_ptr& embedding_cache, - std::shared_ptr resource_manager = nullptr); - InferenceSession(InferenceSession const&) = delete; - InferenceSession& operator=(InferenceSession const&) = delete; - - virtual void predict(float* d_dense, void* h_embeddingcolumns, int* d_row_ptrs, float* d_output, - int num_samples, bool table_major_key_layout = false); - - virtual void predict_from_device(float* d_dense, void* d_embeddingcolumns, int* d_row_ptrs, - float* d_output, int num_samples, - bool table_major_key_layout = false); - - const InferenceParser& get_inference_parser() const { return inference_parser_; } - const std::vector& get_inference_tensor_entries() const { - return inference_tensor_entries_; - } -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/inference/inference_session_base.hpp b/HugeCTR/include/inference/inference_session_base.hpp deleted file mode 100644 index 6fb3113ae2..0000000000 --- a/HugeCTR/include/inference/inference_session_base.hpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include - -namespace HugeCTR { - -class InferenceSessionBase { - public: - virtual ~InferenceSessionBase() = 0; - InferenceSessionBase() = default; - InferenceSessionBase(InferenceSessionBase const&) = delete; - InferenceSessionBase& operator=(InferenceSessionBase const&) = delete; - - virtual void predict(float* d_dense, void* h_embeddingcolumns, int* d_row_ptrs, float* d_output, - int num_samples, bool table_major_key_layout = false) = 0; - - virtual void predict_from_device(float* d_dense, void* d_embeddingcolumns, int* d_row_ptrs, - float* d_output, int num_samples, - bool table_major_key_layout = false) = 0; - - static std::shared_ptr create( - const std::string& model_config_path, const InferenceParams& inference_params, - const std::shared_ptr& embedding_cache); -}; - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/network.hpp b/HugeCTR/include/network.hpp index 0cdb90a78c..c7b3dcdc78 100644 --- a/HugeCTR/include/network.hpp +++ b/HugeCTR/include/network.hpp @@ -160,17 +160,6 @@ class Network final { */ void upload_params_to_device(const std::string& model_file); - /** - * Read parameters from model_file. - */ - void upload_params_to_device_inference(const std::string& model_file); - - /** - * Read non-trainable parameters from model_file, e.g., running mean and running variable for - * BatchNorm - */ - void upload_non_trainable_params_to_device_inference(const std::string& model_file); - /** * Writing parameters to cpu buffer. */ @@ -224,20 +213,6 @@ class Network final { */ void search_algorithm(); - /** - * factory method to create network - */ - static Network* create_network(const nlohmann::json& j_array, const nlohmann::json& j_optimizer, - std::vector& train_tensor_entries, - std::vector& evaluate_tensor_entries, - int num_networks_in_global, - std::shared_ptr& exchange_wgrad, - const std::shared_ptr& cpu_resource, - const std::shared_ptr& gpu_resource, - bool use_mixed_precision, bool enable_tf32_compute, float scaler, - bool use_algorithm_search, bool inference_flag, - bool grouped_all_reduce); - /** * add layer to network, python interface use only */ diff --git a/HugeCTR/include/parser.hpp b/HugeCTR/include/parser.hpp index c3b2e20950..e507d7d14d 100644 --- a/HugeCTR/include/parser.hpp +++ b/HugeCTR/include/parser.hpp @@ -101,54 +101,6 @@ struct Solver { Solver() {} }; -class InferenceParser { - private: - nlohmann::json config_; /**< configure file. */ - std::map tensor_active_; /**< whether a tensor is active. 
*/
- public:
- std::string label_name;
- std::string dense_name;
- std::vector<std::string> sparse_names;
- size_t label_dim; /**< label dimension */
- size_t dense_dim; /**< dense feature dimension */
- size_t slot_num; /**< total slot number */
- size_t num_embedding_tables; /**< number of embedding tables */
- std::vector slot_num_for_tables; /**< slot_num for each embedding table */
- std::vector max_nnz_for_tables; /**< max nnz for each embedding table */
- std::vector
- max_feature_num_for_tables; /**< max feature number of each embedding table */
- std::vector
- embed_vec_size_for_tables; /**< embedding vector size for each embedding table */
- size_t max_feature_num_per_sample; /**< max feature number per sample */
- size_t max_embedding_vector_size_per_sample; /**< max embedding vector size per sample */
-
- template
- void create_pipeline_inference(const InferenceParams& inference_params,
- core23::Tensor& dense_input_bag,
- std::vector>& rows,
- std::vector>& embeddingvecs,
- std::vector& embedding_table_slot_size,
- std::vector>* embedding, Network** network,
- std::vector& inference_tensor_entries,
- const std::shared_ptr resource_manager);
- /**
- * Ctor.
- * The ctor only verifies the configure file; it doesn't create the pipeline.
- */
- InferenceParser(const nlohmann::json& config);
-
- /**
- * Create the inference pipeline, which only creates the network and the embedding
- */
- void create_pipeline(const InferenceParams& inference_params, core23::Tensor& dense_input_bag,
- std::vector>& row,
- std::vector>& embeddingvec,
- std::vector& embedding_table_slot_size,
- std::vector>* embedding, Network** network,
- std::vector& inference_tensor_entries,
- const std::shared_ptr resource_manager);
-};
-
 std::unique_ptr get_learning_rate_scheduler(
 const std::string configure_file);
 
@@ -429,42 +381,6 @@ inline void check_graph(std::map& tensor_active,
 }
 }
 
-template
-struct create_embedding {
- void operator()(const InferenceParams& inference_params, const nlohmann::json& j_layers_array,
- std::vector>& rows,
- std::vector>& embeddingvecs,
- std::vector& embedding_table_slot_size,
- std::vector* tensor_entries,
- std::vector>* embeddings,
- const std::shared_ptr gpu_resource,
- std::shared_ptr>& blobs_buff);
-};
-
-template
-struct create_datareader {
- // Used by InferenceSession
- void operator()(const InferenceParams& inference_params, const InferenceParser& inference_parser,
- std::shared_ptr& data_reader,
- const std::shared_ptr resource_manager,
- std::map>& sparse_input_map,
- std::map& label_dense_map, const std::string& source,
- const DataReaderType_t data_reader_type, const Check_t check_type,
- const std::vector& slot_size_array, const bool repeat_dataset,
- long long num_samples, const DataSourceParams& data_source_params);
- // Used by InferenceModel
- void operator()(const InferenceParams& inference_params, const InferenceParser& inference_parser,
- std::shared_ptr& data_reader,
- const std::shared_ptr resource_manager,
- std::map>& sparse_input_map,
- std::vector& label_tensor_list,
- std::vector& dense_tensor_list, const std::string& source,
- const DataReaderType_t data_reader_type, const Check_t check_type,
- const std::vector& slot_size_array, const bool repeat_dataset,
- const DataSourceParams& data_source_params,
- bool reading_file_sequentially = false);
-};
-
 inline int get_max_feature_num_per_sample_from_nnz_per_slot(const nlohmann::json& j) {
 int max_feature_num_per_sample = 0;
 auto slot_num = get_value_from_json(j, "slot_num");
diff --git a/HugeCTR/include/pybind/hps_wrapper.hpp
b/HugeCTR/include/pybind/hps_wrapper.hpp index f16356a1e1..ff2e9dcf22 100644 --- a/HugeCTR/include/pybind/hps_wrapper.hpp +++ b/HugeCTR/include/pybind/hps_wrapper.hpp @@ -276,6 +276,134 @@ pybind11::array_t HPS::lookup(pybind11::array_t& h_keys, void HPSPybind(pybind11::module& m) { pybind11::module infer = m.def_submodule("inference", "inference submodule of hugectr"); + pybind11::class_>(infer, + "VolatileDatabaseParams") + .def( + pybind11::init&>(), + pybind11::arg("type") = DatabaseType_t::ParallelHashMap, + // Backend specific. + pybind11::arg("address") = "127.0.0.1:7000", pybind11::arg("user_name") = "default", + pybind11::arg("password") = "", + pybind11::arg("num_partitions") = std::min(16u, std::thread::hardware_concurrency()), + pybind11::arg("allocation_rate") = 256L * 1024L * 1024L, + pybind11::arg("shared_memory_size") = 16L * 1024L * 1024L * 1024L, + pybind11::arg("shared_memory_name") = "hctr_mp_hash_map_database", + pybind11::arg("shared_memory_auto_remove") = true, + pybind11::arg("num_node_connections") = 5, pybind11::arg("max_batch_size") = 64L * 1024L, + pybind11::arg("enable_tls") = false, + pybind11::arg("tls_ca_certificate") = "cacertbundle.crt", + pybind11::arg("tls_client_certificate") = "client_cert.pem", + pybind11::arg("tls_client_key") = "client_key.pem", + pybind11::arg("tls_server_name_identification") = "redis.localhost", + // Overflow handling related. + pybind11::arg("overflow_margin") = std::numeric_limits::max(), + pybind11::arg("overflow_policy") = DatabaseOverflowPolicy_t::EvictRandom, + pybind11::arg("overflow_resolution_target") = 0.8, + // Caching behavior related. + pybind11::arg("initialize_after_startup") = true, + pybind11::arg("initial_cache_rate") = 1.0, + pybind11::arg("cache_missed_embeddings") = false, + // Real-time update mechanism related. + pybind11::arg("update_filters") = std::vector{"^hps_.+$"}); + + pybind11::class_>(infer, + "PersistentDatabaseParams") + .def(pybind11::init&>(), + pybind11::arg("backend") = DatabaseType_t::Disabled, + // Backend specific. + pybind11::arg("path") = (std::filesystem::temp_directory_path() / "rocksdb").string(), + pybind11::arg("num_threads") = 16, pybind11::arg("read_only") = false, + pybind11::arg("max_batch_size") = 64L * 1024L, + // Caching behavior related. + pybind11::arg("initialize_after_startup") = true, + // Real-time update mechanism related. + pybind11::arg("update_filters") = std::vector{"^hps_.+$"}); + + pybind11::class_>( + infer, "UpdateSourceParams") + .def(pybind11::init(), + pybind11::arg("type") = UpdateSourceType_t::Null, + // Backend specific. 
+ pybind11::arg("brokers") = "127.0.0.1:9092", + pybind11::arg("metadata_refresh_interval_ms") = 30'000, + pybind11::arg("receive_buffer_size") = 256 * 1024, + pybind11::arg("poll_timeout_ms") = 500, pybind11::arg("max_batch_size") = 8 * 1024, + pybind11::arg("failure_backoff_ms") = 50, pybind11::arg("max_commit_interval") = 32); + + pybind11::enum_(infer, "EmbeddingCacheType_t") + .value("Dynamic", EmbeddingCacheType_t::Dynamic) + .value("UVM", EmbeddingCacheType_t::UVM) + .value("Static", EmbeddingCacheType_t::Static) + .value(hctr_enum_to_c_str(EmbeddingCacheType_t::Stochastic), EmbeddingCacheType_t::Stochastic) + .export_values(); + + pybind11::class_>( + infer, "InferenceParams") + .def(pybind11::init&, const int, const bool, const float, + const bool, const bool, const float, const bool, const bool, + // HugeCTR::DATABASE_TYPE, const std::string&, const std::string&, + // const float, + const int, const int, const int, const float, const std::vector&, + const std::vector&, const VolatileDatabaseParams&, + const PersistentDatabaseParams&, const UpdateSourceParams&, const int, + const float, const float, const std::vector&, + const std::vector&, const std::vector&, + const std::string&, const size_t, const size_t, const std::string&, bool, + const EmbeddingCacheType_t&, bool, bool, bool, bool, bool, bool>(), + + pybind11::arg("model_name"), pybind11::arg("max_batchsize"), + pybind11::arg("hit_rate_threshold"), pybind11::arg("dense_model_file"), + pybind11::arg("sparse_model_files"), pybind11::arg("device_id") = 0, + pybind11::arg("use_gpu_embedding_cache"), pybind11::arg("cache_size_percentage"), + pybind11::arg("i64_input_key"), pybind11::arg("use_mixed_precision") = false, + pybind11::arg("scaler") = 1.0, pybind11::arg("use_algorithm_search") = true, + pybind11::arg("use_cuda_graph") = true, + pybind11::arg("number_of_worker_buffers_in_pool") = 2, + pybind11::arg("number_of_refresh_buffers_in_pool") = 1, + pybind11::arg("thread_pool_size") = 16, + pybind11::arg("cache_refresh_percentage_per_iteration") = 0.0, + pybind11::arg("deployed_devices") = std::vector{0}, + pybind11::arg("default_value_for_each_table") = std::vector{0.0f}, + // Database backend. 
+ pybind11::arg("volatile_db") = VolatileDatabaseParams{}, + pybind11::arg("persistent_db") = PersistentDatabaseParams{}, + pybind11::arg("update_source") = UpdateSourceParams{}, + // HPS required + pybind11::arg("maxnum_des_feature_per_sample") = 26, + pybind11::arg("refresh_delay") = 0.0f, pybind11::arg("refresh_interval") = 0.0f, + pybind11::arg("maxnum_catfeature_query_per_table_per_sample") = std::vector{26}, + pybind11::arg("embedding_vecsize_per_table") = std::vector{128}, + pybind11::arg("embedding_table_names") = std::vector{""}, + pybind11::arg("network_file") = "", pybind11::arg("label_dim") = 1, + pybind11::arg("slot_num") = 10, pybind11::arg("non_trainable_params_file") = "", + pybind11::arg("use_static_table") = false, + pybind11::arg("embedding_cache_type") = EmbeddingCacheType_t::Dynamic, + pybind11::arg("use_context_stream") = true, + pybind11::arg("fuse_embedding_table") = false, + pybind11::arg("use_hctr_cache_implementation") = true, pybind11::arg("init_ec") = true, + pybind11::arg("enable_pagelock") = false, pybind11::arg("fp8_quant") = false); + pybind11::class_>(infer, "ParameterServerConfig") diff --git a/HugeCTR/include/pybind/inference_model.hpp b/HugeCTR/include/pybind/inference_model.hpp deleted file mode 100644 index 7add09634b..0000000000 --- a/HugeCTR/include/pybind/inference_model.hpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -class InferenceModel { - public: - virtual ~InferenceModel(); - InferenceModel(const std::string& model_config_path, const InferenceParams& inference_params); - InferenceModel(const InferenceModel&) = delete; - InferenceModel& operator=(const InferenceModel&) = delete; - - float evaluate(size_t num_batches, const std::string& source, DataReaderType_t data_reader_type, - Check_t check_type, const std::vector& slot_size_array, - const DataSourceParams& data_source_params, bool reading_file_seq = true); - - void predict(float* pred_output, size_t num_batches, const std::string& source, - DataReaderType_t data_reader_type, Check_t check_type, - const std::vector& slot_size_array, - const DataSourceParams& data_source_params, bool reading_file_seq = true); - - std::tuple, int> get_tensor_info_by_name( - const std::string& tensor_name); - - void check_out_tensor(int index, float* global_result); - - const InferenceParams& get_inference_params() const { return inference_params_; } - - const InferenceParser& get_inference_parser() const { return inference_parser_; } - - private: - InferenceParams inference_params_; - InferenceParser inference_parser_; - std::shared_ptr resource_manager_; - - std::vector> inference_sessions_; - std::shared_ptr parameter_server_; - metrics::Metrics metrics_; - - std::shared_ptr data_reader_; - std::vector pred_tensor_list_; // the length equals local_gpu_count - std::vector key_tensor_list_; // the length equals local_gpu_count - std::vector rowoffset_tensor_list_; // the length equals local_gpu_count - - std::vector>> - old_pred_tensor_list_; // the length equals local_gpu_count - std::vector reader_label_tensor_list_; // the length equals local_gpu_count - std::vector reader_dense_tensor_list_; // the length equals local_gpu_count - std::map> sparse_input_map_64_; - std::map> sparse_input_map_32_; - - std::vector> inference_tensor_entries_list_; - - std::vector raw_metrics_map_list_; // the length equals local_gpu_count - std::shared_ptr metric_; // currently only support AUC during inference - - const long long global_max_batch_size_; - long long current_batch_size_{0}; - - Timer timer_infer; - Timer timer_reader; - Timer timer_forward; - - void reset_reader_tensor_list(); - - template - void parse_input_from_data_reader( - const std::map>& sparse_input_map, - std::vector& key_tensor_list, - std::vector& rowoffset_tensor_list); -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/pybind/inference_wrapper.hpp b/HugeCTR/include/pybind/inference_wrapper.hpp deleted file mode 100644 index a9051fcf0d..0000000000 --- a/HugeCTR/include/pybind/inference_wrapper.hpp +++ /dev/null @@ -1,661 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace python_lib { - -/** - * @brief Main InferenceSessionPy class - * - * This is a class supporting HugeCTR inference in Python, which includes predict32 and - predict64. - * To support dynamic batch size during inference, this class need to be modified in the future. - */ -class InferenceSessionPy : public InferenceSession { - public: - InferenceSessionPy(const std::string& model_config_path, const InferenceParams& inference_params, - const std::shared_ptr& embedding_cache); - InferenceSessionPy(const std::string& model_config_path, const InferenceParams& inference_params, - const std::shared_ptr& embedding_cache, - const std::shared_ptr& parameter_server); - ~InferenceSessionPy(); - - float evaluate(size_t num_batches, const std::string& source, DataReaderType_t data_reader_type, - Check_t check_type, const std::vector& slot_size_array, - long long num_samples, const DataSourceParams& data_source_params); - pybind11::array_t predict(size_t num_batches, const std::string& source, - DataReaderType_t data_reader_type, Check_t check_type, - const std::vector& slot_size_array, - long long num_samples, - const DataSourceParams& data_source_params); - std::vector& predict(const std::vector& dense, - const std::vector& embeddingcolumns, - const std::vector& row_ptrs); - void refresh_embedding_cache(); - - private: - void initialize(); - - template - void load_data(const std::string& source, DataReaderType_t data_reader_type, Check_t check_type, - const std::vector& slot_size_array, long long num_samples, - const DataSourceParams& data_source_params); - - template - float evaluate_(size_t num_batches, const std::string& source, DataReaderType_t data_reader_type, - Check_t check_type, const std::vector& slot_size_array, - long long num_samples, const DataSourceParams& data_source_params); - - template - pybind11::array_t predict_(size_t num_batches, const std::string& source, - DataReaderType_t data_reader_type, Check_t check_type, - const std::vector& slot_size_array, - long long num_samples, - const DataSourceParams& data_source_params); - - template - void predict_(const std::vector& dense, const std::vector& embeddingcolumns, - const std::vector& row_ptrs); - - std::vector embeddingcolumns_u32_; - std::vector output_; - - // Allocate in this class - float* d_dense_; - void* h_embeddingcolumns_; - int* d_row_ptrs_; - float* d_output_; - - // Derived from data reader output tensor - float* d_reader_dense_; - std::vector d_reader_keys_list_; - std::vector d_reader_row_ptrs_list_; - core23::Tensor label_tensor_; - - // Parameter Server - std::shared_ptr parameter_server_; -}; - -void InferenceSessionPy::initialize() { - HCTR_LOG( - WARNING, ROOT, - "InferenceSession is not suitable for multi-GPU offline inference. 
Please use " - "InferenceModel: " - "https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#inferencemodel\n"); - CudaDeviceContext context(resource_manager_->get_local_gpu(0)->get_device_id()); - HCTR_LIB_THROW(cudaMalloc((void**)&d_dense_, inference_params_.max_batchsize * - inference_parser_.dense_dim * sizeof(float))); - HCTR_LIB_THROW(cudaMalloc((void**)&d_row_ptrs_, - (inference_params_.max_batchsize * inference_parser_.slot_num + - inference_parser_.num_embedding_tables) * - sizeof(int))); - HCTR_LIB_THROW(cudaMalloc((void**)&d_output_, inference_params_.max_batchsize * - inference_parser_.label_dim * sizeof(float))); - if (inference_params_.i64_input_key) { - HCTR_LIB_THROW(cudaHostAlloc((void**)&h_embeddingcolumns_, - inference_params_.max_batchsize * - inference_parser_.max_feature_num_per_sample * - sizeof(long long), - cudaHostAllocPortable)); - } else { - HCTR_LIB_THROW(cudaHostAlloc((void**)&h_embeddingcolumns_, - inference_params_.max_batchsize * - inference_parser_.max_feature_num_per_sample * - sizeof(unsigned int), - cudaHostAllocPortable)); - } -} - -InferenceSessionPy::InferenceSessionPy(const std::string& model_config_path, - const InferenceParams& inference_params, - const std::shared_ptr& embedding_cache) - : InferenceSession(model_config_path, inference_params, embedding_cache) { - initialize(); -} - -InferenceSessionPy::InferenceSessionPy( - const std::string& model_config_path, const InferenceParams& inference_params, - const std::shared_ptr& embedding_cache, - const std::shared_ptr& parameter_server) - : InferenceSession(model_config_path, inference_params, embedding_cache), - parameter_server_(parameter_server) { - initialize(); -} - -InferenceSessionPy::~InferenceSessionPy() { - CudaDeviceContext context(resource_manager_->get_local_gpu(0)->get_device_id()); - cudaFree(d_dense_); - cudaFreeHost(h_embeddingcolumns_); - cudaFree(d_row_ptrs_); - cudaFree(d_output_); -} - -template -void InferenceSessionPy::predict_(const std::vector& dense, - const std::vector& embeddingcolumns, - const std::vector& row_ptrs) { - if (inference_parser_.slot_num == 0) { - HCTR_OWN_THROW(Error_t::WrongInput, "The number of slots should not be zero"); - } - size_t num_samples = - (row_ptrs.size() - inference_parser_.num_embedding_tables) / inference_parser_.slot_num; - if (num_samples > inference_params_.max_batchsize) { - HCTR_OWN_THROW(Error_t::WrongInput, "The number of samples should not exceed max_batchsize"); - } - if (num_samples * inference_parser_.dense_dim != dense.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "The dimension of dense features is not consistent"); - } - if (num_samples * inference_parser_.slot_num + inference_parser_.num_embedding_tables != - row_ptrs.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "The dimension of row pointers is not consistent"); - } - if (num_samples * inference_parser_.max_feature_num_per_sample < embeddingcolumns.size()) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "The dimension of embedding keys is greater than num_samples*max_feature_num_per_sample"); - } - size_t num_embeddingcolumns = 0; - size_t row_ptr_offset = 0; - for (int j = 0; j < static_cast(inference_parser_.num_embedding_tables); j++) { - num_embeddingcolumns += - row_ptrs[num_samples * inference_parser_.slot_num_for_tables[j] + row_ptr_offset]; - row_ptr_offset += num_samples * inference_parser_.slot_num_for_tables[j] + 1; - } - if (embeddingcolumns.size() != num_embeddingcolumns) { - HCTR_OWN_THROW(Error_t::WrongInput, - "The dimension of embedding 
keys is not consistent with row pointers"); - } - CudaDeviceContext context(resource_manager_->get_local_gpu(0)->get_device_id()); - output_.resize(num_samples); - size_t num_keys = embeddingcolumns.size(); - HCTR_LIB_THROW(cudaMemcpyAsync( - d_dense_, dense.data(), num_samples * inference_parser_.dense_dim * sizeof(float), - cudaMemcpyHostToDevice, resource_manager_->get_local_gpu(0)->get_stream())); - HCTR_LIB_THROW(cudaMemcpyAsync( - d_row_ptrs_, row_ptrs.data(), - (num_samples * inference_parser_.slot_num + inference_parser_.num_embedding_tables) * - sizeof(int), - cudaMemcpyHostToDevice, resource_manager_->get_local_gpu(0)->get_stream())); - memcpy(h_embeddingcolumns_, embeddingcolumns.data(), num_keys * sizeof(TypeKey)); - InferenceSession::predict(d_dense_, h_embeddingcolumns_, d_row_ptrs_, d_output_, - static_cast(num_samples)); - HCTR_LIB_THROW(cudaMemcpyAsync( - output_.data(), d_output_, num_samples * inference_parser_.label_dim * sizeof(float), - cudaMemcpyDeviceToHost, resource_manager_->get_local_gpu(0)->get_stream())); - HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(0)->get_stream())); -} - -std::vector& InferenceSessionPy::predict(const std::vector& dense, - const std::vector& embeddingcolumns, - const std::vector& row_ptrs) { - if (inference_params_.i64_input_key) { - predict_(dense, embeddingcolumns, row_ptrs); - } else { - std::vector().swap(embeddingcolumns_u32_); - std::transform(embeddingcolumns.begin(), embeddingcolumns.end(), - std::back_inserter(embeddingcolumns_u32_), - [](const long long& v) -> unsigned int { return static_cast(v); }); - predict_(dense, embeddingcolumns_u32_, row_ptrs); - } - return output_; -} - -template -void InferenceSessionPy::load_data(const std::string& source, - const DataReaderType_t data_reader_type, - const Check_t check_type, - const std::vector& slot_size_array, - const long long num_samples, - const DataSourceParams& data_source_params) { - CudaDeviceContext context(resource_manager_->get_local_gpu(0)->get_device_id()); - bool repeat_dataset = true; - std::map> sparse_input_map; - std::map label_dense_map; - // force the data reader to not use mixed precision - create_datareader()(inference_params_, inference_parser_, data_reader_, - resource_manager_, sparse_input_map, label_dense_map, source, - data_reader_type, check_type, slot_size_array, repeat_dataset, - num_samples, data_source_params); - if (data_reader_->is_started() == false) { - HCTR_OWN_THROW(Error_t::IllegalCall, "Start the data reader first before evaluation"); - } - core23::Tensor dense_tensor; - if (!find_item_in_map(label_tensor_, inference_parser_.label_name, label_dense_map)) { - HCTR_OWN_THROW(Error_t::WrongInput, "Cannot find " + inference_parser_.label_name); - } - if (!find_item_in_map(dense_tensor, inference_parser_.dense_name, label_dense_map)) { - HCTR_OWN_THROW(Error_t::WrongInput, "Cannot find " + inference_parser_.dense_name); - } - d_reader_dense_ = reinterpret_cast(dense_tensor.data()); - d_reader_keys_list_.clear(); - d_reader_row_ptrs_list_.clear(); - for (size_t i = 0; i < inference_parser_.num_embedding_tables; i++) { - core23_reader::SparseInput sparse_input; - if (!find_item_in_map(sparse_input, inference_parser_.sparse_names[i], sparse_input_map)) { - HCTR_OWN_THROW(Error_t::WrongInput, "Cannot find " + inference_parser_.sparse_names[i]); - } - d_reader_keys_list_.push_back( - reinterpret_cast(sparse_input.evaluate_sparse_tensors[0].get_value_ptr())); - d_reader_row_ptrs_list_.push_back( - 
reinterpret_cast(sparse_input.evaluate_sparse_tensors[0].get_rowoffset_ptr())); - } -} - -template -float InferenceSessionPy::evaluate_(const size_t num_batches, const std::string& source, - const DataReaderType_t data_reader_type, - const Check_t check_type, - const std::vector& slot_size_array, - const long long num_samples, - const DataSourceParams& data_source_params) { - CudaDeviceContext context(resource_manager_->get_local_gpu(0)->get_device_id()); - load_data(source, data_reader_type, check_type, slot_size_array, num_samples, - data_source_params); - std::vector keys_elements_list(inference_parser_.num_embedding_tables); - std::vector row_ptr_elements_list(inference_parser_.num_embedding_tables); - for (size_t i = 0; i < inference_parser_.num_embedding_tables; i++) { - keys_elements_list[i] = - inference_params_.max_batchsize * inference_parser_.max_feature_num_for_tables[i]; - row_ptr_elements_list[i] = - inference_params_.max_batchsize * inference_parser_.slot_num_for_tables[i] + 1; - } - std::vector pred_dims = {inference_params_.max_batchsize, inference_parser_.label_dim}; - std::shared_ptr pred_buff = - PreallocatedBuffer2::create(d_output_, pred_dims); - Tensor2 pred_tensor(pred_dims, pred_buff); - std::shared_ptr> metric = std::make_shared>( - inference_params_.max_batchsize, num_batches, inference_parser_.label_dim, resource_manager_); - metrics::RawMetricMap metric_maps = { - {metrics::RawType::Pred, pred_tensor.shrink()}, - {metrics::RawType::Label, - core_helper::convert_core23_tensor_to_tensorbag2(label_tensor_)}}; - for (size_t batch = 0; batch < num_batches; batch++) { - long long current_batchsize = data_reader_->read_a_batch_to_device(); - size_t keys_offset = 0; - size_t row_ptrs_offset = 0; - std::vector h_reader_keys(inference_params_.max_batchsize * - inference_parser_.max_feature_num_per_sample); - std::vector> h_reader_row_ptrs_list; - for (size_t i = 0; i < inference_parser_.num_embedding_tables; i++) { - std::vector h_reader_row_ptrs(row_ptr_elements_list[i]); - convert_array_on_device( - d_row_ptrs_ + row_ptrs_offset, reinterpret_cast(d_reader_row_ptrs_list_[i]), - row_ptr_elements_list[i], resource_manager_->get_local_gpu(0)->get_stream()); - HCTR_LIB_THROW(cudaMemcpyAsync(h_reader_row_ptrs.data(), d_reader_row_ptrs_list_[i], - row_ptr_elements_list[i] * sizeof(TypeKey), - cudaMemcpyDeviceToHost, - resource_manager_->get_local_gpu(0)->get_stream())); - HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(0)->get_stream())); - size_t num_keys = h_reader_row_ptrs.back() - h_reader_row_ptrs.front(); - h_reader_row_ptrs_list.push_back(h_reader_row_ptrs); - HCTR_LIB_THROW(cudaMemcpyAsync(h_reader_keys.data() + keys_offset, d_reader_keys_list_[i], - num_keys * sizeof(TypeKey), cudaMemcpyDeviceToHost, - resource_manager_->get_local_gpu(0)->get_stream())); - keys_offset += num_keys; - row_ptrs_offset += row_ptr_elements_list[i]; - } - distribute_keys_for_inference(reinterpret_cast(h_embeddingcolumns_), - h_reader_keys.data(), current_batchsize, h_reader_row_ptrs_list, - inference_parser_.slot_num_for_tables); - InferenceSession::predict(d_reader_dense_, h_embeddingcolumns_, d_row_ptrs_, d_output_, - current_batchsize); - metric->set_current_batch_size(current_batchsize); - metric->local_reduce(0, metric_maps); - } - float auc_value = metric->finalize_metric(); - return auc_value; -} - -template -pybind11::array_t InferenceSessionPy::predict_( - const size_t num_batches, const std::string& source, const DataReaderType_t data_reader_type, - const 
Check_t check_type, const std::vector& slot_size_array, - const long long num_samples, const DataSourceParams& data_source_params) { - CudaDeviceContext context(resource_manager_->get_local_gpu(0)->get_device_id()); - load_data(source, data_reader_type, check_type, slot_size_array, num_samples, - data_source_params); - std::vector keys_elements_list(inference_parser_.num_embedding_tables); - std::vector row_ptr_elements_list(inference_parser_.num_embedding_tables); - for (size_t i = 0; i < inference_parser_.num_embedding_tables; i++) { - keys_elements_list[i] = - inference_params_.max_batchsize * inference_parser_.max_feature_num_for_tables[i]; - row_ptr_elements_list[i] = - inference_params_.max_batchsize * inference_parser_.slot_num_for_tables[i] + 1; - } - std::vector pred_size; - if (inference_parser_.label_dim == 1) { - pred_size = {inference_params_.max_batchsize * num_batches}; - } else { - pred_size = {inference_params_.max_batchsize * num_batches, inference_parser_.label_dim}; - } - auto pred = pybind11::array_t(pred_size); - pybind11::buffer_info pred_array_buff = pred.request(); - float* pred_ptr = static_cast(pred_array_buff.ptr); - size_t pred_ptr_offset = 0; - for (size_t batch = 0; batch < num_batches; batch++) { - long long current_batchsize = data_reader_->read_a_batch_to_device(); - size_t keys_offset = 0; - size_t row_ptrs_offset = 0; - std::vector h_reader_keys(inference_params_.max_batchsize * - inference_parser_.max_feature_num_per_sample); - std::vector> h_reader_row_ptrs_list; - for (size_t i = 0; i < inference_parser_.num_embedding_tables; i++) { - std::vector h_reader_row_ptrs(row_ptr_elements_list[i]); - convert_array_on_device( - d_row_ptrs_ + row_ptrs_offset, reinterpret_cast(d_reader_row_ptrs_list_[i]), - row_ptr_elements_list[i], resource_manager_->get_local_gpu(0)->get_stream()); - HCTR_LIB_THROW(cudaMemcpyAsync(h_reader_row_ptrs.data(), d_reader_row_ptrs_list_[i], - row_ptr_elements_list[i] * sizeof(TypeKey), - cudaMemcpyDeviceToHost, - resource_manager_->get_local_gpu(0)->get_stream())); - HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(0)->get_stream())); - size_t num_keys = h_reader_row_ptrs.back() - h_reader_row_ptrs.front(); - h_reader_row_ptrs_list.push_back(h_reader_row_ptrs); - HCTR_LIB_THROW(cudaMemcpyAsync(h_reader_keys.data() + keys_offset, d_reader_keys_list_[i], - num_keys * sizeof(TypeKey), cudaMemcpyDeviceToHost, - resource_manager_->get_local_gpu(0)->get_stream())); - keys_offset += num_keys; - row_ptrs_offset += row_ptr_elements_list[i]; - } - distribute_keys_for_inference(reinterpret_cast(h_embeddingcolumns_), - h_reader_keys.data(), current_batchsize, h_reader_row_ptrs_list, - inference_parser_.slot_num_for_tables); - InferenceSession::predict(d_reader_dense_, h_embeddingcolumns_, d_row_ptrs_, d_output_, - current_batchsize); - HCTR_LIB_THROW(cudaMemcpyAsync( - pred_ptr + pred_ptr_offset, d_output_, - inference_params_.max_batchsize * inference_parser_.label_dim * sizeof(float), - cudaMemcpyDeviceToHost, resource_manager_->get_local_gpu(0)->get_stream())); - pred_ptr_offset += inference_params_.max_batchsize * inference_parser_.label_dim; - } - HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(0)->get_stream())); - return pred; -} - -float InferenceSessionPy::evaluate(const size_t num_batches, const std::string& source, - const DataReaderType_t data_reader_type, - const Check_t check_type, - const std::vector& slot_size_array, - const long long num_samples, - const DataSourceParams& 
data_source_params) { - float auc_value; - if (inference_params_.i64_input_key) { - auc_value = evaluate_(num_batches, source, data_reader_type, check_type, - slot_size_array, num_samples, data_source_params); - } else { - auc_value = evaluate_(num_batches, source, data_reader_type, check_type, - slot_size_array, num_samples, data_source_params); - } - return auc_value; -} - -pybind11::array_t InferenceSessionPy::predict( - const size_t num_batches, const std::string& source, const DataReaderType_t data_reader_type, - const Check_t check_type, const std::vector& slot_size_array, - const long long num_samples, const DataSourceParams& data_source_params) { - if (inference_params_.i64_input_key) { - return predict_(num_batches, source, data_reader_type, check_type, slot_size_array, - num_samples, data_source_params); - } else { - return predict_(num_batches, source, data_reader_type, check_type, - slot_size_array, num_samples, data_source_params); - } -} - -void InferenceSessionPy::refresh_embedding_cache() { - parameter_server_->refresh_embedding_cache(inference_params_.model_name, - inference_params_.device_id); -} - -std::shared_ptr CreateInferenceSession( - const std::string& model_config_path, const InferenceParams& inference_params) { - std::vector model_config_path_array{model_config_path}; - std::vector inference_params_array{inference_params}; - parameter_server_config ps_config{model_config_path_array, inference_params_array}; - - auto parameter_server = HierParameterServerBase::create(ps_config); - auto embedding_cache = parameter_server->get_embedding_cache(inference_params.model_name, - inference_params.device_id); - std::shared_ptr inference_session = std::make_shared( - model_config_path, inference_params, embedding_cache, parameter_server); - return inference_session; -} - -void InferencePybind(pybind11::module& m) { - pybind11::module infer = m.def_submodule("inference", "inference submodule of hugectr"); - - pybind11::class_>(infer, - "VolatileDatabaseParams") - .def( - pybind11::init&>(), - pybind11::arg("type") = DatabaseType_t::ParallelHashMap, - // Backend specific. - pybind11::arg("address") = "127.0.0.1:7000", pybind11::arg("user_name") = "default", - pybind11::arg("password") = "", - pybind11::arg("num_partitions") = std::min(16u, std::thread::hardware_concurrency()), - pybind11::arg("allocation_rate") = 256L * 1024L * 1024L, - pybind11::arg("shared_memory_size") = 16L * 1024L * 1024L * 1024L, - pybind11::arg("shared_memory_name") = "hctr_mp_hash_map_database", - pybind11::arg("shared_memory_auto_remove") = true, - pybind11::arg("num_node_connections") = 5, pybind11::arg("max_batch_size") = 64L * 1024L, - pybind11::arg("enable_tls") = false, - pybind11::arg("tls_ca_certificate") = "cacertbundle.crt", - pybind11::arg("tls_client_certificate") = "client_cert.pem", - pybind11::arg("tls_client_key") = "client_key.pem", - pybind11::arg("tls_server_name_identification") = "redis.localhost", - // Overflow handling related. - pybind11::arg("overflow_margin") = std::numeric_limits::max(), - pybind11::arg("overflow_policy") = DatabaseOverflowPolicy_t::EvictRandom, - pybind11::arg("overflow_resolution_target") = 0.8, - // Caching behavior related. - pybind11::arg("initialize_after_startup") = true, - pybind11::arg("initial_cache_rate") = 1.0, - pybind11::arg("cache_missed_embeddings") = false, - // Real-time update mechanism related. 
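// evaluate() and predict() above branch once on i64_input_key and forward to a
// template instantiated for the reader's key type. The idiom in isolation
// (run_typed is a hypothetical stand-in for evaluate_/predict_):
template <typename TypeKey>
float run_typed() {
  return static_cast<float>(sizeof(TypeKey));  // placeholder body
}

float run(bool i64_input_key) {
  // 64-bit keys are read as long long, 32-bit keys as unsigned int.
  return i64_input_key ? run_typed<long long>() : run_typed<unsigned int>();
}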
- pybind11::arg("update_filters") = std::vector{"^hps_.+$"}); - - pybind11::class_>(infer, - "PersistentDatabaseParams") - .def(pybind11::init&>(), - pybind11::arg("backend") = DatabaseType_t::Disabled, - // Backend specific. - pybind11::arg("path") = (std::filesystem::temp_directory_path() / "rocksdb").string(), - pybind11::arg("num_threads") = 16, pybind11::arg("read_only") = false, - pybind11::arg("max_batch_size") = 64L * 1024L, - // Caching behavior related. - pybind11::arg("initialize_after_startup") = true, - // Real-time update mechanism related. - pybind11::arg("update_filters") = std::vector{"^hps_.+$"}); - - pybind11::class_>( - infer, "UpdateSourceParams") - .def(pybind11::init(), - pybind11::arg("type") = UpdateSourceType_t::Null, - // Backend specific. - pybind11::arg("brokers") = "127.0.0.1:9092", - pybind11::arg("metadata_refresh_interval_ms") = 30'000, - pybind11::arg("receive_buffer_size") = 256 * 1024, - pybind11::arg("poll_timeout_ms") = 500, pybind11::arg("max_batch_size") = 8 * 1024, - pybind11::arg("failure_backoff_ms") = 50, pybind11::arg("max_commit_interval") = 32); - - pybind11::enum_(infer, "EmbeddingCacheType_t") - .value("Dynamic", EmbeddingCacheType_t::Dynamic) - .value("UVM", EmbeddingCacheType_t::UVM) - .value("Static", EmbeddingCacheType_t::Static) - .value(hctr_enum_to_c_str(EmbeddingCacheType_t::Stochastic), EmbeddingCacheType_t::Stochastic) - .export_values(); - - pybind11::class_>( - infer, "InferenceParams") - .def(pybind11::init&, const int, const bool, const float, - const bool, const bool, const float, const bool, const bool, - // HugeCTR::DATABASE_TYPE, const std::string&, const std::string&, - // const float, - const int, const int, const int, const float, const std::vector&, - const std::vector&, const VolatileDatabaseParams&, - const PersistentDatabaseParams&, const UpdateSourceParams&, const int, - const float, const float, const std::vector&, - const std::vector&, const std::vector&, - const std::string&, const size_t, const size_t, const std::string&, bool, - const EmbeddingCacheType_t&, bool, bool, bool, bool, bool, bool>(), - - pybind11::arg("model_name"), pybind11::arg("max_batchsize"), - pybind11::arg("hit_rate_threshold"), pybind11::arg("dense_model_file"), - pybind11::arg("sparse_model_files"), pybind11::arg("device_id") = 0, - pybind11::arg("use_gpu_embedding_cache"), pybind11::arg("cache_size_percentage"), - pybind11::arg("i64_input_key"), pybind11::arg("use_mixed_precision") = false, - pybind11::arg("scaler") = 1.0, pybind11::arg("use_algorithm_search") = true, - pybind11::arg("use_cuda_graph") = true, - pybind11::arg("number_of_worker_buffers_in_pool") = 2, - pybind11::arg("number_of_refresh_buffers_in_pool") = 1, - pybind11::arg("thread_pool_size") = 16, - pybind11::arg("cache_refresh_percentage_per_iteration") = 0.0, - pybind11::arg("deployed_devices") = std::vector{0}, - pybind11::arg("default_value_for_each_table") = std::vector{0.0f}, - // Database backend. 
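// The parameter structs above are exposed with keyword arguments and defaults
// rather than positional-only constructors. The binding style in miniature
// (struct and module names hypothetical):
#include <memory>
#include <string>
#include <pybind11/pybind11.h>

namespace py = pybind11;

struct DemoParams {
  std::string address;
  int num_partitions;
};

PYBIND11_MODULE(demo_params, m) {
  py::class_<DemoParams, std::shared_ptr<DemoParams>>(m, "DemoParams")
      .def(py::init([](std::string address, int num_partitions) {
             return DemoParams{std::move(address), num_partitions};
           }),
           py::arg("address") = "127.0.0.1:7000",
           py::arg("num_partitions") = 16);
}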
- pybind11::arg("volatile_db") = VolatileDatabaseParams{}, - pybind11::arg("persistent_db") = PersistentDatabaseParams{}, - pybind11::arg("update_source") = UpdateSourceParams{}, - // HPS required - pybind11::arg("maxnum_des_feature_per_sample") = 26, - pybind11::arg("refresh_delay") = 0.0f, pybind11::arg("refresh_interval") = 0.0f, - pybind11::arg("maxnum_catfeature_query_per_table_per_sample") = std::vector{26}, - pybind11::arg("embedding_vecsize_per_table") = std::vector{128}, - pybind11::arg("embedding_table_names") = std::vector{""}, - pybind11::arg("network_file") = "", pybind11::arg("label_dim") = 1, - pybind11::arg("slot_num") = 10, pybind11::arg("non_trainable_params_file") = "", - pybind11::arg("use_static_table") = false, - pybind11::arg("embedding_cache_type") = EmbeddingCacheType_t::Dynamic, - pybind11::arg("use_context_stream") = true, - pybind11::arg("fuse_embedding_table") = false, - pybind11::arg("use_hctr_cache_implementation") = true, pybind11::arg("init_ec") = true, - pybind11::arg("enable_pagelock") = false, pybind11::arg("fp8_quant") = false); - - infer.def("CreateInferenceSession", &HugeCTR::python_lib::CreateInferenceSession, - pybind11::arg("model_config_path"), pybind11::arg("inference_params")); - - pybind11::class_>(infer, - "InferenceSession") - .def("evaluate", &HugeCTR::python_lib::InferenceSessionPy::evaluate, - pybind11::arg("num_batches"), pybind11::arg("source"), pybind11::arg("data_reader_type"), - pybind11::arg("check_type"), pybind11::arg("slot_size_array") = std::vector(), - pybind11::arg("num_samples") = 0, - pybind11::arg("data_source_params") = DataSourceParams()) - .def("predict", - pybind11::overload_cast&, long long, - const DataSourceParams&>( - &HugeCTR::python_lib::InferenceSessionPy::predict), - pybind11::arg("num_batches"), pybind11::arg("source"), pybind11::arg("data_reader_type"), - pybind11::arg("check_type"), pybind11::arg("slot_size_array") = std::vector(), - pybind11::arg("num_samples") = 0, - pybind11::arg("data_source_params") = DataSourceParams()) - .def("predict", - pybind11::overload_cast&, const std::vector&, - const std::vector&>( - &HugeCTR::python_lib::InferenceSessionPy::predict), - pybind11::arg("dense_feature"), pybind11::arg("embeddingcolumns"), - pybind11::arg("row_ptrs")) - .def("refresh_embedding_cache", - &HugeCTR::python_lib::InferenceSessionPy::refresh_embedding_cache); - pybind11::class_>( - infer, "InferenceModel") - .def(pybind11::init(), - pybind11::arg("model_config_path"), pybind11::arg("inference_params")) - .def( - "predict", - [](HugeCTR::InferenceModel& self, size_t num_batches, const std::string& source, - DataReaderType_t data_reader_type, Check_t check_type, - const std::vector& slot_size_array, - const DataSourceParams& data_source_params, bool read_file_seq) { - auto& inference_params = self.get_inference_params(); - auto& inference_parser = self.get_inference_parser(); - float* pred_output = new float[num_batches * inference_params.max_batchsize * - inference_parser.label_dim]; - self.predict(pred_output, num_batches, source, data_reader_type, check_type, - slot_size_array, data_source_params, read_file_seq); - auto pred_output_capsule = pybind11::capsule(pred_output, [](void* v) { - float* vv = reinterpret_cast(v); - delete[] vv; - }); - pybind11::array_t pred_array( - {num_batches * inference_params.max_batchsize, inference_parser.label_dim}, - pred_output, pred_output_capsule); - return pred_array; - }, - pybind11::arg("num_batches"), pybind11::arg("source"), pybind11::arg("data_reader_type"), - 
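// InferenceSession.predict above is bound twice through pybind11::overload_cast
// to separate the file-driven overload from the in-memory one. The mechanism on
// a toy class (names hypothetical):
#include <string>
#include <vector>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

namespace py = pybind11;

struct ToySession {
  float predict(const std::string& source) { return 0.0f; }
  float predict(const std::vector<float>& dense_feature) { return 1.0f; }
};

PYBIND11_MODULE(demo_session, m) {
  py::class_<ToySession>(m, "ToySession")
      .def(py::init<>())
      .def("predict", py::overload_cast<const std::string&>(&ToySession::predict),
           py::arg("source"))
      .def("predict",
           py::overload_cast<const std::vector<float>&>(&ToySession::predict),
           py::arg("dense_feature"));
}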
pybind11::arg("check_type"), pybind11::arg("slot_size_array") = std::vector(), - pybind11::arg("data_source_params") = DataSourceParams(), - pybind11::arg("reading_file_sequentially") = true) - .def("evaluate", &HugeCTR::InferenceModel::evaluate, pybind11::arg("num_batches"), - pybind11::arg("source"), pybind11::arg("data_reader_type"), pybind11::arg("check_type"), - pybind11::arg("slot_size_array") = std::vector(), - pybind11::arg("data_source_params") = DataSourceParams(), - pybind11::arg("reading_file_sequentially") = true) - .def( - "check_out_tensor", - [](HugeCTR::InferenceModel& self, const std::string& tensor_name) { - const auto tensor_info_tuple = self.get_tensor_info_by_name(tensor_name); - float* local_result = new float[std::get<1>(tensor_info_tuple)]; - auto local_result_capsule = pybind11::capsule(local_result, [](void* v) { - float* vv = reinterpret_cast(v); - delete[] vv; - }); - self.check_out_tensor(std::get<3>(tensor_info_tuple), local_result); - pybind11::array_t tensor_result(std::get<2>(tensor_info_tuple), local_result, - local_result_capsule); - return tensor_result; - }, - pybind11::arg("tensor_name")); -} - -} // namespace python_lib - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/src/CMakeLists.txt b/HugeCTR/src/CMakeLists.txt index 3a1eb02d79..840b23a3d5 100755 --- a/HugeCTR/src/CMakeLists.txt +++ b/HugeCTR/src/CMakeLists.txt @@ -48,6 +48,7 @@ file( list(REMOVE_ITEM huge_ctr_src "pybind/module_main.cpp") list(REMOVE_ITEM huge_ctr_src "inference_benchmark/metrics.cpp") + if(DISABLE_CUDF) list(REMOVE_ITEM huge_ctr_src "data_readers/file_source_parquet.cpp") list(REMOVE_ITEM huge_ctr_src "data_readers/metadata.cpp") diff --git a/HugeCTR/src/core23_network.cpp b/HugeCTR/src/core23_network.cpp index 90f8a5180c..0e26cdd690 100644 --- a/HugeCTR/src/core23_network.cpp +++ b/HugeCTR/src/core23_network.cpp @@ -179,50 +179,6 @@ void Core23TempNetwork::upload_params_to_device(const std::string& model_file) { return; } -void Core23TempNetwork::upload_params_to_device_inference(const std::string& model_file) { - auto fs = FileSystemBuilder::build_unique_by_path(model_file); - CudaDeviceContext context(get_device_id()); - - std::unique_ptr params(new char[evaluate_weight_tensor_->num_bytes()]); - fs->read(model_file, params.get(), evaluate_weight_tensor_->num_bytes(), 0); - HCTR_LIB_THROW(cudaMemcpyAsync(evaluate_weight_tensor_->data(), params.get(), - evaluate_weight_tensor_->num_bytes(), cudaMemcpyHostToDevice, - gpu_resource_->get_stream())); - if (use_mixed_precision_) { - conv_weight_(evaluate_weight_tensor_half_, evaluate_weight_tensor_); - } - return; -} - -void Core23TempNetwork::upload_non_trainable_params_to_device_inference( - const std::string& model_file) { - HCTR_LOG(INFO, ROOT, "Upload non-trainable parameters from JSON file to inference layers\n"); - const nlohmann::json& params_json(read_json_file(model_file)); - const nlohmann::json& params_for_layers = get_json(params_json, "layers"); - size_t counter = 0; - CudaDeviceContext context(get_device_id()); - for (size_t i{0}; i < evaluate_layers_.size(); ++i) { - auto params_tensors = evaluate_layers_[i]->get_non_trainable_params_as_tensors(); - if (!params_tensors.empty()) { - const nlohmann::json& params = params_for_layers[counter]; - std::string layer_type = get_value_from_json(params, "type"); - if (layer_type == "BatchNorm") { - std::vector running_mean = get_json(params, "mean"); - std::vector running_variance = get_json(params, "var"); - 
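// The predict and check_out_tensor lambdas above hand raw heap buffers to
// numpy with a py::capsule deleter, so Python frees the memory when the last
// array reference dies. The ownership pattern by itself:
#include <cstddef>
#include <pybind11/numpy.h>

namespace py = pybind11;

py::array_t<float> wrap_buffer(std::size_t rows, std::size_t cols) {
  float* buf = new float[rows * cols]();  // filled by the C++ side in reality
  py::capsule owner(buf, [](void* p) { delete[] static_cast<float*>(p); });
  // No copy is made: the array aliases buf and keeps the capsule as its base.
  return py::array_t<float>({rows, cols}, buf, owner);
}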
HCTR_LIB_THROW(cudaMemcpyAsync(params_tensors[0].data(), running_mean.data(), - params_tensors[0].num_bytes(), cudaMemcpyHostToDevice, - gpu_resource_->get_stream())); - HCTR_LIB_THROW(cudaMemcpyAsync(params_tensors[1].data(), running_variance.data(), - params_tensors[1].num_bytes(), cudaMemcpyHostToDevice, - gpu_resource_->get_stream())); - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "Only BatchNorm layer has non-trainable parameters"); - } - ++counter; - } - } -} - void Core23TempNetwork::download_params_to_host(float* weight) { CudaDeviceContext context(get_device_id()); diff --git a/HugeCTR/src/cpu/CMakeLists.txt b/HugeCTR/src/cpu/CMakeLists.txt deleted file mode 100644 index cd81caf358..0000000000 --- a/HugeCTR/src/cpu/CMakeLists.txt +++ /dev/null @@ -1,55 +0,0 @@ -# -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -file(GLOB cpu_inference_src - layers/add_layer_cpu.cpp - layers/batch_norm_layer_cpu.cpp - layers/cast_layer_cpu.cpp - layers/concat_layer_cpu.cpp - layers/dropout_layer_cpu.cpp - layers/elu_layer_cpu.cpp - layers/fm_order2_layer_cpu.cpp - layers/fully_connected_layer_cpu.cpp - layers/fully_connected_layer_half_cpu.cpp - layers/fused_fully_connected_layer_cpu.cpp - layers/interaction_layer_cpu.cpp - layers/multi_cross_layer_cpu.cpp - layers/reduce_sum_layer_cpu.cpp - layers/relu_layer_cpu.cpp - layers/reshape_layer_cpu.cpp - layers/sigmoid_layer_cpu.cpp - layers/slice_layer_cpu.cpp - layers/weight_multiply_layer_cpu.cpp - network_cpu.cpp - embedding_feature_combiner_cpu.cpp - create_network_cpu.cpp - create_embedding_cpu.cpp - create_pipeline_cpu.cpp - inference_session_cpu.cpp -) - -add_library(cpu_inference_shared SHARED ${cpu_inference_src}) - -target_link_libraries(cpu_inference_shared PUBLIC ${CUDART_LIB} CUDA::cublas CUDA::curand cudnn nccl) - -target_link_libraries(cpu_inference_shared PUBLIC ${CMAKE_THREAD_LIBS_INIT}) - -target_link_libraries(cpu_inference_shared PRIVATE nlohmann_json::nlohmann_json) - -target_link_libraries(cpu_inference_shared PUBLIC huge_ctr_inference) - -target_compile_features(cpu_inference_shared PUBLIC cxx_std_17) - - diff --git a/HugeCTR/src/cpu/create_embedding_cpu.cpp b/HugeCTR/src/cpu/create_embedding_cpu.cpp deleted file mode 100644 index 0fb3573f3b..0000000000 --- a/HugeCTR/src/cpu/create_embedding_cpu.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
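// The removed upload_non_trainable_params_to_device_inference() pushed each
// BatchNorm layer's running mean/variance from JSON onto the device. The core
// copy, sketched against the raw CUDA runtime (tensor plumbing elided):
#include <vector>
#include <cuda_runtime.h>

void upload_batchnorm_stats(float* d_mean, float* d_var,
                            const std::vector<float>& running_mean,
                            const std::vector<float>& running_var,
                            cudaStream_t stream) {
  cudaMemcpyAsync(d_mean, running_mean.data(),
                  running_mean.size() * sizeof(float), cudaMemcpyHostToDevice,
                  stream);
  cudaMemcpyAsync(d_var, running_var.data(),
                  running_var.size() * sizeof(float), cudaMemcpyHostToDevice,
                  stream);
  // Synchronize before the host vectors can go out of scope.
  cudaStreamSynchronize(stream);
}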
- */
-
-#include <cpu/create_embedding_cpu.hpp>
-
-namespace HugeCTR {
-
-template <typename TypeEmbedding>
-void create_embedding_cpu<TypeEmbedding>::operator()(
-    const InferenceParams& inference_params, const nlohmann::json& j_layers_array,
-    std::vector<std::shared_ptr<Tensor2<int>>>& rows,
-    std::vector<std::shared_ptr<Tensor2<float>>>& embeddingvecs,
-    std::vector<size_t>& embedding_table_slot_size, std::vector<TensorEntry>* tensor_entries,
-    std::vector<std::shared_ptr<LayerCPU>>* embeddings,
-    std::shared_ptr<GeneralBuffer2<HostAllocator>>& blobs_buff) {
-  HCTR_LOG(INFO, ROOT, "start create embedding for inference\n");
-  auto j_data = j_layers_array[0];
-  if (!has_key_(j_data, "sparse")) {
-    HCTR_LOG(INFO, ROOT, "no sparse data input\n");
-    return;
-  }
-  auto j_sparse_input = get_json(j_data, "sparse");
-  std::unordered_map<std::string, std::pair<int, int>> slot_nums_map;
-  for (unsigned int i = 0; i < j_sparse_input.size(); ++i) {
-    auto top = get_value_from_json<std::string>(j_sparse_input[i], "top");
-    auto slot_num = get_value_from_json<int>(j_sparse_input[i], "slot_num");
-    auto max_feature_num_per_sample =
-        get_max_feature_num_per_sample_from_nnz_per_slot(j_sparse_input[i]);
-    HCTR_LOG_S(INFO, ROOT) << "sparse_input name " << top << std::endl;
-    slot_nums_map[top] = std::make_pair(slot_num, max_feature_num_per_sample);
-  }
-  if (j_layers_array.size() < 1) {
-    HCTR_OWN_THROW(Error_t::WrongInput, "layer not defined in config");
-  }
-  for (unsigned int i = 1; i < j_layers_array.size(); i++) {
-    const nlohmann::json& j = j_layers_array[i];
-    auto bottom_array = get_json(j, "bottom");
-    if (bottom_array.is_array()) {
-      continue;
-    }
-    std::string bottom = bottom_array.get<std::string>();
-    auto slot_nums_map_iter = slot_nums_map.find(bottom);
-    if (slot_nums_map_iter == slot_nums_map.end()) {
-      continue;
-    }
-    const std::string layer_top = get_value_from_json<std::string>(j, "top");
-    int slot_num = slot_nums_map_iter->second.first;
-    int max_feature_num_per_sample = slot_nums_map_iter->second.second;
-    auto j_hparam = get_json(j, "sparse_embedding_hparam");
-    auto combiner = get_value_from_json<std::string>(j_hparam, "combiner");
-    EmbeddingFeatureCombiner_t feature_combiner_type;
-    if (combiner == "sum") {
-      feature_combiner_type = EmbeddingFeatureCombiner_t::Sum;
-    } else if (combiner == "mean") {
-      feature_combiner_type = EmbeddingFeatureCombiner_t::Mean;
-    } else {
-      HCTR_OWN_THROW(Error_t::WrongInput, "combiner need to be 0 or 1");
-    }
-    size_t embedding_vec_size = get_value_from_json<size_t>(j_hparam, "embedding_vec_size");
-
-    size_t prefix_slot_num = embedding_table_slot_size.back();
-    embedding_table_slot_size.push_back(prefix_slot_num + slot_num);
-
-    std::vector<size_t> row_dims = {
-        static_cast<size_t>(inference_params.max_batchsize * slot_num + 1)};
-    std::vector<size_t> embeddingvecs_dims = {
-        static_cast<size_t>(inference_params.max_batchsize * max_feature_num_per_sample),
-        static_cast<size_t>(embedding_vec_size)};
-    std::shared_ptr<Tensor2<int>> row_tensor = std::make_shared<Tensor2<int>>();
-    std::shared_ptr<Tensor2<float>> embeddingvecs_tensor = std::make_shared<Tensor2<float>>();
-    blobs_buff->reserve(row_dims, row_tensor.get());
-    blobs_buff->reserve(embeddingvecs_dims, embeddingvecs_tensor.get());
-    rows.push_back(row_tensor);
-    embeddingvecs.push_back(embeddingvecs_tensor);
-    Tensor2<TypeEmbedding> embedding_output;
-    embeddings->push_back(std::make_shared<EmbeddingFeatureCombinerCPU<TypeEmbedding>>(
-        embeddingvecs[0], rows[0], embedding_output, inference_params.max_batchsize, slot_num,
-        feature_combiner_type, blobs_buff));
-    tensor_entries->push_back({layer_top, embedding_output.shrink()});
-  }
-  HCTR_LOG(INFO, ROOT, "create cpu embedding for inference success\n");
-}
-
-template struct create_embedding_cpu<float>;
-template struct create_embedding_cpu<__half>;
-
-}  // namespace HugeCTR
diff --git a/HugeCTR/src/cpu/create_network_cpu.cpp b/HugeCTR/src/cpu/create_network_cpu.cpp
deleted file mode 100644
index 64adfc9787..0000000000 --- a/HugeCTR/src/cpu/create_network_cpu.cpp +++ /dev/null @@ -1,688 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef ENABLE_MPI -#include -#endif - -namespace HugeCTR { - -struct InputOutputInfo { - std::vector inputs; - std::vector output_names; -}; - -static bool get_tensor_from_entries(const std::vector tensor_entries, - const std::string& name, TensorBag2* bag) { - for (const TensorEntry& entry : tensor_entries) { - if (entry.name == name) { - *bag = entry.bag; - return true; - } - } - return false; -} - -static InputOutputInfo get_input_tensor_and_output_name( - const nlohmann::json& json, const std::vector& tensor_entries) { - auto bottom = get_json(json, "bottom"); - auto top = get_json(json, "top"); - - std::vector bottom_names = get_layer_names(bottom); - std::vector top_names = get_layer_names(top); - - std::vector bottom_bags; - - for (auto& bottom_name : bottom_names) { - for (auto& top_name : top_names) { - if (bottom_name == top_name) { - HCTR_OWN_THROW(Error_t::WrongInput, "bottom and top include a same layer name"); - } - } - TensorBag2 bag; - if (!get_tensor_from_entries(tensor_entries, bottom_name, &bag)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such bottom: " + bottom_name); - } - bottom_bags.push_back(bag); - } - return {bottom_bags, top_names}; -} - -void create_layers(const nlohmann::json& j_array, std::vector& tensor_entries, - const std::shared_ptr>& blobs_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& weight_buff_half, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& wgrad_buff_half, - bool use_mixed_precision, std::vector>& layers) { - for (unsigned int i = 1; i < j_array.size(); i++) { - const nlohmann::json& j = j_array[i]; - const auto layer_type_name = get_value_from_json(j, "type"); - Layer_t layer_type; - - const auto& layer_map = use_mixed_precision ? 
LAYER_TYPE_MAP_MP : LAYER_TYPE_MAP; - - if (!find_item_in_map(layer_type, layer_type_name, layer_map)) { - Embedding_t embedding_type; - if (!find_item_in_map(embedding_type, layer_type_name, EMBEDDING_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such layer: " + layer_type_name); - } - continue; - } - - std::vector output_tensor_entries; - auto input_output_info = get_input_tensor_and_output_name(j, tensor_entries); - if (layer_type == Layer_t::CrossEntropyLoss || layer_type == Layer_t::BinaryCrossEntropyLoss || - layer_type == Layer_t::MultiCrossEntropyLoss) { - HCTR_OWN_THROW(Error_t::WrongInput, "Loss layer is not supported for NetworkCPU"); - } - switch (layer_type) { - case Layer_t::BatchNorm: { - // get BN params - auto j_bn_hparam = get_json(j, "bn_param"); - auto factor = get_value_from_json(j_bn_hparam, "factor"); - auto eps = get_value_from_json(j_bn_hparam, "eps"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_bn_hparam, "gamma_init")) { - const auto gamma_init_name = get_value_from_json(j_bn_hparam, "gamma_init"); - Initializer_t gamma_init_type; - if (!find_item_in_map(gamma_init_type, gamma_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + gamma_init_name); - } else { - initializer_types[0] = gamma_init_type; - } - } - if (has_key_(j_bn_hparam, "beta_init")) { - const auto beta_init_name = get_value_from_json(j_bn_hparam, "beta_init"); - Initializer_t beta_init_type; - if (!find_item_in_map(beta_init_type, beta_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + beta_init_name); - } else { - initializer_types[1] = beta_init_type; - } - } - - if (use_mixed_precision) { - Tensor2<__half> bn_in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - // establish out tensor - Tensor2<__half> bn_out_tensor; - blobs_buff->reserve(bn_in_tensor.get_dimensions(), &bn_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], bn_out_tensor.shrink()}); - - BatchNormLayerCPU<__half>::Params params = {factor, eps}; - layers.emplace_back(new BatchNormLayerCPU<__half>(weight_buff, wgrad_buff, blobs_buff, - bn_in_tensor, bn_out_tensor, params)); - } else { - Tensor2 bn_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - // establish out tensor - Tensor2 bn_out_tensor; - blobs_buff->reserve(bn_in_tensor.get_dimensions(), &bn_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], bn_out_tensor.shrink()}); - - BatchNormLayerCPU::Params params = {factor, eps}; - layers.emplace_back(new BatchNormLayerCPU(weight_buff, wgrad_buff, blobs_buff, - bn_in_tensor, bn_out_tensor, params)); - } - break; - } - case Layer_t::Concat: { - if (use_mixed_precision) { - Tensors2<__half> in_tensors; - for (const TensorBag2& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2<__half>::stretch_from(bag)); - } - Tensor2<__half> out_tensor; - layers.emplace_back(new ConcatLayerCPU<__half>(in_tensors, out_tensor, blobs_buff)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensors2 in_tensors; - for (const TensorBag2& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2::stretch_from(bag)); - } - Tensor2 out_tensor; - layers.emplace_back(new ConcatLayerCPU(in_tensors, out_tensor, blobs_buff)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } 
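// create_layers() resolves the JSON "type" string through a name-to-enum map
// (with a separate map for mixed precision) before entering the big switch.
// The lookup idiom, reduced to two illustrative entries:
#include <string>
#include <unordered_map>

enum class Layer_t { BatchNorm, Concat };

bool find_layer_type(const std::string& name, Layer_t& out) {
  static const std::unordered_map<std::string, Layer_t> layer_map = {
      {"BatchNorm", Layer_t::BatchNorm}, {"Concat", Layer_t::Concat}};
  auto it = layer_map.find(name);
  if (it == layer_map.end()) return false;  // caller then tries the embedding map
  out = it->second;
  return true;
}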
- break; - } - case Layer_t::Dropout: { - if (use_mixed_precision) { - Tensor2<__half> do_in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - // establish out tensor - Tensor2<__half> do_out_tensor; - blobs_buff->reserve(do_in_tensor.get_dimensions(), &do_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], do_out_tensor.shrink()}); - // get ELU params - auto rate_it = j.find("rate"); - auto rate = (rate_it != j.end()) ? rate_it->get() : 0.5f; - layers.emplace_back( - new DropoutLayerCPU<__half>(do_in_tensor, do_out_tensor, blobs_buff, rate)); - } else { - // establish out tensor - Tensor2 do_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 do_out_tensor; - blobs_buff->reserve(do_in_tensor.get_dimensions(), &do_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], do_out_tensor.shrink()}); - // get ELU params - auto rate_it = j.find("rate"); - auto rate = (rate_it != j.end()) ? rate_it->get() : 0.5f; - layers.emplace_back( - new DropoutLayerCPU(do_in_tensor, do_out_tensor, blobs_buff, rate)); - } - break; - } - case Layer_t::ELU: { - if (use_mixed_precision) { - Tensor2<__half> elu_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - - // establish out tensor - Tensor2<__half> elu_out_tensor; - blobs_buff->reserve(elu_in_tensor.get_dimensions(), &elu_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], elu_out_tensor.shrink()}); - // get ELU params - auto j_elu_hparam = get_json(j, "elu_param"); - auto alpha = get_value_from_json(j_elu_hparam, "alpha"); - layers.emplace_back(new EluLayerCPU<__half>(elu_in_tensor, elu_out_tensor, alpha)); - - } else { - Tensor2 elu_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - - // establish out tensor - Tensor2 elu_out_tensor; - blobs_buff->reserve(elu_in_tensor.get_dimensions(), &elu_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], elu_out_tensor.shrink()}); - // get ELU params - auto j_elu_hparam = get_json(j, "elu_param"); - auto alpha = get_value_from_json(j_elu_hparam, "alpha"); - layers.emplace_back(new EluLayerCPU(elu_in_tensor, elu_out_tensor, alpha)); - } - break; - } - - case Layer_t::FusedInnerProduct: { - auto j_fc_param = get_json(j, "fc_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_fc_param, "weight_init")) { - const auto weight_init_name = get_value_from_json(j_fc_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_fc_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_fc_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - // establish out tensor - auto output = get_value_from_json(j_fc_param, "num_output"); - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> fc_out_tensor; - blobs_buff->reserve({(in_tensor.get_dimensions())[0], output}, &fc_out_tensor); - 
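// Every case in this switch repeats its body once for float and once for
// __half, selected by use_mixed_precision; note Dropout also defaults its
// "rate" to 0.5f when the JSON omits it. A function template captures the
// shared shape (tensor machinery elided, illustration only):
#include <cuda_fp16.h>

template <typename T>
void build_dropout_layer(float rate) {
  // reserve the T-typed output tensor and emplace a DropoutLayerCPU<T> ...
  (void)rate;
}

void build_dropout(bool use_mixed_precision, float rate = 0.5f) {
  if (use_mixed_precision) {
    build_dropout_layer<__half>(rate);
  } else {
    build_dropout_layer<float>(rate);
  }
}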
output_tensor_entries.push_back( - {input_output_info.output_names[0], fc_out_tensor.shrink()}); - - // establish layer - layers.emplace_back(new FusedFullyConnectedLayerCPU(weight_buff, weight_buff_half, - wgrad_buff_half, blobs_buff, - in_tensor, fc_out_tensor)); - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "FusedInnerProduct support half only"); - } - break; - } - - case Layer_t::Cast: { - if (use_mixed_precision) { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - layers.emplace_back(new CastLayerCPU(in_tensor, out_tensor)); - } else { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - layers.emplace_back(new CastLayerCPU<__half, float>(in_tensor, out_tensor)); - } - break; - } - - case Layer_t::InnerProduct: { - auto j_fc_param = get_json(j, "fc_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_fc_param, "weight_init")) { - const auto weight_init_name = get_value_from_json(j_fc_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_fc_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_fc_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - - // establish out tensor - auto output = get_value_from_json(j_fc_param, "num_output"); - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> fc_out_tensor; - blobs_buff->reserve({in_tensor.get_dimensions()[0], output}, &fc_out_tensor); - - // establish layer - layers.emplace_back(new FullyConnectedLayerCPU<__half>(weight_buff, weight_buff_half, - wgrad_buff_half, blobs_buff, - in_tensor, fc_out_tensor)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], fc_out_tensor.shrink()}); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 fc_out_tensor; - blobs_buff->reserve({in_tensor.get_dimensions()[0], output}, &fc_out_tensor); - // establish layer - layers.emplace_back(new FullyConnectedLayerCPU( - weight_buff, wgrad_buff, in_tensor, fc_out_tensor, use_mixed_precision)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], fc_out_tensor.shrink()}); - } - break; - } - - case Layer_t::Interaction: { - // lambda template could be a better solution here, but there's not support in c++11 - if (use_mixed_precision) { - Tensor2<__half> in_mlp_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> in_emb_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[1]); - Tensor2<__half> out_tensor; - - layers.emplace_back(new InteractionLayerCPU<__half>( - in_mlp_tensor, 
in_emb_tensor, out_tensor, - blobs_buff, // todo cannot use this blobs_buff here need half - use_mixed_precision)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - - } else { - Tensor2 in_mlp_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 in_emb_tensor = Tensor2::stretch_from(input_output_info.inputs[1]); - Tensor2 out_tensor; - layers.emplace_back(new InteractionLayerCPU( - in_mlp_tensor, in_emb_tensor, out_tensor, blobs_buff, use_mixed_precision)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } - - break; - } - case Layer_t::MultiCross: { - auto j_mc_param = get_json(j, "mc_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_mc_param, "weight_init")) { - const auto weight_init_name = get_value_from_json(j_mc_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_mc_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_mc_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - - // establish out tensor - auto num_layers = get_value_from_json(j_mc_param, "num_layers"); - Tensor2 mc_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - blobs_buff->reserve(mc_in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - // establish layer - layers.emplace_back(new MultiCrossLayerCPU(weight_buff, wgrad_buff, blobs_buff, - mc_in_tensor, out_tensor, num_layers)); - break; - } - case Layer_t::ReLU: { - if (use_mixed_precision) { - Tensor2<__half> relu_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> relu_out_tensor; - blobs_buff->reserve(relu_in_tensor.get_dimensions(), &relu_out_tensor); - layers.emplace_back(new ReluLayerCPU<__half>(relu_in_tensor, relu_out_tensor)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], relu_out_tensor.shrink()}); - } else { - // establish out tensor - Tensor2 relu_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 relu_out_tensor; - blobs_buff->reserve(relu_in_tensor.get_dimensions(), &relu_out_tensor); - layers.emplace_back(new ReluLayerCPU(relu_in_tensor, relu_out_tensor)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], relu_out_tensor.shrink()}); - } - - break; - } - case Layer_t::Reshape: { - auto selected_it = j.find("selected"); - // selective reshape - if (selected_it != j.end()) { - std::vector selected; - nlohmann::json j_selected = (selected_it.value()); - for (auto slot_obj : j_selected) { - int slot_id = slot_obj.get(); - if (slot_id < 0) { - HCTR_OWN_THROW(Error_t::WrongInput, "slot_id < 0"); - } - selected.push_back(slot_id); - } - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - layers.emplace_back( - new ReshapeLayerCPU<__half>(in_tensor, out_tensor, blobs_buff, 
selected)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - layers.emplace_back( - new ReshapeLayerCPU(in_tensor, out_tensor, blobs_buff, selected)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } - } - // general purpose reshape - else { - auto leading_dim_it = j.find("leading_dim"); - - // if leading_dim is not specified, default leading_dim = n_slots * vector_length - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - const auto& in_dims = in_tensor.get_dimensions(); - size_t leading_dim = (leading_dim_it != j.end()) - ? (*leading_dim_it).get() - : in_tensor.get_num_elements() / in_dims[0]; - layers.emplace_back( - new ReshapeLayerCPU<__half>(in_tensor, out_tensor, blobs_buff, leading_dim)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - const auto& in_dims = in_tensor.get_dimensions(); - size_t leading_dim = (leading_dim_it != j.end()) - ? (*leading_dim_it).get() - : in_tensor.get_num_elements() / in_dims[0]; - layers.emplace_back( - new ReshapeLayerCPU(in_tensor, out_tensor, blobs_buff, leading_dim)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } - } - break; - } - case Layer_t::Sigmoid: { - if (use_mixed_precision) { - Tensor2<__half> sigmoid_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> sigmoid_out_tensor; - blobs_buff->reserve(sigmoid_in_tensor.get_dimensions(), &sigmoid_out_tensor); - layers.emplace_back(new SigmoidLayerCPU<__half>(sigmoid_in_tensor, sigmoid_out_tensor)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - } else { - // establish out tensor - Tensor2 sigmoid_in_tensor = - Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 sigmoid_out_tensor; - blobs_buff->reserve(sigmoid_in_tensor.get_dimensions(), &sigmoid_out_tensor); - layers.emplace_back(new SigmoidLayerCPU(sigmoid_in_tensor, sigmoid_out_tensor)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - } - break; - } - case Layer_t::Slice: { - std::vector> ranges; - auto j_ranges = get_json(j, "ranges"); - assert(j_ranges.is_array()); - for (auto j_range : j_ranges) { - assert(j_range.is_array()); - ranges.emplace_back(std::make_pair(j_range[0].get(), j_range[1].get())); - } - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensors2<__half> out_tensors; - layers.emplace_back( - new SliceLayerCPU<__half>(in_tensor, out_tensors, blobs_buff, ranges)); - for (size_t i = 0; i < out_tensors.size(); i++) { - output_tensor_entries.push_back( - {input_output_info.output_names[i], out_tensors[i].shrink()}); - } - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensors2 out_tensors; - layers.emplace_back(new SliceLayerCPU(in_tensor, out_tensors, blobs_buff, ranges)); - for (size_t i = 0; i < out_tensors.size(); i++) { - output_tensor_entries.push_back( - {input_output_info.output_names[i], out_tensors[i].shrink()}); - } - } - break; - } - case 
Layer_t::WeightMultiply: { - std::vector weight_dims; - auto dims = get_json(j, "weight_dims"); - assert(dims.is_array()); - for (auto dim : dims) { - weight_dims.emplace_back(dim.get()); - } - - // establish initializer - std::vector initializer_types(1, Initializer_t::Default); - if (has_key_(j, "weight_init")) { - const auto weight_init_name = get_value_from_json(j, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - layers.emplace_back(new WeightMultiplyLayerCPU<__half>( - weight_buff_half, wgrad_buff_half, blobs_buff, in_tensor, out_tensor, weight_dims)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - layers.emplace_back(new WeightMultiplyLayerCPU( - weight_buff, wgrad_buff, blobs_buff, in_tensor, out_tensor, weight_dims)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } - break; - } - case Layer_t::FmOrder2: { - auto out_dim = get_json(j, "out_dim").get(); - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - blobs_buff->reserve({in_tensor.get_dimensions()[0], out_dim}, &out_tensor); - - layers.emplace_back(new FmOrder2LayerCPU<__half>(in_tensor, out_tensor)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - blobs_buff->reserve({in_tensor.get_dimensions()[0], out_dim}, &out_tensor); - - layers.emplace_back(new FmOrder2LayerCPU(in_tensor, out_tensor)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } - break; - } - case Layer_t::Add: { - if (use_mixed_precision) { - Tensors2<__half> in_tensors; - for (const auto& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2<__half>::stretch_from(bag)); - } - Tensor2<__half> out_tensor; - blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); - layers.emplace_back(new AddLayerCPU<__half>(in_tensors, out_tensor, blobs_buff)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensors2 in_tensors; - for (const auto& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2::stretch_from(bag)); - } - Tensor2 out_tensor; - blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); - layers.emplace_back(new AddLayerCPU(in_tensors, out_tensor, blobs_buff)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } - break; - } - case Layer_t::ReduceSum: { - int axis = get_json(j, "axis").get(); - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - layers.emplace_back( - new ReduceSumLayerCPU<__half>(in_tensor, out_tensor, blobs_buff, axis)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensor2 
in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - layers.emplace_back( - new ReduceSumLayerCPU(in_tensor, out_tensor, blobs_buff, axis)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } - break; - } - default: - assert(!"Error: no such layer && should never get here!"); - } // end of switch - - for (auto& output_tensor_entry : output_tensor_entries) { - tensor_entries.push_back(output_tensor_entry); - } - } // for layers - for (auto entry : tensor_entries) { - HCTR_LOG_S(INFO, WORLD) << "layer: " << entry.name << std::endl; - } -} - -/* - * Create single network - * - */ -NetworkCPU* NetworkCPU::create_network(const nlohmann::json& j_array, - std::vector& tensor_entries, - const std::shared_ptr& cpu_resource, - bool use_mixed_precision) { - NetworkCPU* network = new NetworkCPU(cpu_resource, use_mixed_precision); - - auto& layers = network->layers_; - - std::shared_ptr> blobs_buff = - GeneralBuffer2::create(); - - std::shared_ptr> weight_buff = blobs_buff->create_block(); - std::shared_ptr> weight_buff_half = blobs_buff->create_block<__half>(); - std::shared_ptr> wgrad_buff = blobs_buff->create_block(); - std::shared_ptr> wgrad_buff_half = blobs_buff->create_block<__half>(); - - // create layers - create_layers(j_array, tensor_entries, blobs_buff, weight_buff, weight_buff_half, wgrad_buff, - wgrad_buff_half, use_mixed_precision, layers); - - TensorEntry pred_tensor_entry = tensor_entries.back(); - network->pred_tensor_ = Tensor2::stretch_from(pred_tensor_entry.bag); - network->weight_tensor_ = weight_buff->as_tensor(); - network->weight_tensor_half_ = weight_buff_half->as_tensor(); - network->wgrad_tensor_ = wgrad_buff->as_tensor(); - network->wgrad_tensor_half_ = wgrad_buff_half->as_tensor(); - blobs_buff->allocate(); - - return network; -} - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/create_pipeline_cpu.cpp b/HugeCTR/src/cpu/create_pipeline_cpu.cpp deleted file mode 100644 index fac2514e47..0000000000 --- a/HugeCTR/src/cpu/create_pipeline_cpu.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
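// NetworkCPU::create_network() above reserves every tensor against one
// GeneralBuffer2 while layers are constructed and calls allocate() exactly
// once at the end. The two-phase idiom on a simplified single-type arena
// (names hypothetical):
#include <cstddef>
#include <memory>
#include <numeric>
#include <vector>

struct Arena {
  std::vector<std::size_t> reserved;  // element counts recorded during build
  std::unique_ptr<float[]> block;

  void reserve(std::size_t n) { reserved.push_back(n); }

  void allocate() {  // one backing allocation for all reservations
    std::size_t total =
        std::accumulate(reserved.begin(), reserved.end(), std::size_t{0});
    block = std::make_unique<float[]>(total);
  }
};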
- */ - -#include -#include - -namespace HugeCTR { - -template -void create_pipeline_inference_cpu(const nlohmann::json& config, - std::map tensor_active, - const InferenceParams& inference_params, - Tensor2& dense_input, - std::vector>>& rows, - std::vector>>& embeddingvecs, - std::vector& embedding_table_slot_size, - std::vector>* embeddings, - NetworkCPU** network, - const std::shared_ptr& cpu_resource) { - std::vector tensor_entries; - - auto j_layers_array = get_json(config, "layers"); - check_graph(tensor_active, j_layers_array); - - auto input_buffer = GeneralBuffer2::create(); - - { - const nlohmann::json& j_data = j_layers_array[0]; - auto j_dense = get_json(j_data, "dense"); - auto top_strs_dense = get_value_from_json(j_dense, "top"); - auto dense_dim = get_value_from_json(j_dense, "dense_dim"); - - input_buffer->reserve({inference_params.max_batchsize, dense_dim}, &dense_input); - tensor_entries.push_back({top_strs_dense, dense_input.shrink()}); - } - - create_embedding_cpu()(inference_params, j_layers_array, rows, embeddingvecs, - embedding_table_slot_size, &tensor_entries, embeddings, - input_buffer); - input_buffer->allocate(); - - *network = NetworkCPU::create_network(j_layers_array, tensor_entries, cpu_resource, - inference_params.use_mixed_precision); -} - -void create_pipeline_cpu(const nlohmann::json& config, std::map tensor_active, - const InferenceParams& inference_params, Tensor2& dense_input, - std::vector>>& rows, - std::vector>>& embeddingvecs, - std::vector& embedding_table_slot_size, - std::vector>* embeddings, NetworkCPU** network, - const std::shared_ptr& cpu_resource) { - if (inference_params.use_mixed_precision) { - create_pipeline_inference_cpu<__half>(config, tensor_active, inference_params, dense_input, - rows, embeddingvecs, embedding_table_slot_size, - embeddings, network, cpu_resource); - } else { - create_pipeline_inference_cpu(config, tensor_active, inference_params, dense_input, rows, - embeddingvecs, embedding_table_slot_size, embeddings, - network, cpu_resource); - } -} - -template void create_pipeline_inference_cpu( - const nlohmann::json& config, std::map tensor_active, - const InferenceParams& inference_params, Tensor2& dense_input, - std::vector>>& rows, - std::vector>>& embeddingvecs, - std::vector& embedding_table_slot_size, - std::vector>* embeddings, NetworkCPU** network, - const std::shared_ptr& cpu_resource); -template void create_pipeline_inference_cpu<__half>( - const nlohmann::json& config, std::map tensor_active, - const InferenceParams& inference_params, Tensor2& dense_input, - std::vector>>& rows, - std::vector>>& embeddingvecs, - std::vector& embedding_table_slot_size, - std::vector>* embeddings, NetworkCPU** network, - const std::shared_ptr& cpu_resource); - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/embedding_feature_combiner_cpu.cpp b/HugeCTR/src/cpu/embedding_feature_combiner_cpu.cpp deleted file mode 100644 index 513254cee5..0000000000 --- a/HugeCTR/src/cpu/embedding_feature_combiner_cpu.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cpu/embedding_feature_combiner_cpu.hpp>
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-namespace {
-
-template <typename TypeEmbedding>
-void embedding_feature_combine_cpu(const float* input, TypeEmbedding* output, const int* row_ptrs,
-                                   int batch_size, int slot_num, int embedding_vec_size,
-                                   EmbeddingFeatureCombiner_t combiner_type) {
-  for (int i = 0; i < batch_size; i++) {
-    for (int j = 0; j < slot_num; j++) {
-      int feature_row_index = i * slot_num + j;
-      int row_offset = row_ptrs[feature_row_index];  // row offset within input
-      int feature_num =
-          row_ptrs[feature_row_index + 1] - row_offset;  // num of feature vectors in one slot
-
-      for (int k = 0; k < embedding_vec_size; k++) {
-        float tmp = 0.0f;
-        for (int l = 0; l < feature_num; l++) {
-          tmp += input[(row_offset + l) * embedding_vec_size + k];
-        }  // end for l
-        if (combiner_type == EmbeddingFeatureCombiner_t::Mean) tmp /= feature_num;
-        output[feature_row_index * embedding_vec_size + k] = tmp;
-      }  // end for k
-    }  // end for j
-  }  // end for i
-}
-
-template <>
-void embedding_feature_combine_cpu(const float* input, __half* output, const int* row_ptrs,
-                                   int batch_size, int slot_num, int embedding_vec_size,
-                                   EmbeddingFeatureCombiner_t combiner_type) {
-  for (int i = 0; i < batch_size; i++) {
-    for (int j = 0; j < slot_num; j++) {
-      int feature_row_index = i * slot_num + j;
-      int row_offset = row_ptrs[feature_row_index];  // row offset within input
-      int feature_num =
-          row_ptrs[feature_row_index + 1] - row_offset;  // num of feature vectors in one slot
-
-      for (int k = 0; k < embedding_vec_size; k++) {
-        float tmp = 0.0f;
-        for (int l = 0; l < feature_num; l++) {
-          tmp += __half2float(input[(row_offset + l) * embedding_vec_size + k]);
-        }  // end for l
-        if (combiner_type == EmbeddingFeatureCombiner_t::Mean && feature_num > 1) {
-          tmp /= feature_num;
-        }
-        output[feature_row_index * embedding_vec_size + k] = __float2half(tmp);
-      }  // end for k
-    }  // end for j
-  }  // end for i
-}
-
-}  // end of namespace
-
-template <typename TypeEmbedding>
-EmbeddingFeatureCombinerCPU<TypeEmbedding>::EmbeddingFeatureCombinerCPU(
-    const std::shared_ptr<Tensor2<float>>& in_tensor,
-    const std::shared_ptr<Tensor2<int>>& row_ptrs_tensor, Tensor2<TypeEmbedding>& out_tensor,
-    int batch_size, int slot_num, EmbeddingFeatureCombiner_t combiner_type,
-    const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blobs_buff)
-    : LayerCPU(), batch_size_(batch_size), slot_num_(slot_num), combiner_type_(combiner_type) {
-  try {
-    // error input checking
-    const auto& in_dims = in_tensor->get_dimensions();
-    const auto& row_ptrs_dims = row_ptrs_tensor->get_dimensions();
-    if ((int)in_dims.size() != 2) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "The input tensor must be 2D");
-    }
-    for (auto i : in_dims) {
-      if (i == 0) {
-        HCTR_OWN_THROW(Error_t::WrongInput, "The input dims can not be 0");
-      }
-    }
-
-    if ((int)row_ptrs_dims.size() != 1) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "The row pointers tensor must be 1D");
-    }
-    if ((int)row_ptrs_dims[0] != batch_size * slot_num + 1) {
-      HCTR_OWN_THROW(Error_t::WrongInput,
-                     "The dimension of row pointers tensor mismatch number of samples");
-    }
-
-    embedding_vec_size_ = in_dims[1];
-    std::vector<size_t> out_dims{static_cast<size_t>(batch_size_), static_cast<size_t>(slot_num_),
-                                 static_cast<size_t>(embedding_vec_size_)};
-    blobs_buff->reserve(out_dims, &out_tensor);
-    out_tensors_.push_back(out_tensor);
-    in_tensors_.push_back(in_tensor);
-    row_ptrs_tensors_.push_back(row_ptrs_tensor);
-  } catch (const std::runtime_error& rt_err) {
-    HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl;
-    throw;
-  }
-}
-
-template <typename TypeEmbedding>
-void EmbeddingFeatureCombinerCPU<TypeEmbedding>::fprop(bool is_train) {
-  if (is_train) {
-    HCTR_OWN_THROW(Error_t::IllegalCall,
-                   "The fprop() of EmbeddingFeatureCombiner should only be used for inference");
-  }
-
-  float* input = in_tensors_[0]->get_ptr();
-  TypeEmbedding* output = out_tensors_[0].get_ptr();
-  int* row_ptrs = row_ptrs_tensors_[0]->get_ptr();
-
-  auto in_dims = in_tensors_[0]->get_dimensions();
-  auto out_dims = out_tensors_[0].get_dimensions();
-  embedding_feature_combine_cpu(input, output, row_ptrs, batch_size_, slot_num_,
-                                embedding_vec_size_, combiner_type_);
-}
-
-template class EmbeddingFeatureCombinerCPU<float>;
-template class EmbeddingFeatureCombinerCPU<__half>;
-
-}  // namespace HugeCTR
diff --git a/HugeCTR/src/cpu/inference_session_cpu.cpp b/HugeCTR/src/cpu/inference_session_cpu.cpp
deleted file mode 100644
index 23a5eb4d9f..0000000000
--- a/HugeCTR/src/cpu/inference_session_cpu.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
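// The embedding_feature_combine_cpu() overloads above reduce looked-up vectors
// per (sample, slot) cell using CSR row pointers. A standalone fp32 version of
// the same loop structure:
enum class Combiner { Sum, Mean };

void combine(const float* in, float* out, const int* row_ptrs, int batch_size,
             int slot_num, int vec_size, Combiner combiner) {
  for (int row = 0; row < batch_size * slot_num; ++row) {
    int begin = row_ptrs[row];
    int feature_num = row_ptrs[row + 1] - begin;  // vectors in this slot
    for (int k = 0; k < vec_size; ++k) {
      float acc = 0.0f;
      for (int l = 0; l < feature_num; ++l) {
        acc += in[(begin + l) * vec_size + k];
      }
      if (combiner == Combiner::Mean && feature_num > 0) acc /= feature_num;
      out[row * vec_size + k] = acc;
    }
  }
}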
- */ - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -template -InferenceSessionCPU::InferenceSessionCPU( - const std::string& model_config_path, const InferenceParams& inference_params, - const std::shared_ptr& parameter_server) - : config_(read_json_file(model_config_path)), - embedding_table_slot_size_({0}), - parameter_server_(parameter_server), - inference_parser_(config_), - inference_params_(inference_params) { - try { - cpu_resource_.reset(new CPUResource(0, {})); - NetworkCPU* network_ptr; - std::map tensor_active; - - // create pipeline and initialize network - create_pipeline_cpu(config_, tensor_active, inference_params_, dense_input_tensor_, - row_ptrs_tensors_, embedding_features_tensors_, embedding_table_slot_size_, - &embedding_feature_combiners_, &network_ptr, cpu_resource_); - network_ = std::move(std::unique_ptr(network_ptr)); - network_->initialize(); - if (inference_params_.dense_model_file.size() > 0) { - network_->load_params_from_model(inference_params_.dense_model_file); - } - - // allocate memory for embedding vector lookup - // h_keys_ is a void pointer, which serves key types of both long long and unsigned int - h_keys_ = malloc(inference_params_.max_batchsize * - inference_parser_.max_feature_num_per_sample * sizeof(long long)); - h_embedding_vectors_ = - (float*)malloc(inference_params_.max_batchsize * - inference_parser_.max_embedding_vector_size_per_sample * sizeof(float)); - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -template -InferenceSessionCPU::~InferenceSessionCPU() { - free(h_embedding_vectors_); - free(h_keys_); -} - -template -void InferenceSessionCPU::predict(float* h_dense, void* h_embeddingcolumns, - int* h_row_ptrs, float* h_output, int num_samples) { - size_t num_embedding_tables = inference_parser_.num_embedding_tables; - if (num_embedding_tables != row_ptrs_tensors_.size() || - num_embedding_tables != embedding_features_tensors_.size() || - num_embedding_tables != embedding_feature_combiners_.size()) { - HCTR_OWN_THROW(Error_t::IllegalCall, "embedding feature combiner inconsistent"); - } - - // Redistribute keys :from sample first to table first - if (inference_params_.i64_input_key) { - distribute_keys_per_table(static_cast(h_keys_), - static_cast(h_embeddingcolumns), h_row_ptrs, num_samples, - inference_parser_.slot_num_for_tables); - } else { - distribute_keys_per_table(static_cast(h_keys_), - static_cast(h_embeddingcolumns), h_row_ptrs, - num_samples, inference_parser_.slot_num_for_tables); - } - - // parameter server lookup - size_t acc_vectors_offset{0}; - size_t acc_row_ptrs_offset{0}; - size_t acc_keys_offset{0}; - size_t num_keys{0}; - for (size_t i = 0; i < num_embedding_tables; ++i) { - acc_row_ptrs_offset += num_samples * inference_parser_.slot_num_for_tables[i] + 1; - num_keys = h_row_ptrs[acc_row_ptrs_offset - 1]; - if (inference_params_.i64_input_key) { - parameter_server_->lookup(static_cast(h_keys_) + acc_keys_offset, num_keys, - h_embedding_vectors_ + acc_vectors_offset, - inference_params_.model_name, i); - } else { - parameter_server_->lookup(static_cast(h_keys_) + acc_keys_offset, - num_keys, h_embedding_vectors_ + acc_vectors_offset, - inference_params_.model_name, i); - } - acc_keys_offset += num_keys; - acc_vectors_offset += inference_params_.max_batchsize * - inference_parser_.max_feature_num_for_tables[i] * - inference_parser_.embed_vec_size_for_tables[i]; - } - - // copy dense input to dense tensor - auto 
dense_dims = dense_input_tensor_.get_dimensions(); - size_t dense_size = 1; - for (auto dim : dense_dims) { - dense_size *= dim; - } - size_t dense_size_in_bytes = dense_size * sizeof(float); - memcpy(dense_input_tensor_.get_ptr(), h_dense, dense_size_in_bytes); - - acc_vectors_offset = 0; - acc_row_ptrs_offset = 0; - for (size_t i = 0; i < num_embedding_tables; ++i) { - // bind row ptrs input to row ptrs tensor - auto row_ptrs_dims = row_ptrs_tensors_[i]->get_dimensions(); - std::shared_ptr row_ptrs_buff = - PreallocatedBuffer2::create(h_row_ptrs + acc_row_ptrs_offset, row_ptrs_dims); - bind_tensor_to_buffer(row_ptrs_dims, row_ptrs_buff, row_ptrs_tensors_[i]); - acc_row_ptrs_offset += num_samples * inference_parser_.slot_num_for_tables[i] + 1; - - // bind embedding vectors from looking up to embedding features tensor - auto embedding_features_dims = embedding_features_tensors_[i]->get_dimensions(); - std::shared_ptr embeddding_features_buff = PreallocatedBuffer2::create( - h_embedding_vectors_ + acc_vectors_offset, embedding_features_dims); - bind_tensor_to_buffer(embedding_features_dims, embeddding_features_buff, - embedding_features_tensors_[i]); - acc_vectors_offset += inference_params_.max_batchsize * - inference_parser_.max_feature_num_for_tables[i] * - inference_parser_.embed_vec_size_for_tables[i]; - // feature combiner feedforward - embedding_feature_combiners_[i]->fprop(false); - } - - // dense network feedforward - network_->predict(); - - // copy the prediction result to output - float* h_pred = network_->get_pred_tensor().get_ptr(); - memcpy(h_output, h_pred, network_->get_pred_tensor().get_num_elements() * sizeof(float)); -} - -template class InferenceSessionCPU; -template class InferenceSessionCPU; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/add_layer_cpu.cpp b/HugeCTR/src/cpu/layers/add_layer_cpu.cpp deleted file mode 100644 index 56cdc37841..0000000000 --- a/HugeCTR/src/cpu/layers/add_layer_cpu.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
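The predict() method just removed walks three parallel offsets (keys, row pointers, embedding vectors) across the per-table CSR blocks, which is easy to get wrong. A compressed sketch of just the row-pointer arithmetic, under the assumption that each table stores num_samples * slot_num + 1 entries back to back and its last entry equals its total key count; walk_tables is an illustrative helper, not part of the deleted code:

#include <cstddef>
#include <vector>

// Illustrative walk over the flat row-pointer buffer used by predict().
// h_row_ptrs holds one CSR block per embedding table, back to back.
void walk_tables(const int* h_row_ptrs, int num_samples,
                 const std::vector<int>& slot_num_for_tables,
                 std::vector<size_t>& keys_per_table) {
  size_t acc_row_ptrs_offset = 0;
  for (size_t i = 0; i < slot_num_for_tables.size(); ++i) {
    acc_row_ptrs_offset += num_samples * slot_num_for_tables[i] + 1;
    // The last entry of this table's block is the number of keys it owns.
    keys_per_table.push_back(h_row_ptrs[acc_row_ptrs_offset - 1]);
  }
}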
- */ - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace { - -template -void add_cpu(T** input, T* output, size_t size, size_t num) { - for (size_t i = 0; i < size; i++) { - float tmp = 0.f; - for (size_t j = 0; j < num; j++) { - tmp += input[j][i]; - } - output[i] = tmp; - } -} - -template <> -void add_cpu(__half** input, __half* output, size_t size, size_t num) { - for (size_t i = 0; i < size; i++) { - float tmp = 0.f; - for (size_t j = 0; j < num; j++) { - tmp += __half2float(input[j][i]); - } - output[i] = __float2half(tmp); - } -} - -template -void add_dgrad_cpu(const T* top_grad, T** dgrad, size_t size, size_t num) { - for (size_t i = 0; i < size; i++) { - for (size_t j = 0; j < num; j++) { - dgrad[j][i] = top_grad[i]; - } - } -} - -} // end of namespace - -template -AddLayerCPU::AddLayerCPU(const Tensors2& in_tensors, const Tensor2& out_tensor, - const std::shared_ptr>& blobs_buff) - : LayerCPU() { - try { - size_ = in_tensors[0].get_num_elements(); - num_ = in_tensors.size(); - - // error input checking - auto dims = in_tensors[0].get_dimensions(); - if (num_ < 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "AddLayer needs at least 2 input tensors"); - } - for (size_t i = 1; i < num_; i++) { - if (in_tensors[i].get_dimensions().size() != dims.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "All the input tensors must have the same num of dims"); - } - for (unsigned int j = 0; j < dims.size(); j++) { - if (in_tensors[i].get_dimensions()[j] != dims[j]) { - HCTR_OWN_THROW(Error_t::WrongInput, "All the input tensors must have the same dims"); - } - } - } - - for (size_t i = 0; i < num_; i++) { - in_tensors_.push_back(in_tensors[i]); - } - out_tensors_.push_back(out_tensor); - - blobs_buff->reserve({num_}, &h_inputs_); - - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -template -void AddLayerCPU::initialize() { - for (size_t i = 0; i < num_; i++) { - h_inputs_.get_ptr()[i] = in_tensors_[i].get_ptr(); - } -} - -template -void AddLayerCPU::fprop(bool is_train) { - T* output = out_tensors_[0].get_ptr(); - - add_cpu(h_inputs_.get_ptr(), output, size_, num_); -} - -template <> -void AddLayerCPU<__half>::fprop(bool is_train) { - __half* output = out_tensors_[0].get_ptr(); - - add_cpu(h_inputs_.get_ptr(), output, size_, num_); -} - -template -void AddLayerCPU::bprop() {} - -template <> -void AddLayerCPU<__half>::bprop() {} - -template class AddLayerCPU; -template class AddLayerCPU<__half>; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp b/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp deleted file mode 100644 index 85fe14dfbd..0000000000 --- a/HugeCTR/src/cpu/layers/batch_norm_layer_cpu.cpp +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
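Note that add_cpu above accumulates in float even in its __half specialization, which avoids drift when summing many fp16 inputs. A self-contained float-only check of the same pattern; add_ref is a hypothetical stand-in for the deleted helper:

#include <cassert>
#include <cstddef>

// N equal-sized input arrays are summed elementwise into output,
// accumulating in float exactly as the deleted add_cpu does.
void add_ref(const float* const* input, float* output, size_t size, size_t num) {
  for (size_t i = 0; i < size; i++) {
    float tmp = 0.f;
    for (size_t j = 0; j < num; j++) tmp += input[j][i];
    output[i] = tmp;
  }
}

int main() {
  float a[3] = {1.f, 2.f, 3.f}, b[3] = {4.f, 5.f, 6.f}, out[3];
  const float* ins[2] = {a, b};
  add_ref(ins, out, 3, 2);
  assert(out[0] == 5.f && out[1] == 7.f && out[2] == 9.f);
  return 0;
}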
- */ - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace { - -constexpr float eps = 1e-4; // Epsilon for CPU computation - -template -void batch_norm_fprop_cpu(const float* gamma, const float* beta, const T* in, T* out, - int batch_size, int num_feature) { - for (int j = 0; j < num_feature; j++) { - float mean = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - mean += in[idx]; - } - mean /= batch_size; - - float var = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - float diff = in[idx] - mean; - var += (diff * diff); - } - var /= batch_size; - - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - float in_norm = (in[idx] - mean) / sqrt(var + eps); - out[idx] = gamma[j] * in_norm + beta[j]; - } - } -} - -template <> -void batch_norm_fprop_cpu<__half>(const float* gamma, const float* beta, const __half* in, - __half* out, int batch_size, int num_feature) { - for (int j = 0; j < num_feature; j++) { - float mean = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - mean += __half2float(in[idx]); - } - mean /= batch_size; - - float var = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - float diff = __half2float(in[idx]) - mean; - var += (diff * diff); - } - var /= batch_size; - - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - float in_norm = (__half2float(in[idx]) - mean) / sqrt(var + eps); - out[idx] = __float2half(gamma[j] * in_norm + beta[j]); - } - } -} - -template -void batch_norm_bprop_cpu(const float* gamma, const T* out, T* in, int batch_size, - int num_feature) { - for (int j = 0; j < num_feature; j++) { - float mean = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - mean += in[idx]; - } - mean /= batch_size; - - float var = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - float diff = in[idx] - mean; - var += (diff * diff); - } - var /= batch_size; - - float inv_std = 1.0f / sqrt(var + eps); - - float d_var = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - float val = (out[idx] * gamma[j]) * (in[idx] - mean); - d_var += val; - } - d_var *= (-0.5f) * pow(inv_std, 3); - - float val1 = 0.0f; - float val2 = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - val1 += (out[idx] * gamma[j]); - val2 += (in[idx] - mean); - } - val1 *= (-inv_std); - val2 *= (d_var / batch_size) * -2; - float d_mean = (val1 + val2); - - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - in[idx] = (out[idx] * gamma[j]) * inv_std + d_var * (2.0 / batch_size) * (in[idx] - mean) + - d_mean / batch_size; - } - } -} - -template <> -void batch_norm_bprop_cpu<__half>(const float* gamma, const __half* out, __half* in, int batch_size, - int num_feature) { - for (int j = 0; j < num_feature; j++) { - float mean = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - mean += __half2float(in[idx]); - } - mean /= batch_size; - - float var = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - float diff = __half2float(in[idx]) - mean; - var += (diff * diff); - } - var /= batch_size; - - float inv_std = 1.0f / sqrt(var + eps); - - float d_var = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - float val = (__half2float(out[idx]) * gamma[j]) * (__half2float(in[idx]) - mean); - 
d_var += val; - } - d_var *= (-0.5f) * pow(inv_std, 3); - - float val1 = 0.0f; - float val2 = 0.0f; - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - val1 += (__half2float(out[idx]) * gamma[j]); - val2 += (__half2float(in[idx]) - mean); - } - val1 *= (-inv_std); - val2 *= (d_var / batch_size) * -2; - float d_mean = (val1 + val2); - - for (int i = 0; i < batch_size; i++) { - int idx = i * num_feature + j; - in[idx] = __float2half((__half2float(out[idx]) * gamma[j]) * inv_std + - d_var * (2.0 / batch_size) * (__half2float(in[idx]) - mean) + - d_mean / batch_size); - } - } -} - -} // end namespace - -template -BatchNormLayerCPU::BatchNormLayerCPU( - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, const Tensor2& in_tensor, - const Tensor2& out_tensor, const Params& params) - : LayerCPU(), params_(params) { - const auto& in_tensor_dim = in_tensor.get_dimensions(); - - size_t num_feature = in_tensor_dim[1]; - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - - std::vector gamma_dim = {num_feature, 1}; - - // gamma & beta - weight_buff->reserve(gamma_dim, &gamma_); - weight_buff->reserve(gamma_dim, &beta_); - weights_.push_back(gamma_); - weights_.push_back(beta_); - - // gamma grad & beta grad - wgrad_buff->reserve(gamma_dim, &gamma_grad_); - wgrad_buff->reserve(gamma_dim, &beta_grad_); - wgrad_.push_back(gamma_grad_); - wgrad_.push_back(beta_grad_); -} - -template -BatchNormLayerCPU::~BatchNormLayerCPU() {} - -template -void BatchNormLayerCPU::initialize() {} - -template -void BatchNormLayerCPU::fprop(bool is_train) { - int batch_size = in_tensors_[0].get_dimensions()[0]; - int num_feature = in_tensors_[0].get_dimensions()[1]; - - Tensor2& in_tensor = in_tensors_[0]; - Tensor2& out_tensor = out_tensors_[0]; - T* in = in_tensor.get_ptr(); - T* out = out_tensor.get_ptr(); - - float* gamma = gamma_.get_ptr(); - float* beta = beta_.get_ptr(); - - batch_norm_fprop_cpu(gamma, beta, in, out, batch_size, num_feature); -} - -template -void BatchNormLayerCPU::bprop() {} - -template class BatchNormLayerCPU; -template class BatchNormLayerCPU<__half>; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/cast_layer_cpu.cpp b/HugeCTR/src/cpu/layers/cast_layer_cpu.cpp deleted file mode 100644 index 98c7513908..0000000000 --- a/HugeCTR/src/cpu/layers/cast_layer_cpu.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
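For the batch-norm reference just removed: per feature j it uses the biased variance (divide by batch_size, not batch_size - 1) and normalizes as y = gamma[j] * (x - mean) / sqrt(var + eps) + beta[j] with eps = 1e-4. A two-sample sanity check of that formula with gamma = 1 and beta = 0:

#include <cassert>
#include <cmath>

int main() {
  // One feature, batch of two: x = {1, 3} -> mean 2, biased var 1.
  const float eps = 1e-4f;
  float x[2] = {1.f, 3.f};
  float mean = (x[0] + x[1]) / 2.f;  // 2
  float var = ((x[0] - mean) * (x[0] - mean) +
               (x[1] - mean) * (x[1] - mean)) / 2.f;  // 1 (biased, as in the code)
  float y0 = (x[0] - mean) / std::sqrt(var + eps);    // ~ -1
  float y1 = (x[1] - mean) / std::sqrt(var + eps);    // ~ +1
  assert(std::fabs(y0 + 1.f) < 1e-3f && std::fabs(y1 - 1.f) < 1e-3f);
  return 0;
}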
- */
-
-#include
-#include
-
-namespace HugeCTR {
-
-namespace {
-
-void cast_cpu(__half* top, const float* bottom, int len) {
-  for (int i = 0; i < len; ++i) {
-    top[i] = __float2half(bottom[i]);
-  }
-}
-
-void cast_cpu(float* top, const __half* bottom, int len) {
-  for (int i = 0; i < len; ++i) {
-    top[i] = __half2float(bottom[i]);
-  }
-}
-
-}  // namespace
-
-template <typename From, typename To>
-CastLayerCPU<From, To>::CastLayerCPU(const Tensor2<From>& bottom_tensor,
-                                     const Tensor2<To>& top_tensor)
-    : LayerCPU() {
-  assert(bottom_tensor.get_num_elements() == top_tensor.get_num_elements());
-
-  bottom_tensor_ = bottom_tensor;
-  top_tensor_ = top_tensor;
-}
-
-template <typename From, typename To>
-void CastLayerCPU<From, To>::fprop(bool is_train) {
-  const From* bottom = bottom_tensor_.get_ptr();
-  To* top = top_tensor_.get_ptr();
-  int len = bottom_tensor_.get_num_elements();
-  cast_cpu(top, bottom, len);
-}
-
-template <typename From, typename To>
-void CastLayerCPU<From, To>::bprop() {}
-
-template class CastLayerCPU<float, __half>;
-template class CastLayerCPU<__half, float>;
-
-}  // namespace HugeCTR
diff --git a/HugeCTR/src/cpu/layers/concat_layer_cpu.cpp b/HugeCTR/src/cpu/layers/concat_layer_cpu.cpp
deleted file mode 100644
index 718a3a066d..0000000000
--- a/HugeCTR/src/cpu/layers/concat_layer_cpu.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
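The cast layer above is a pure elementwise fp32/fp16 conversion in both directions. A host-side illustration of the round trip; this assumes cuda_fp16.h's conversion intrinsics are host-callable, which holds for recent CUDA toolkits when compiling with nvcc:

#include <cuda_fp16.h>
#include <cstdio>

int main() {
  float x = 0.1f;
  __half h = __float2half(x);  // float -> half, as in CastLayerCPU<float, __half>
  float y = __half2float(h);   // half -> float, as in CastLayerCPU<__half, float>
  // 0.1 is not exactly representable in fp16, so y differs from x slightly.
  std::printf("%.8f -> %.8f\n", x, y);
  return 0;
}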
- */ - -#include -#include -#include - -namespace HugeCTR { - -namespace { - -template -void concat_cpu(T** input, T* output, size_t height, size_t new_width, int n_ins, - const std::vector& widths) { - for (size_t r = 0; r < height; r++) { - for (size_t c = 0; c < new_width; c++) { - int out_idx = r * new_width + c; - int in_no = 0; - int c2 = c; - size_t accum_width = 0; - for (int k = 0; k < n_ins; k++) { - if (c < accum_width + widths[k]) { - in_no = k; - c2 -= accum_width; - break; - } - accum_width += widths[k]; - } - int in_idx = r * widths[in_no] + c2; - output[out_idx] = input[in_no][in_idx]; - } - } -} - -} // anonymous namespace - -template -ConcatLayerCPU::ConcatLayerCPU(const Tensors2& in_tensors, Tensor2& out_tensor, - const std::shared_ptr>& blobs_buff) - : LayerCPU() { - try { - if (in_tensors.empty()) { - HCTR_OWN_THROW(Error_t::WrongInput, "Empty input tensors"); - } - - int n_in_tensors = in_tensors.size(); - size_t height = 0; - size_t new_width = 0; - for (int i = 0; i < n_in_tensors; i++) { - auto cur_in_dims = in_tensors[i].get_dimensions(); - if (i != 0) { - auto first_in_dims = in_tensors[0].get_dimensions(); - if (cur_in_dims[0] != first_in_dims[0]) { - HCTR_OWN_THROW(Error_t::WrongInput, "All the input tensors must have the same height"); - } - } - if (cur_in_dims.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "Only 2D tensors can be concatenated"); - } - if (i == 0) { - height = cur_in_dims[0]; - } - new_width += cur_in_dims[1]; - } - - std::vector out_dims = {height, new_width}; - blobs_buff->reserve(out_dims, &out_tensor); - - for (const Tensor2& in_tensor : in_tensors) { - in_tensors_.push_back(in_tensor); - } - out_tensor_ = out_tensor; - - blobs_buff->reserve({in_tensors.size()}, &h_inputs_); - - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -template -void ConcatLayerCPU::fprop(bool is_train) { - size_t height = out_tensor_.get_dimensions()[0]; - int n_ins = in_tensors_.size(); - std::vector widths; - size_t new_width = 0; - for (const Tensor2& in_tensor : in_tensors_) { - widths.push_back(in_tensor.get_dimensions()[1]); - new_width += in_tensor.get_dimensions()[1]; - } - // fprop - T* output = out_tensor_.get_ptr(); - for (size_t i = 0; i < in_tensors_.size(); i++) { - h_inputs_.get_ptr()[i] = in_tensors_[i].get_ptr(); - } - concat_cpu(h_inputs_.get_ptr(), output, height, new_width, n_ins, widths); -} - -template -void ConcatLayerCPU::bprop() {} - -template class ConcatLayerCPU; -template class ConcatLayerCPU<__half>; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/dropout_layer_cpu.cpp b/HugeCTR/src/cpu/layers/dropout_layer_cpu.cpp deleted file mode 100644 index 207c1811dc..0000000000 --- a/HugeCTR/src/cpu/layers/dropout_layer_cpu.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
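concat_cpu above resolves every output column by scanning the accumulated input widths. The same mapping in isolation, with a worked two-input case; locate is an illustrative helper, not the deleted API:

#include <cassert>
#include <cstddef>
#include <vector>

// For a global output column c, find which input it falls in (in_no)
// and the column inside that input (c_local), given the input widths.
void locate(size_t c, const std::vector<size_t>& widths, int& in_no, size_t& c_local) {
  size_t accum = 0;
  for (size_t k = 0; k < widths.size(); k++) {
    if (c < accum + widths[k]) {
      in_no = static_cast<int>(k);
      c_local = c - accum;
      return;
    }
    accum += widths[k];
  }
  in_no = -1;  // unreachable for a valid column index
  c_local = 0;
}

int main() {
  std::vector<size_t> widths = {3, 2};  // concat of a 3-wide and a 2-wide tensor
  int in_no;
  size_t c_local;
  locate(4, widths, in_no, c_local);  // global column 4 -> input 1, local column 1
  assert(in_no == 1 && c_local == 1);
  return 0;
}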
- */ - -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace { - -template -void dropout_cpu(T* input, T* output, float* mask, float rate, float scale, size_t num, - bool is_train) { - for (size_t i = 0; i < num; i++) { - output[i] = is_train ? ((1.f - mask[i]) >= rate) * input[i] * scale : input[i]; - } -} - -template <> -void dropout_cpu(__half* input, __half* output, float* mask, float rate, float scale, size_t num, - bool is_train) { - for (size_t i = 0; i < num; i++) { - output[i] = is_train ? __float2half(((1.f - mask[i]) >= rate) * __half2float(input[i]) * scale) - : input[i]; - } -} - -} // end namespace - -template -DropoutLayerCPU::DropoutLayerCPU(const Tensor2& in_tensor, const Tensor2& out_tensor, - const std::shared_ptr> blobs_buff, - float rate) - : LayerCPU(), rate_(rate), scale_(1.0 / (1.0 - rate + 1e-6)) { - assert(in_tensor.get_num_elements() == out_tensor.get_num_elements()); - assert(rate_ > 0.f && rate_ < 1.f); - - in_tensors_.emplace_back(in_tensor); - out_tensors_.emplace_back(out_tensor); - - blobs_buff->reserve(in_tensor.get_dimensions(), &mask_); -} - -template -void DropoutLayerCPU::fprop(bool is_train) { - FloatUniformDataSimulator ldata_sim(0.f, 1.f); - size_t num = 1; - for (auto dim : in_tensors_[0].get_dimensions()) { - num *= dim; - } - float* h_mask = mask_.get_ptr(); - for (size_t i = 0; i < num; i++) { - h_mask[i] = ldata_sim.get_num(); - } - T* input = in_tensors_[0].get_ptr(); - T* output = out_tensors_[0].get_ptr(); - dropout_cpu(input, output, h_mask, rate_, scale_, num, is_train); -} - -template -void DropoutLayerCPU::bprop() {} - -template class DropoutLayerCPU; -template class DropoutLayerCPU<__half>; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/elementwise_multiply_layer_cpu.cpp b/HugeCTR/src/cpu/layers/elementwise_multiply_layer_cpu.cpp deleted file mode 100644 index a7b3a58f32..0000000000 --- a/HugeCTR/src/cpu/layers/elementwise_multiply_layer_cpu.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
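The dropout layer just removed implements inverted dropout: a unit survives when (1 - mask[i]) >= rate, and survivors are scaled by 1/(1 - rate) so the expected activation is unchanged and inference needs no rescaling (the 1e-6 in the deleted constructor only guards the division). A stripped-down float sketch that folds the mask draw into the loop instead of materializing it:

#include <cstddef>
#include <random>

// Inverted dropout: zero a unit with probability `rate`, scale survivors
// by 1/(1 - rate) so E[output] == E[input] during training.
void dropout_ref(const float* in, float* out, size_t n, float rate, bool is_train,
                 std::mt19937& rng) {
  std::uniform_real_distribution<float> uni(0.f, 1.f);
  const float scale = 1.f / (1.f - rate);
  for (size_t i = 0; i < n; i++) {
    if (!is_train) {
      out[i] = in[i];
      continue;
    }
    out[i] = (uni(rng) >= rate) ? in[i] * scale : 0.f;
  }
}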
- */ - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace { - -template -void elementwise_multiply_cpu(T** input, T* output, size_t size, size_t num) { - T one = 1.0; - - for (size_t i = 0; i < size; i++) { - T tmp = one; - for (size_t j = 0; j < num; j++) { - tmp = tmp * input[j][i]; - } - output[i] = tmp; - } -} - -template -void elementwise_multiply_dgrad_cpu(const T* top_grad, T** dgrad, const T* fprop_output, - size_t size, size_t num) { - T zero = 0.0; - - for (size_t i = 0; i < size; i++) { - for (size_t j = 0; j < num; j++) { - if (0 == fprop_output[i]) { - dgrad[j][i] = zero; - } else { - T d_input = dgrad[j][i]; - dgrad[j][i] = top_grad[i] * T(fprop_output[i] / d_input); - } - } - } -} - -} // end of namespace - -template -ElementwiseMultiplyLayerCPU::ElementwiseMultiplyLayerCPU( - const Tensors2& in_tensors, const Tensor2& out_tensor, - const std::shared_ptr>& blobs_buff) - : LayerCPU() { - try { - size_ = in_tensors[0].get_num_elements(); - num_ = in_tensors.size(); - - // error input checking - auto dims = in_tensors[0].get_dimensions(); - if (num_ < 2) { - HCTR_OWN_THROW(Error_t::WrongInput, - "ElementwiseMultiplyLayer needs at least 2 input tensors"); - } - for (size_t i = 1; i < num_; i++) { - if (in_tensors[i].get_dimensions().size() != dims.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "All the input tensors must have the same num of dims"); - } - for (unsigned int j = 0; j < dims.size(); j++) { - if (in_tensors[i].get_dimensions()[j] != dims[j]) { - HCTR_OWN_THROW(Error_t::WrongInput, "All the input tensors must have the same dims"); - } - } - } - - for (size_t i = 0; i < num_; i++) { - in_tensors_.push_back(in_tensors[i]); - } - out_tensors_.push_back(out_tensor); - - blobs_buff->reserve({num_}, &h_inputs_); - blobs_buff->reserve(out_tensor.get_dimensions(), &fprop_output_); - - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -template -void ElementwiseMultiplyLayerCPU::initialize() { - for (size_t i = 0; i < num_; i++) { - h_inputs_.get_ptr()[i] = in_tensors_[i].get_ptr(); - } -} - -template -void ElementwiseMultiplyLayerCPU::fprop(bool is_train) { - if (!initialized_) { - for (size_t i = 0; i < num_; i++) { - h_inputs_.get_ptr()[i] = in_tensors_[i].get_ptr(); - } - initialized_ = true; - } - T* output = out_tensors_[0].get_ptr(); - elementwise_multiply_cpu(h_inputs_.get_ptr(), output, size_, num_); -} - -template -void ElementwiseMultiplyLayerCPU::bprop() {} - -template class ElementwiseMultiplyLayerCPU; -template class ElementwiseMultiplyLayerCPU<__half>; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp b/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp deleted file mode 100644 index 891de22d61..0000000000 --- a/HugeCTR/src/cpu/layers/elu_layer_cpu.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
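The fprop_output_ buffer reserved above is what makes the backward trick work: since output[i] is the product of all inputs at position i, the gradient with respect to input j is top_grad[i] * output[i] / input[j][i], with a guard when the product is zero. The same computation as a plain function; as in the deleted in-place code, the gradient buffers are assumed to alias the fprop inputs:

#include <cstddef>

// d/d input[j][i] of prod_k input[k][i] is the product of the other inputs,
// i.e. fprop_output[i] / input[j][i] when no factor is zero.
void ew_mult_dgrad_ref(const float* top_grad, float* const* input_and_dgrad,
                       const float* fprop_output, size_t size, size_t num) {
  for (size_t i = 0; i < size; i++) {
    for (size_t j = 0; j < num; j++) {
      float in = input_and_dgrad[j][i];  // buffer still holds the fprop input
      input_and_dgrad[j][i] =
          (fprop_output[i] == 0.f) ? 0.f : top_grad[i] * (fprop_output[i] / in);
    }
  }
}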
- */ - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace { - -template -void elu_cpu(const T* in, T* out, int len, T alpha) { - for (int i = 0; i < len; ++i) { - out[i] = - (__half2float(in[i]) < 0) ? T(__half2float(alpha) * (exp(__half2float(in[i])) - 1)) : in[i]; - } -} - -template -void elu_bprop_cpu(const T* d_out, T* d_in, int len, T alpha) { - for (int i = 0; i < len; ++i) { - d_in[i] = (d_in[i] < 0) ? T(alpha * exp(d_in[i]) * d_out[i]) : d_out[i]; - } -} - -} // end namespace - -template -EluLayerCPU::EluLayerCPU(const Tensor2& in_tensor, const Tensor2& out_tensor, T alpha) - : LayerCPU(), alpha_(alpha) { - assert(in_tensor.get_num_elements() == out_tensor.get_num_elements()); - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); -} - -template -void EluLayerCPU::fprop(bool is_train) { - const Tensor2& in_tensor = in_tensors_[0]; - Tensor2& out_tensor = out_tensors_[0]; - - const int len = in_tensor.get_num_elements(); - - T alpha = alpha_; - - elu_cpu(in_tensor.get_ptr(), out_tensor.get_ptr(), len, alpha); -} - -template -void EluLayerCPU::bprop() {} - -template class EluLayerCPU; -template class EluLayerCPU<__half>; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp b/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp deleted file mode 100644 index 0293af5f0e..0000000000 --- a/HugeCTR/src/cpu/layers/fm_order2_layer_cpu.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
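ELU is f(x) = x for x > 0 and alpha * (exp(x) - 1) otherwise; routing everything through __half2float lets one template body above serve both precisions. A plain float version with point checks at the three interesting regimes:

#include <cassert>
#include <cmath>

float elu(float x, float alpha) {
  return (x < 0.f) ? alpha * (std::exp(x) - 1.f) : x;
}

int main() {
  assert(elu(2.f, 1.f) == 2.f);                      // identity on the positive side
  assert(std::fabs(elu(0.f, 1.f)) < 1e-7f);          // continuous at 0
  assert(std::fabs(elu(-1e9f, 1.f) + 1.f) < 1e-6f);  // saturates at -alpha
  return 0;
}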
- */ - -#include -#include - -namespace HugeCTR { - -namespace { - -inline float trunc_half(float a) { return __half2float(__float2half(a)); } - -void fm_order2_fprop_cpu(const float* in, float* out, int batch_size, int slot_num, - int emb_vec_size) { - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < emb_vec_size; j++) { - float sum = 0.0f; - float square_sum = 0.0f; - int offset = i * slot_num * emb_vec_size + j; - for (int k = 0; k < slot_num; k++) { - int index = offset + k * emb_vec_size; - float input = in[index]; - sum += input; - square_sum += input * input; - } - float sum_square = sum * sum; - out[i * emb_vec_size + j] = 0.5f * (sum_square - square_sum); - } - } -} - -void fm_order2_fprop_cpu(const __half* in, __half* out, int batch_size, int slot_num, - int emb_vec_size) { - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < emb_vec_size; j++) { - float sum = 0.0f; - float square_sum = 0.0f; - int offset = i * slot_num * emb_vec_size + j; - for (int k = 0; k < slot_num; k++) { - int index = offset + k * emb_vec_size; - float input = __half2float(in[index]); - sum = trunc_half(sum + input); - square_sum = trunc_half(square_sum + input * input); - } - float sum_square = trunc_half(sum * sum); - out[i * emb_vec_size + j] = __float2half(0.5f * (sum_square - square_sum)); - } - } -} - -void fm_order2_bprop_cpu(const float* in, const float* top_grad, float* dgrad, int batch_size, - int slot_num, int emb_vec_size) { - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < emb_vec_size; j++) { - float sum = 0.0f; - int offset = i * slot_num * emb_vec_size + j; - for (int k = 0; k < slot_num; k++) { - int index = offset + k * emb_vec_size; - sum += in[index]; - } - for (int k = 0; k < slot_num; k++) { - int index = offset + k * emb_vec_size; - dgrad[index] = top_grad[i * emb_vec_size + j] * (sum - in[index]); - } - } - } -} - -void fm_order2_bprop_cpu(const __half* in, const __half* top_grad, __half* dgrad, int batch_size, - int slot_num, int emb_vec_size) { - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < emb_vec_size; j++) { - float sum = 0.0f; - int offset = i * slot_num * emb_vec_size + j; - for (int k = 0; k < slot_num; k++) { - int index = offset + k * emb_vec_size; - sum = trunc_half(sum + __half2float(in[index])); - } - for (int k = 0; k < slot_num; k++) { - int index = offset + k * emb_vec_size; - dgrad[index] = __float2half(__half2float(top_grad[i * emb_vec_size + j]) * - (sum - __half2float(in[index]))); - } - } - } -} - -} // end of namespace - -template -FmOrder2LayerCPU::FmOrder2LayerCPU(const Tensor2& in_tensor, const Tensor2& out_tensor) - : LayerCPU() { - try { - const auto& in_dims = in_tensor.get_dimensions(); - if (in_dims.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "only 2D tensors can be used as input for FmOrder2Layer"); - } - const auto& out_dims = out_tensor.get_dimensions(); - if (out_dims.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, - "only 2D tensors can be used as output for FmOrder2Layer"); - } - if ((in_dims[1] % out_dims[1]) != 0) { - HCTR_OWN_THROW(Error_t::WrongInput, "(in_dims[1] % out_dims[1]) != 0"); - } - - batch_size_ = in_dims[0]; - slot_num_ = in_dims[1] / out_dims[1]; - embedding_vec_size_ = out_dims[1]; - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -template -void FmOrder2LayerCPU::fprop(bool is_train) { - const T* in = 
in_tensors_[0].get_ptr(); - T* out = out_tensors_[0].get_ptr(); - fm_order2_fprop_cpu(in, out, batch_size_, slot_num_, embedding_vec_size_); -} - -template -void FmOrder2LayerCPU::bprop() { - T* in = in_tensors_[0].get_ptr(); - const T* out = out_tensors_[0].get_ptr(); - fm_order2_bprop_cpu(in, out, in, batch_size_, slot_num_, embedding_vec_size_); -} - -template class FmOrder2LayerCPU; -template class FmOrder2LayerCPU<__half>; - -} // end of namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/fully_connected_layer_cpu.cpp b/HugeCTR/src/cpu/layers/fully_connected_layer_cpu.cpp deleted file mode 100644 index 0f494339fa..0000000000 --- a/HugeCTR/src/cpu/layers/fully_connected_layer_cpu.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -namespace HugeCTR { - -namespace { - -void cpu_mm(float* a, float* b, float* c, int m, int k, int n) { - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - c[i * n + j] = 0.0f; - for (int kk = 0; kk < k; ++kk) c[i * n + j] += a[i * k + kk] * b[kk * n + j]; - } - } -} - -void cpu_add_bias(float* out, float* bias, int m, int n) { - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - out[i * n + j] += bias[j]; - } - } -} - -void transpose(float* a, int m, int n) { - std::unique_ptr tmp(new float[m * n]); - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) tmp[j * m + i] = a[i * n + j]; - for (int i = 0; i < m * n; ++i) a[i] = tmp[i]; -} - -} // end namespace - -FullyConnectedLayerCPU::FullyConnectedLayerCPU( - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, const Tensor2& in_tensor, - const Tensor2& out_tensor, bool use_mixed_precision) - : LayerCPU(), use_mixed_precision_(use_mixed_precision) { - try { - // check the in_tensor and out_tensor - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - // 1. two dim? - if (in_tensor_dim.size() != 2 || out_tensor_dim.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor doesn't has two dimensions"); - } - // 2. dim match? - size_t m = in_tensor_dim[0]; - size_t n = out_tensor_dim[1]; - size_t k = in_tensor_dim[1]; - size_t m_ck = out_tensor_dim[0]; - if (m != m_ck) { - HCTR_OWN_THROW(Error_t::WrongInput, "size of input / output tensor doesn't match"); - } - - std::vector weight_dim = {k, n}; - std::vector bias_dim = {1, n}; - - { - Tensor2 tensor; - weight_buff->reserve(weight_dim, &tensor); - weights_.push_back(tensor); - } - { - Tensor2 tensor; - weight_buff->reserve(bias_dim, &tensor); - weights_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(weight_dim, &tensor); - wgrad_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(bias_dim, &tensor); - wgrad_.push_back(tensor); - } - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - // Where should we create this cuBLAS handle? 
- } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -void FullyConnectedLayerCPU::fprop(bool is_train) { - Tensor2& in_tensor = get_in_tensors(is_train)[0]; - Tensor2& out_tensor = out_tensors_[0]; - - float* weight = weights_[0].get_ptr(); - float* bias = weights_[1].get_ptr(); - float* in = in_tensor.get_ptr(); - float* out = out_tensor.get_ptr(); - - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - int m, n, k; - - m = in_tensor_dim[0]; - n = out_tensor_dim[1]; - k = in_tensor_dim[1]; - - cpu_mm(in, weight, out, m, k, n); - cpu_add_bias(out, bias, m, n); -} - -void FullyConnectedLayerCPU::bprop() {} - -template class FullyConnectedLayerCPU; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp b/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp deleted file mode 100644 index 1b2566327b..0000000000 --- a/HugeCTR/src/cpu/layers/fully_connected_layer_half_cpu.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -namespace HugeCTR { - -namespace { - -void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool transpose_b, int m, - int k, int n) { - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - float sum = 0.0f; - for (int kk = 0; kk < k; ++kk) { - int ai = transpose_a ? kk * m + i : i * k + kk; - int bi = transpose_b ? 
j * k + kk : kk * n + j; - sum += __half2float(a[ai] * b[bi]); - } - c[i * n + j] = sum; - } - } -} - -void cpu_add_bias(__half* top, const __half* bias, int m, int n) { - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - top[i * n + j] = top[i * n + j] + bias[j]; - } - } -} - -void cpu_reverse_add_bias(__half* bias_grad, const __half* top, int m, int n) { - for (int i = 0; i < n; ++i) { - float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += __half2float(top[j * n + i]); - bias_grad[i] = sum; - } -} - -} // end namespace - -FullyConnectedLayerCPU<__half>::FullyConnectedLayerCPU( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor) - : LayerCPU() { - const auto& bottom_tensor_dim = bottom_tensor.get_dimensions(); - const auto& top_tensor_dim = top_tensor.get_dimensions(); - - if (bottom_tensor_dim.size() != 2 || top_tensor_dim.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor doesn't has two dimensions"); - } - - size_t m = bottom_tensor_dim[0]; - size_t n = top_tensor_dim[1]; - size_t k = bottom_tensor_dim[1]; - - std::vector kernel_dim = {k, n}; - std::vector bias_dim = {1, n}; - std::vector identity_dim = {1, m}; - - { - Tensor2 tensor; - master_weights_buff->reserve(kernel_dim, &tensor); - weights_.push_back(tensor); - } - { - Tensor2 tensor; - master_weights_buff->reserve(bias_dim, &tensor); - weights_.push_back(tensor); - } - { - Tensor2<__half> tensor; - weights_buff->reserve(kernel_dim, &tensor); - weights_half_.push_back(tensor); - } - { - Tensor2<__half> tensor; - weights_buff->reserve(bias_dim, &tensor); - weights_half_.push_back(tensor); - } - { - Tensor2<__half> tensor; - weights_grad_buff->reserve(kernel_dim, &tensor); - weights_grad_.push_back(tensor); - } - { - Tensor2<__half> tensor; - weights_grad_buff->reserve(bias_dim, &tensor); - weights_grad_.push_back(tensor); - } - blobs_buff->reserve(identity_dim, &identity_tensor_); - - bottom_tensor_ = bottom_tensor; - top_tensor_ = top_tensor; -} - -void FullyConnectedLayerCPU<__half>::fprop(bool is_train) { - const __half* kernel = weights_half_[0].get_ptr(); - const __half* bias = weights_half_[1].get_ptr(); - const __half* bottom = get_bottom_tensor(is_train).get_ptr(); - __half* top = top_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor(is_train).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t m = bottom_tensor_dim[0]; - size_t n = top_tensor_dim[1]; - size_t k = bottom_tensor_dim[1]; - - cpu_mm(top, bottom, false, kernel, false, m, k, n); - cpu_add_bias(top, bias, m, n); -} - -void FullyConnectedLayerCPU<__half>::bprop() {} - -template class FullyConnectedLayerCPU<__half>; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp b/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp deleted file mode 100644 index d2da98d944..0000000000 --- a/HugeCTR/src/cpu/layers/fused_fully_connected_layer_cpu.cpp +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -namespace HugeCTR { - -namespace { - -void cpu_mm(__half* c, const __half* a, bool transpose_a, const __half* b, bool transpose_b, int m, - int k, int n) { - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - float sum = 0.0f; - for (int kk = 0; kk < k; ++kk) { - int ai = transpose_a ? kk * m + i : i * k + kk; - int bi = transpose_b ? j * k + kk : kk * n + j; - sum += __half2float(a[ai] * b[bi]); - } - c[i * n + j] = sum; - } - } -} - -void cpu_add_bias_and_re(__half* top, __half* middle, const __half* bias, int m, int n) { - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - __half t = top[i * n + j] + bias[j]; - middle[i * n + j] = t; - top[i * n + j] = __half2float(t) < 0 ? __float2half(0.0f) : t; - } - } -} - -void cpu_reverse_add_bias_and_re(__half* bias_grad, __half* middle, const __half* top, int m, - int n) { - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) { - if (__half2float(middle[i * n + j]) < 0) { - middle[i * n + j] = 0.0f; - } else { - middle[i * n + j] = top[i * n + j]; - } - } - - for (int i = 0; i < n; ++i) { - float sum = 0.0f; - for (int j = 0; j < m; ++j) sum += __half2float(middle[j * n + i]); - bias_grad[i] = sum; - } -} - -} // namespace - -FusedFullyConnectedLayerCPU::FusedFullyConnectedLayerCPU( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor) - : LayerCPU() { - const auto& bottom_tensor_dim = bottom_tensor.get_dimensions(); - const auto& top_tensor_dim = top_tensor.get_dimensions(); - - if (bottom_tensor_dim.size() != 2 || top_tensor_dim.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor doesn't has two dimensions"); - } - - size_t m = bottom_tensor_dim[0]; - size_t n = top_tensor_dim[1]; - size_t k = bottom_tensor_dim[1]; - - if (m % 32 != 0 || n % 64 != 0) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "The first dimension of bottom tensor must be a multiple of 32, the second dimension " - "of top tensor must be a multiple of 64."); - } - - std::vector kernel_dim = {k, n}; - std::vector bias_dim = {1, n}; - - { - Tensor2 tensor; - master_weights_buff->reserve(kernel_dim, &tensor); - weights_.push_back(tensor); - } - { - Tensor2 tensor; - master_weights_buff->reserve(bias_dim, &tensor); - weights_.push_back(tensor); - } - { - Tensor2<__half> tensor; - weights_buff->reserve(kernel_dim, &tensor); - weights_half_.push_back(tensor); - } - { - Tensor2<__half> tensor; - weights_buff->reserve(bias_dim, &tensor); - weights_half_.push_back(tensor); - } - { - Tensor2<__half> tensor; - weights_grad_buff->reserve(kernel_dim, &tensor); - weights_grad_.push_back(tensor); - } - { - Tensor2<__half> tensor; - weights_grad_buff->reserve(bias_dim, &tensor); - weights_grad_.push_back(tensor); - } - - bottom_tensor_ = bottom_tensor; - top_tensor_ = top_tensor; - blobs_buff->reserve(top_tensor_.get_dimensions(), &middle_tensor_); - blobs_buff->reserve(bias_dim, &bias_grad_tensor_); -} 
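The fused layer deleted here combines GEMM, bias add, and ReLU in one pass, keeping the pre-activation in middle_tensor_ for the backward pass; the m % 32 == 0 / n % 64 == 0 restriction mirrors the tensor-core GPU kernel this CPU path shadows. The fused epilogue on its own, in float for clarity (add_bias_and_relu_ref is an illustrative name):

#include <algorithm>
#include <cstddef>

// Fused bias + ReLU epilogue: `middle` keeps the pre-activation value
// (needed by bprop), `top` keeps the rectified output.
void add_bias_and_relu_ref(float* top, float* middle, const float* bias,
                           size_t m, size_t n) {
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < n; ++j) {
      float t = top[i * n + j] + bias[j];
      middle[i * n + j] = t;
      top[i * n + j] = std::max(t, 0.f);
    }
  }
}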
- -void FusedFullyConnectedLayerCPU::fprop(bool is_train) { - const __half* kernel = weights_half_[0].get_ptr(); - const __half* bias = weights_half_[1].get_ptr(); - const __half* bottom = get_bottom_tensor(is_train).get_ptr(); - __half* middle = middle_tensor_.get_ptr(); - __half* top = top_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor(is_train).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t m = bottom_tensor_dim[0]; - size_t n = top_tensor_dim[1]; - size_t k = bottom_tensor_dim[1]; - - cpu_mm(top, bottom, false, kernel, false, m, k, n); - cpu_add_bias_and_re(top, middle, bias, m, n); -} - -void FusedFullyConnectedLayerCPU::bprop() {} - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp b/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp deleted file mode 100644 index 74e3ea977f..0000000000 --- a/HugeCTR/src/cpu/layers/interaction_layer_cpu.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace { - -template -struct Log2 { - static constexpr uint value = 1 + Log2::value; -}; -template <> -struct Log2<1> { - static constexpr uint value = 0; -}; - -struct __align__(8) half4 { - half2 vals[2]; -}; - -template -void concat_cpu(size_t height, size_t in_width, size_t out_width, size_t n_ins, size_t n_emb, - bool fprop, T *h_concat, T *h_in_mlp, T *h_in_emb) { - for (size_t ni = 0; ni < n_ins; ni++) { - for (size_t h = 0; h < height; h++) { - size_t in_idx_base = (ni == 0) ? h * in_width : h * in_width * n_emb; - for (size_t w = 0; w < in_width; w++) { - size_t in_idx = in_idx_base + w; - size_t out_idx = h * out_width + ni * in_width + w; - if (fprop) { - h_concat[out_idx] = (ni == 0) ? 
h_in_mlp[in_idx] : h_in_emb[(ni - 1) * in_width + in_idx]; - } else { - if (ni == 0) { - h_in_mlp[in_idx] = h_in_mlp[in_idx] + h_concat[out_idx]; - } else { - h_in_emb[in_idx + (ni - 1) * in_width] = h_concat[out_idx]; - } - } - } - } - } -} - -template -void matmul_cpu(size_t height, size_t in_width, size_t n_ins, T *h_concat, T *h_mat) { - for (size_t p = 0; p < height; p++) { - size_t concat_stride = n_ins * in_width * p; - size_t mat_stride = n_ins * n_ins * p; - for (size_t m = 0; m < n_ins; m++) { - for (size_t n = 0; n < n_ins; n++) { - float accum = 0.0f; - for (size_t k = 0; k < in_width; k++) { - accum += __half2float(h_concat[concat_stride + m * in_width + k] * - h_concat[concat_stride + n * in_width + k]); - } - h_mat[mat_stride + m * n_ins + n] = accum; - } - } - } -} - -template -void gather_concat_cpu(size_t height, size_t in_width, size_t n_ins, T *h_in_mlp, T *h_mat, - T *h_ref) { - size_t out_len = in_width + (n_ins * (n_ins + 1) / 2 - n_ins) + 1; - for (size_t p = 0; p < height; p++) { - size_t cur_idx = 0; - size_t out_stride = p * out_len; - size_t mat_stride = p * n_ins * n_ins; - for (size_t i = 0; i < in_width; i++) { - h_ref[out_stride + cur_idx++] = h_in_mlp[p * in_width + i]; - } - for (size_t n = 0; n < n_ins; n++) { - for (size_t m = 0; m < n_ins; m++) { - if (n > m) { - h_ref[out_stride + cur_idx++] = h_mat[mat_stride + m * n_ins + n]; - } - } - } - } -} - -} // anonymous namespace - -template -InteractionLayerCPU::InteractionLayerCPU( - const Tensor2 &in_bottom_mlp_tensor, const Tensor2 &in_embeddings, Tensor2 &out_tensor, - const std::shared_ptr> &blobs_buff, bool use_mixed_precision) - : LayerCPU(), use_mixed_precision_(use_mixed_precision) { - try { - auto first_in_dims = in_bottom_mlp_tensor.get_dimensions(); - auto second_in_dims = in_embeddings.get_dimensions(); - - if (first_in_dims.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "Input Bottom MLP must be a 2D tensor"); - } - - if (second_in_dims.size() != 3) { - HCTR_OWN_THROW(Error_t::WrongInput, "Input Embeddings must be a 3D tensor"); - } - - if (first_in_dims[0] != second_in_dims[0]) { - HCTR_OWN_THROW(Error_t::WrongInput, "the input tensors' batch sizes must be the same"); - } - - if (first_in_dims[1] != second_in_dims[2]) { - HCTR_OWN_THROW(Error_t::WrongInput, "the input tensors' widths must be the same"); - } - - size_t n_ins = 1 + second_in_dims[1]; - if (std::is_same::value == false) { - size_t concat_dims_width = first_in_dims[1] + second_in_dims[1] * second_in_dims[2]; - std::vector concat_dims = {first_in_dims[0], concat_dims_width}; - - { - Tensor2 tensor; - blobs_buff->reserve(concat_dims, &tensor); - internal_tensors_.push_back(tensor); - } - { - std::vector mat_dims = {first_in_dims[0], n_ins * n_ins}; - Tensor2 tensor; - blobs_buff->reserve(mat_dims, &tensor); - internal_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - blobs_buff->reserve(concat_dims, &tensor); - internal_tensors_.push_back(tensor); - } - } - - int concat_len = n_ins * (n_ins + 1) / 2 - n_ins; - std::vector out_dims = {first_in_dims[0], first_in_dims[1] + concat_len + 1}; - blobs_buff->reserve(out_dims, &out_tensor); - - in_tensors_.push_back(in_bottom_mlp_tensor); - in_tensors_.push_back(in_embeddings); - out_tensors_.push_back(out_tensor); - - } catch (const std::runtime_error &rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -template -InteractionLayerCPU::~InteractionLayerCPU(){}; - -template -void InteractionLayerCPU::fprop(bool is_train) { - T *concat = 
internal_tensors_[0].get_ptr(); - T *in_mlp = get_in_tensors(is_train)[0].get_ptr(); - T *in_emb = get_in_tensors(is_train)[1].get_ptr(); - T *mat = internal_tensors_[1].get_ptr(); - T *gather = out_tensors_[0].get_ptr(); - size_t h = internal_tensors_[0].get_dimensions()[0]; - size_t out_w = internal_tensors_[0].get_dimensions()[1]; - size_t in_w = get_in_tensors(is_train)[0].get_dimensions()[1]; - size_t n_emb = get_in_tensors(is_train)[1].get_dimensions()[1]; - size_t n_ins = 1 + n_emb; - - concat_cpu(h, in_w, out_w, n_ins, n_emb, true, concat, in_mlp, in_emb); - matmul_cpu(h, in_w, n_ins, concat, mat); - gather_concat_cpu(h, in_w, n_ins, in_mlp, mat, gather); -} - -template -void InteractionLayerCPU::bprop() {} - -template <> -void InteractionLayerCPU<__half>::bprop() {} - -template class InteractionLayerCPU; -template class InteractionLayerCPU<__half>; - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/multi_cross_layer_cpu.cpp b/HugeCTR/src/cpu/layers/multi_cross_layer_cpu.cpp deleted file mode 100644 index 1b68aff0b6..0000000000 --- a/HugeCTR/src/cpu/layers/multi_cross_layer_cpu.cpp +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -namespace HugeCTR { - -namespace { - -void matrix_vec_mul(float* out, const float* in_m, const float* in_v, size_t h, size_t w) { - for (size_t j = 0; j < h; j++) { - out[j] = 0.0f; - for (size_t i = 0; i < w; i++) { - size_t k = j * w + i; - out[j] += in_m[k] * in_v[i]; - } - } -} - -void row_scaling(float* out, const float* in_m, const float* in_v, size_t h, size_t w) { - for (size_t j = 0; j < h; j++) { - for (size_t i = 0; i < w; i++) { - size_t k = j * w + i; - out[k] = in_m[k] * in_v[j]; - } - } -} - -void matrix_add(float* out, const float* in_m_1, const float* in_m_2, size_t h, size_t w) { - for (size_t j = 0; j < h; j++) { - for (size_t i = 0; i < w; i++) { - size_t k = j * w + i; - out[k] = in_m_1[k] + in_m_2[k]; - } - } -} - -void matrix_vec_add(float* out, const float* in_m, const float* in_v, size_t h, size_t w) { - for (size_t j = 0; j < h; j++) { - for (size_t i = 0; i < w; i++) { - size_t k = j * w + i; - out[k] = in_m[k] + in_v[i]; - } - } -} - -void multi_cross_fprop_cpu(int layers, size_t batchsize, size_t w, float** h_outputs, - float* h_input, float** h_hiddens, float** h_kernels, float** h_biases) { - for (int i = 0; i < layers; i++) { - matrix_vec_mul(h_hiddens[i], i == 0 ? h_input : h_outputs[i - 1], h_kernels[i], batchsize, w); - row_scaling(h_outputs[i], h_input, h_hiddens[i], batchsize, w); - matrix_add(h_outputs[i], h_outputs[i], i == 0 ? 
h_input : h_outputs[i - 1], batchsize, w); - matrix_vec_add(h_outputs[i], h_outputs[i], h_biases[i], batchsize, w); - } -} - -} // namespace - -MultiCrossLayerCPU::MultiCrossLayerCPU( - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, int num_layers) - : LayerCPU(), num_layers_(num_layers) { - try { - // check the in_tensor and out_tensor - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - // 1. two dim? - if (in_tensor_dim.size() != 2 || out_tensor_dim.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor doesn't has two dimensions"); - } - // 2. same dim? - for (int i = 0; i < 2; i++) { - if (in_tensor_dim[i] != out_tensor_dim[i]) { - HCTR_OWN_THROW(Error_t::WrongInput, "input and output tensor doesn't match"); - } - } - size_t vec_length = in_tensor_dim[1]; - size_t batchsize = in_tensor_dim[0]; - - // check num_lyaers - if (num_layers < 1) { - HCTR_OWN_THROW(Error_t::WrongInput, "num_layers < 1"); - } - - std::vector weight_bias_dim = {1, vec_length}; - for (int i = 0; i < num_layers; i++) { - // setup weights - { - Tensor2 tensor; - weight_buff->reserve(weight_bias_dim, &tensor); - weights_.push_back(tensor); - } - // setup bias - { - Tensor2 tensor; - weight_buff->reserve(weight_bias_dim, &tensor); - weights_.push_back(tensor); - } - // setup weight gradient - { - Tensor2 tensor; - wgrad_buff->reserve(weight_bias_dim, &tensor); - wgrad_.push_back(tensor); - } - // setup bias gradient - { - Tensor2 tensor; - wgrad_buff->reserve(weight_bias_dim, &tensor); - wgrad_.push_back(tensor); - } - } - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - // setup blobs - - std::vector blob_dim = {batchsize, vec_length}; - blob_tensors_.push_back(in_tensor); - for (int i = 0; i < num_layers - 1; i++) { - Tensor2 tensor; - blobs_buff->reserve(blob_dim, &tensor); - blob_tensors_.push_back(tensor); - } - blob_tensors_.push_back(out_tensor); - - for (int i = 0; i < 3; i++) { - blobs_buff->reserve(blob_dim, &tmp_mat_tensors_[i]); - } - std::vector tmp_vec_dim = {batchsize, 1}; - blobs_buff->reserve(tmp_vec_dim, &tmp_vec_tensor_); - for (int i = 0; i < num_layers; i++) { - Tensor2 tensor; - blobs_buff->reserve(tmp_vec_dim, &tensor); - vec_tensors_.push_back(tensor); - } - - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -void MultiCrossLayerCPU::fprop(bool is_train) { - size_t vec_length = in_tensors_[0].get_dimensions()[1]; - size_t batchsize = in_tensors_[0].get_dimensions()[0]; - Tensors2 kernel_tensors; - Tensors2 bias_tensors; - Tensors2 output_tensors; - Tensors2 hidden_tensors; - - for (int i = 0; i < num_layers_; i++) { - kernel_tensors.push_back(weights_[2 * i]); - bias_tensors.push_back(weights_[2 * i + 1]); - } - - for (int i = 0; i < num_layers_; i++) { - output_tensors.push_back(blob_tensors_[i + 1]); - hidden_tensors.push_back(vec_tensors_[i]); - } - std::vector h_hiddens; - std::vector h_kernels; - std::vector h_biases; - std::vector h_outputs; - for (int i = 0; i < num_layers_; i++) { - h_hiddens.push_back(hidden_tensors[i].get_ptr()); - h_kernels.push_back(kernel_tensors[i].get_ptr()); - h_biases.push_back(bias_tensors[i].get_ptr()); - h_outputs.push_back(output_tensors[i].get_ptr()); - } - multi_cross_fprop_cpu(num_layers_, batchsize, vec_length, h_outputs.data(), - 
blob_tensors_[0].get_ptr(), h_hiddens.data(), h_kernels.data(), - h_biases.data()); -} - -void MultiCrossLayerCPU::bprop() {} - -} // namespace HugeCTR diff --git a/HugeCTR/src/cpu/layers/reduce_sum_layer_cpu.cpp b/HugeCTR/src/cpu/layers/reduce_sum_layer_cpu.cpp deleted file mode 100644 index d8060eece2..0000000000 --- a/HugeCTR/src/cpu/layers/reduce_sum_layer_cpu.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace { - -template -void reduce_sum_cpu(const T* input, T* output, std::vector dims, int axis) { - if (axis == 0) { - if (dims.size() == 1) { - for (size_t i = 0; i < dims[0]; i++) { - output[0] = input[i]; - } - } else if (dims.size() == 2) { - for (size_t k = 0; k < dims[1]; k++) { - output[k] = 0.0f; - for (size_t i = 0; i < dims[0]; i++) { - output[k] = output[k] + input[i * dims[1] + k]; - } - } - } else if (dims.size() == 3) { - for (size_t j = 0; j < dims[1]; j++) { - for (size_t k = 0; k < dims[2]; k++) { - output[j * dims[2] + k] = 0.0f; - for (size_t i = 0; i < dims[0]; i++) { - output[j * dims[2] + k] = - output[j * dims[2] + k] + input[i * dims[1] * dims[2] + j * dims[2] + k]; - } - } - } - } - } else if (axis == 1) { - if (dims.size() == 2) { - for (size_t i = 0; i < dims[0]; i++) { - output[i] = 0.0f; - for (size_t j = 0; j < dims[1]; j++) { - output[i] = output[i] + input[i * dims[1] + j]; - } - } - } else if (dims.size() == 3) { - for (size_t i = 0; i < dims[0]; i++) { - for (size_t k = 0; k < dims[2]; k++) { - output[i * dims[2] + k] = 0.0f; - for (size_t j = 0; j < dims[1]; j++) { - output[i * dims[2] + k] = - output[i * dims[2] + k] + input[i * dims[1] * dims[2] + j * dims[2] + k]; - } - } - } - } - } else if (axis == 2) { - for (size_t i = 0; i < dims[0]; i++) { - for (size_t j = 0; j < dims[1]; j++) { - output[i * dims[1] + j] = 0.0f; - for (size_t k = 0; k < dims[2]; k++) { - output[i * dims[1] + j] = - output[i * dims[1] + j] + input[i * dims[1] * dims[2] + j * dims[2] + k]; - } - } - } - } -} - -template -void reduce_sum_dgrad_cpu(const T* top_grad, T* dgrad, std::vector dims, int axis) { - if (axis == 0) { - if (dims.size() == 2) { - for (size_t j = 0; j < dims[1]; j++) { - for (size_t i = 0; i < dims[0]; i++) { - dgrad[i * dims[1] + j] = top_grad[j]; - } - } - } else if (dims.size() == 3) { - for (size_t j = 0; j < dims[1]; j++) { - for (size_t k = 0; k < dims[2]; k++) { - for (size_t i = 0; i < dims[0]; i++) { - dgrad[i * dims[1] * dims[2] + j * dims[2] + k] = top_grad[j * dims[2] + k]; - } - } - } - } - } else if (axis == 1) { - if (dims.size() == 2) { - for (size_t i = 0; i < dims[0]; i++) { - for (size_t j = 0; j < dims[1]; j++) { - dgrad[i * dims[1] + j] = top_grad[i]; - } - } - } else if (dims.size() == 3) { - for (size_t i = 0; i < dims[0]; i++) { - for (size_t k = 0; k < dims[2]; k++) { - for (size_t j = 0; j < dims[1]; j++) { - dgrad[i * dims[1] * dims[2] + j * dims[2] + k] = 
diff --git a/HugeCTR/src/cpu/layers/relu_layer_cpu.cpp b/HugeCTR/src/cpu/layers/relu_layer_cpu.cpp
deleted file mode 100644
index da48ba07fa..0000000000
--- a/HugeCTR/src/cpu/layers/relu_layer_cpu.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-namespace {
-
-template <typename T>
-void relu_cpu(T* top, const T* bottom, int len) {
-  for (int i = 0; i < len; ++i) {
-    if (bottom[i] > T(0.)) {
-      top[i] = bottom[i];
-    } else {
-      top[i] = T(0.);
-    }
-  }
-}
-
-template <typename T>
-void relu_bprop_cpu(T* d_bottom, const T* d_top, const T* bottom, int len) {
-  for (int i = 0; i < len; ++i) {
-    if (bottom[i] > T(0.)) {
-      d_bottom[i] = d_top[i];
-    } else {
-      d_bottom[i] = T(0.);
-    }
-  }
-}
-
-}  // namespace
-
-template <typename T>
-ReluLayerCPU<T>::ReluLayerCPU(const Tensor2<T>& in_tensor, const Tensor2<T>& out_tensor)
-    : LayerCPU() {
-  assert(in_tensor.get_num_elements() == out_tensor.get_num_elements());
-  assert(in_tensor.get_num_elements() % 2 == 0);
-
-  in_tensors_.push_back(in_tensor);
-  out_tensors_.push_back(out_tensor);
-}
-
-template <typename T>
-void ReluLayerCPU<T>::fprop(bool is_train) {
-  int len = in_tensors_[0].get_num_elements();
-
-  relu_cpu(out_tensors_[0].get_ptr(), in_tensors_[0].get_ptr(), len);
-}
-
-template <typename T>
-void ReluLayerCPU<T>::bprop() {}
-
-template class ReluLayerCPU<float>;
-template class ReluLayerCPU<__half>;
-
-}  // namespace HugeCTR
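The deleted backward rule is the standard ReLU mask: gradients pass only where the forward input was strictly positive, so the gradient at exactly zero is dropped. A standalone spot check of that pairing (values are illustrative):

    #include <cassert>

    int main() {
      const float bottom[4] = {-1.f, 2.f, 0.f, 3.f};
      const float d_top[4] = {10.f, 10.f, 10.f, 10.f};
      float d_bottom[4];
      for (int i = 0; i < 4; ++i) d_bottom[i] = bottom[i] > 0.f ? d_top[i] : 0.f;
      assert(d_bottom[0] == 0.f && d_bottom[1] == 10.f);
      assert(d_bottom[2] == 0.f && d_bottom[3] == 10.f);  // grad at 0 is masked
      return 0;
    }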
diff --git a/HugeCTR/src/cpu/layers/reshape_layer_cpu.cpp b/HugeCTR/src/cpu/layers/reshape_layer_cpu.cpp
deleted file mode 100644
index 5b2fb15021..0000000000
--- a/HugeCTR/src/cpu/layers/reshape_layer_cpu.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-namespace {
-
-template <typename T>
-void reshape_fprop_cpu(int batch_size, int n_slot, int vector_length, size_t num_elements,
-                       std::vector<int> selected, T* h_in, T* h_ref) {
-  int n_active_slot = selected.empty() ? n_slot : int(selected.size());
-  if (selected.empty()) {
-    for (size_t i = 0; i < num_elements; i++) {
-      h_ref[i] = h_in[i];
-    }
-  } else {
-    for (int i = 0; i < batch_size; i++) {
-      for (int j = 0; j < n_active_slot; j++) {
-        for (int k = 0; k < vector_length; k++) {
-          int in_idx = i * (n_slot * vector_length) + selected[j] * vector_length + k;
-          int out_idx = i * (n_active_slot * vector_length) + j * vector_length + k;
-          h_ref[out_idx] = h_in[in_idx];
-        }
-      }
-    }
-  }
-}
-
-}  // anonymous namespace
-
-template <typename T>
-ReshapeLayerCPU<T>::ReshapeLayerCPU(
-    const Tensor2<T>& in_tensor, Tensor2<T>& out_tensor,
-    const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blobs_buff, size_t leading_dim)
-    : LayerCPU(),
-      in_place_(true),
-      batch_size_(0),
-      n_slot_(0),
-      vector_length_(0),
-      n_active_slot_(0) {
-  try {
-    const std::vector<size_t>& in_dims = in_tensor.get_dimensions();
-    int im_idx = in_dims.size() - 1;
-    if (leading_dim < in_dims[im_idx] || leading_dim % in_dims[im_idx] != 0) {
-      HCTR_OWN_THROW(Error_t::WrongInput,
-                     "leading_dim < in_dims[im_idx] or leading_dim % in_dims[im_idx] != 0");
-    }
-
-    size_t n_in_elems = in_tensor.get_num_elements();
-    if (leading_dim > n_in_elems) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "leading_dim cannot be bigger than n_in_elems");
-    }
-
-    if (n_in_elems % leading_dim != 0) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "n_in_elems % leading_dim != 0");
-    }
-
-    size_t trailing_dim = n_in_elems / leading_dim;
-    std::vector<size_t> out_dims = {trailing_dim, leading_dim};
-
-    blobs_buff->reserve(out_dims, &out_tensor);
-
-    in_tensors_.push_back(in_tensor);
-    out_tensors_.push_back(out_tensor);
-
-  } catch (const std::runtime_error& rt_err) {
-    HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl;
-    throw;
-  }
-}
-
-template <typename T>
-ReshapeLayerCPU<T>::ReshapeLayerCPU(
-    const Tensor2<T>& in_tensor, Tensor2<T>& out_tensor,
-    const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blobs_buff, std::vector<int>& selected)
-    : LayerCPU(),
-      in_place_(selected.empty()),
-      batch_size_(0),
-      n_slot_(0),
-      vector_length_(0),
-      n_active_slot_(selected.size()),
-      selected_(selected) {
-  try {
-    const std::vector<size_t>& in_dims = in_tensor.get_dimensions();
-    if (in_dims[1] < n_active_slot_) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "selected is invalid");
-    }
-
-    size_t in_dims_1 = selected.empty() ? in_dims[1] : n_active_slot_;
-    std::vector<size_t> out_dims = {in_dims[0], in_dims_1 * in_dims[2]};
-    blobs_buff->reserve(out_dims, &out_tensor);
-
-    if (!in_place_) {
-      unsigned int i = 0;
-      for (; i < in_dims.size() - 2; i++) batch_size_ += in_dims[i];
-      n_slot_ = in_dims[i++];
-      vector_length_ = in_dims[i];
-
-      blobs_buff->reserve({n_active_slot_}, &selected_tensor_);
-    }
-    in_tensors_.push_back(in_tensor);
-    out_tensors_.push_back(out_tensor);
-
-  } catch (const std::runtime_error& rt_err) {
-    HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl;
-    throw;
-  }
-}
-
-template <typename T>
-void ReshapeLayerCPU<T>::fprop(bool is_train) {
-  T* h_in = in_tensors_[0].get_ptr();
-  T* h_out = out_tensors_[0].get_ptr();
-  size_t num_elements = in_tensors_[0].get_num_elements();
-  if (in_place_) {
-    for (size_t i = 0; i < num_elements; i++) {
-      h_out[i] = h_in[i];
-    }
-  } else {
-    reshape_fprop_cpu(batch_size_, n_slot_, vector_length_, num_elements, selected_, h_in, h_out);
-  }
-}
-
-template <typename T>
-void ReshapeLayerCPU<T>::bprop() {}
-
-template class ReshapeLayerCPU<float>;
-template class ReshapeLayerCPU<__half>;
-
-}  // namespace HugeCTR
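Note that the selected path above is a slot gather rather than a pure reshape: for input of shape (batch, n_slot, vec) it keeps only the chosen slots and flattens the result to (batch, n_active_slot * vec). A worked single-sample example under those assumptions (3 slots of vector length 2, selected = {0, 2}):

    #include <cassert>
    #include <vector>

    int main() {
      // One sample, slots {a, b, c}, each of vector length 2.
      std::vector<float> in{1, 1, 2, 2, 3, 3};
      const std::vector<int> selected{0, 2};
      const int vec = 2;
      std::vector<float> out;
      for (int slot : selected)
        for (int k = 0; k < vec; ++k) out.push_back(in[slot * vec + k]);
      assert((out == std::vector<float>{1, 1, 3, 3}));  // slot b dropped
      return 0;
    }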
diff --git a/HugeCTR/src/cpu/layers/sigmoid_layer_cpu.cpp b/HugeCTR/src/cpu/layers/sigmoid_layer_cpu.cpp
deleted file mode 100644
index 2521564537..0000000000
--- a/HugeCTR/src/cpu/layers/sigmoid_layer_cpu.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-namespace {
-
-template <typename T>
-void sigmoid_cpu(T* top, const T* bottom, int len) {
-  for (int i = 0; i < len; ++i) {
-    top[i] = T(1.) / (T(1.) + exp(-bottom[i]));
-  }
-}
-
-template <>
-void sigmoid_cpu(__half* top, const __half* bottom, int len) {
-  for (int i = 0; i < len; ++i) {
-    top[i] = __float2half(1.0 / (1.0 + exp(-__half2float(bottom[i]))));
-  }
-}
-
-template <typename T>
-void sigmoid_bprop_cpu(T* d_bottom, const T* d_top, const T* bottom, int len) {
-  for (int i = 0; i < len; ++i) {
-    T y = T(1.) / (T(1.) + exp(-bottom[i]));
-    d_bottom[i] = d_top[i] * y * (T(1.) - y);
-  }
-}
-
-template <>
-void sigmoid_bprop_cpu(__half* d_bottom, const __half* d_top, const __half* bottom, int len) {
-  for (int i = 0; i < len; ++i) {
-    float y = 1.0 / (1.0 + exp(-__half2float(bottom[i])));
-    d_bottom[i] = __float2half(__half2float(d_top[i]) * y * (1.0 - y));
-  }
-}
-
-}  // end namespace
-
-template <typename T>
-SigmoidLayerCPU<T>::SigmoidLayerCPU(const Tensor2<T>& in_tensor, const Tensor2<T>& out_tensor)
-    : LayerCPU() {
-  assert(in_tensor.get_num_elements() == out_tensor.get_num_elements());
-  assert(in_tensor.get_num_elements() % 2 == 0);
-
-  in_tensors_.push_back(in_tensor);
-  out_tensors_.push_back(out_tensor);
-}
-
-template <typename T>
-void SigmoidLayerCPU<T>::fprop(bool is_train) {
-  int len = in_tensors_[0].get_num_elements();
-
-  sigmoid_cpu(out_tensors_[0].get_ptr(), in_tensors_[0].get_ptr(), len);
-}
-
-template <typename T>
-void SigmoidLayerCPU<T>::bprop() {}
-
-template class SigmoidLayerCPU<float>;
-template class SigmoidLayerCPU<__half>;
-
-}  // namespace HugeCTR
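The backward pass above relies on the identity sigma'(x) = sigma(x) * (1 - sigma(x)), which is why y is recomputed from bottom instead of being cached from the forward pass. A quick numeric spot check of the identity against a central finite difference:

    #include <cassert>
    #include <cmath>

    int main() {
      const double x = 0.5, eps = 1e-6;
      const auto sig = [](double v) { return 1.0 / (1.0 + std::exp(-v)); };
      const double analytic = sig(x) * (1.0 - sig(x));
      const double numeric = (sig(x + eps) - sig(x - eps)) / (2.0 * eps);
      assert(std::fabs(analytic - numeric) < 1e-8);  // identity holds
      return 0;
    }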
diff --git a/HugeCTR/src/cpu/layers/slice_layer_cpu.cpp b/HugeCTR/src/cpu/layers/slice_layer_cpu.cpp
deleted file mode 100644
index c800219fa0..0000000000
--- a/HugeCTR/src/cpu/layers/slice_layer_cpu.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-namespace {
-
-template <typename T>
-void slice_fprop_cpu(size_t height, size_t width, std::vector<std::pair<int, int>>& ranges,
-                     size_t n_outs, T* h_in, T** h_refs) {
-  int i = 0;
-  for (auto& range : ranges) {
-    int out_width = range.second - range.first;
-    for (size_t r = 0; r < height; r++) {
-      for (int c = range.first; c < range.second; c++) {
-        int in_idx = r * width + c;
-        int out_idx = r * out_width + c - range.first;
-        h_refs[i][out_idx] = h_in[in_idx];
-      }
-    }
-    i++;
-  }
-}
-
-}  // anonymous namespace
-
-template <typename T>
-SliceLayerCPU<T>::SliceLayerCPU(const Tensor2<T>& in_tensor, Tensors2<T>& out_tensors,
-                                const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blobs_buff,
-                                std::vector<std::pair<int, int>>& ranges)
-    : LayerCPU(), virt_w_(0), ranges_(ranges) {
-  try {
-    if (ranges.empty()) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "Empty slice ranges are not allowed");
-    }
-
-    if (!out_tensors.empty()) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "output tensor vector must be empty");
-    }
-
-    auto in_dims = in_tensor.get_dimensions();
-    if (in_dims.size() != 2) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "Only 2D tensors can be sliced");
-    }
-
-    size_t height = in_dims[0];
-    int in_w = in_dims[1];
-    int prev_min = -1;
-    int prev_max = 0;
-    for (auto& range : ranges) {
-      int cur_min = range.first;
-      int cur_max = range.second;
-      if (cur_min >= cur_max) {
-        HCTR_OWN_THROW(Error_t::WrongInput, "Reverse range is not allowed");
-      }
-      if (cur_min < 0 || cur_max < 0) {
-        HCTR_OWN_THROW(Error_t::WrongInput, "Negative ranges are not allowed");
-      }
-      if (!(prev_min <= cur_min && prev_max <= cur_max)) {
-        HCTR_OWN_THROW(Error_t::WrongInput, "A range cannot be out of order nor included in another");
-      }
-      if (cur_min >= in_w || cur_max > in_w) {
-        HCTR_OWN_THROW(Error_t::WrongInput, "Ranges cannot be bigger than the input width");
-      }
-      size_t out_w = cur_max - cur_min;
-      std::vector<size_t> out_dims = {height, out_w};
-      {
-        Tensor2<T> tensor;
-        blobs_buff->reserve(out_dims, &tensor);
-        out_tensors.push_back(tensor);
-      }
-      sts_.push_back(cur_min);
-      virt_w_ += out_w;
-
-      prev_min = cur_min;
-      prev_max = cur_max;
-    }
-
-    in_tensors_.push_back(in_tensor);
-    for (auto& out_tensor : out_tensors) {
-      out_tensors_.push_back(out_tensor);
-    }
-
-  } catch (const std::runtime_error& rt_err) {
-    HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl;
-    throw;
-  }
-}
-
-template <typename T>
-void SliceLayerCPU<T>::fprop(bool is_train) {
-  T* in = in_tensors_[0].get_ptr();
-  size_t n_out_tensors = out_tensors_.size();
-  std::vector<T*> out;
-  for (auto out_tensor : out_tensors_) {
-    out.push_back(out_tensor.get_ptr());
-  }
-  size_t height = in_tensors_[0].get_dimensions()[0];
-  size_t width = in_tensors_[0].get_dimensions()[1];
-  slice_fprop_cpu(height, width, ranges_, n_out_tensors, in, out.data());
-}
-
-template <typename T>
-void SliceLayerCPU<T>::bprop() {}
-
-template class SliceLayerCPU<float>;
-template class SliceLayerCPU<__half>;
-
-}  // namespace HugeCTR
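The validation above permits overlapping ranges as long as both endpoints are non-decreasing, so a width-5 row can legally produce two overlapping slices. A small illustration with ranges {[0, 3), [2, 5)} (assumed values, standard-library only):

    #include <cassert>
    #include <utility>
    #include <vector>

    int main() {
      const std::vector<float> in{0, 1, 2, 3, 4};  // one row, width 5
      const std::vector<std::pair<int, int>> ranges{{0, 3}, {2, 5}};
      std::vector<std::vector<float>> outs;
      for (const auto& r : ranges)  // columns [first, second) of the row
        outs.emplace_back(in.begin() + r.first, in.begin() + r.second);
      assert((outs[0] == std::vector<float>{0, 1, 2}));
      assert((outs[1] == std::vector<float>{2, 3, 4}));  // overlap at column 2
      return 0;
    }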
diff --git a/HugeCTR/src/cpu/layers/weight_multiply_layer_cpu.cpp b/HugeCTR/src/cpu/layers/weight_multiply_layer_cpu.cpp
deleted file mode 100644
index 46c8caf6cc..0000000000
--- a/HugeCTR/src/cpu/layers/weight_multiply_layer_cpu.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-namespace {
-
-template <typename T>
-void weight_multiply_cpu(const T* input, const T* weight, T* output, int batch_size, int slot_num,
-                         int embedding_vec_size) {
-  for (int i = 0; i < batch_size; i++) {
-    for (int j = 0; j < slot_num; j++) {
-      for (int k = 0; k < embedding_vec_size; k++) {
-        output[i * slot_num * embedding_vec_size + j * embedding_vec_size + k] =
-            input[i * slot_num + j] * weight[j * embedding_vec_size + k];
-      }
-    }
-  }
-}
-
-template <typename T>
-void weight_multiply_wgrad_cpu(const T* top_grad, const T* input, T* wgrad, int batch_size,
-                               int slot_num, int embedding_vec_size) {
-  int len_w = slot_num * embedding_vec_size;
-  for (int i = 0; i < len_w; i++) {
-    double tmp = 0.0;
-    for (int j = 0; j < batch_size; j++) {
-      tmp += (double)input[j * slot_num + i / embedding_vec_size] * (double)top_grad[j * len_w + i];
-    }
-    wgrad[i] = (T)tmp;
-  }
-}
-
-template <typename T>
-void weight_multiply_dgrad_cpu(const T* top_grad, const T* weight, T* dgrad, int batch_size,
-                               int slot_num, int embedding_vec_size) {
-  for (int i = 0; i < batch_size; i++) {
-    for (int j = 0; j < slot_num; j++) {
-      T tmp = T(0.0);
-      for (int k = 0; k < embedding_vec_size; k++) {
-        tmp = tmp + T(top_grad[i * slot_num * embedding_vec_size + j * embedding_vec_size + k] *
-                      weight[j * embedding_vec_size + k]);
-      }
-      dgrad[i * slot_num + j] = tmp;
-    }
-  }
-}
-
-}  // end of namespace
-
-template <typename T>
-WeightMultiplyLayerCPU<T>::WeightMultiplyLayerCPU(
-    const std::shared_ptr<BufferBlock2<T>>& weight_buff,
-    const std::shared_ptr<BufferBlock2<T>>& wgrad_buff,
-    const std::shared_ptr<GeneralBuffer2<HostAllocator>>& blob_buff, const Tensor2<T>& in_tensor,
-    Tensor2<T>& out_tensor, const std::vector<size_t>& weight_dims)
-    : LayerCPU() {
-  try {
-    const auto& in_dims = in_tensor.get_dimensions();
-    if (in_dims.size() != 2) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "Only 2D tensors can be multiplied");
-    }
-    if (weight_dims.size() != 2) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "Only 2D weights are allowed for the weight_multiply layer");
-    }
-    if (weight_dims[0] != in_dims[1]) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "weight_dims[0] must be equal to in_dims[1]");
-    }
-
-    batch_size_ = in_dims[0];
-    slot_num_ = weight_dims[0];
-    embedding_vec_size_ = weight_dims[1];
-
-    std::vector<size_t> out_dims{batch_size_, slot_num_ * embedding_vec_size_};
-    blob_buff->reserve(out_dims, &out_tensor);
-    in_tensors_.push_back(in_tensor);
-    out_tensors_.push_back(out_tensor);
-
-    {
-      Tensor2<T> tensor;
-      weight_buff->reserve(weight_dims, &tensor);
-      weights_.push_back(tensor);
-    }
-    {
-      Tensor2<T> tensor;
-      wgrad_buff->reserve(weight_dims, &tensor);
-      wgrad_.push_back(tensor);
-    }
-
-    blob_buff->reserve(out_dims, &wgrad_tmp_trans_);
-
-  } catch (const std::runtime_error& rt_err) {
-    HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl;
-    throw;
-  }
-}
-
-template <typename T>
-void WeightMultiplyLayerCPU<T>::fprop(bool is_train) {
-  T* input = in_tensors_[0].get_ptr();
-  T* weight = weights_[0].get_ptr();
-  T* output = out_tensors_[0].get_ptr();
-  weight_multiply_cpu(input, weight, output, batch_size_, slot_num_, embedding_vec_size_);
-}
-
-template <typename T>
-void WeightMultiplyLayerCPU<T>::bprop() {}
-
-template class WeightMultiplyLayerCPU<float>;
-template class WeightMultiplyLayerCPU<__half>;
-
-}  // namespace HugeCTR
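The forward rule above scales each slot's weight row by that slot's scalar input, output[b][j][k] = input[b][j] * weight[j][k], expanding shape (batch, slot) into (batch, slot * vec). A worked single-sample example (values chosen for illustration):

    #include <cassert>
    #include <vector>

    int main() {
      const int slot_num = 2, vec = 2;
      const std::vector<float> input{2.f, 3.f};                // one sample, two slots
      const std::vector<float> weight{1.f, 10.f, 100.f, 1.f};  // slot-major (2x2)
      std::vector<float> out(slot_num * vec);
      for (int j = 0; j < slot_num; ++j)
        for (int k = 0; k < vec; ++k) out[j * vec + k] = input[j] * weight[j * vec + k];
      assert((out == std::vector<float>{2.f, 20.f, 300.f, 3.f}));
      return 0;
    }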
diff --git a/HugeCTR/src/cpu/network_cpu.cpp b/HugeCTR/src/cpu/network_cpu.cpp
deleted file mode 100644
index ee4464576f..0000000000
--- a/HugeCTR/src/cpu/network_cpu.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-namespace HugeCTR {
-
-NetworkCPU::NetworkCPU(const std::shared_ptr<CPUResource>& cpu_resource, bool use_mixed_precision)
-    : cpu_resource_(cpu_resource), use_mixed_precision_(use_mixed_precision) {}
-
-void NetworkCPU::conv_weight_(Tensor2<__half>& target, const Tensor2<float>& source) {
-  size_t elems = source.get_num_elements();
-  if (target.get_num_elements() != source.get_num_elements())
-    HCTR_OWN_THROW(Error_t::WrongInput, "weight size of target != weight size of source");
-  __half* h_target = target.get_ptr();
-  const float* h_source = source.get_ptr();
-  for (size_t i = 0; i < elems; i++) {
-    h_target[i] = __float2half(h_source[i]);
-  }
-}
-
-void NetworkCPU::predict() {
-  if (use_mixed_precision_) {
-    conv_weight_(weight_tensor_half_, weight_tensor_);
-  }
-  // forward
-  for (auto& layer : layers_) {
-    layer->fprop(false);
-  }
-  return;
-}
-
-void NetworkCPU::load_params_from_model(const std::string& model_file) {
-  std::ifstream model_stream(model_file, std::ifstream::binary);
-  if (!model_stream.is_open()) {
-    std::ostringstream os;
-    os << "Cannot open dense model file (reason: " << std::strerror(errno) << ')';
-    HCTR_OWN_THROW(Error_t::WrongInput, os.str());
-  }
-  model_stream.read((char*)weight_tensor_.get_ptr(), weight_tensor_.get_size_in_bytes());
-  model_stream.close();
-  return;
-}
-
-void NetworkCPU::initialize() {
-  for (auto& layer : layers_) {
-    layer->initialize();
-  }
-}
-
-}  // namespace HugeCTR
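conv_weight_ mirrors the mixed-precision setup of the GPU network: master weights stay in float and are down-converted to half before the forward pass. A host-side sketch of that conversion, assuming a CUDA toolchain where cuda_fp16.h exposes __half and __float2half to host code (true for recent toolkits); the helper name to_half is hypothetical:

    #include <cuda_fp16.h>

    #include <vector>

    // Down-convert a float weight buffer to half, as predict() does once
    // per call when mixed precision is enabled. Sketch only.
    std::vector<__half> to_half(const std::vector<float>& src) {
      std::vector<__half> dst(src.size());
      for (size_t i = 0; i < src.size(); ++i) dst[i] = __float2half(src[i]);
      return dst;
    }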
diff --git a/HugeCTR/src/inference/CMakeLists.txt b/HugeCTR/src/inference/CMakeLists.txt
deleted file mode 100644
index cd62a1624e..0000000000
--- a/HugeCTR/src/inference/CMakeLists.txt
+++ /dev/null
@@ -1,133 +0,0 @@
-#
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# - -cmake_minimum_required(VERSION 3.20) -set(DB_LIB_PATHS "/usr/local/lib" CACHE PATH "Paths to Hiredis/rocksdb lib") - -file(GLOB huge_ctr_inference_src - ../network_buffer_channels.cpp - ../cpu_resource.cpp - ../gpu_resource.cpp - ../resource_manager.cpp - ../resource_managers/resource_manager_core.cpp - ../data_simulator.cu - ../graph_wrapper.cpp - ../layer.cpp - ../layers/batch_norm_layer.cu - ../layers/layer_norm_layer.cu - ../layers/cast_layer.cu - ../layers/concat_layer.cu - ../layers/concat_3d_layer.cu - ../layers/dropout_layer.cu - ../layers/elu_layer.cu - ../layers/fully_connected_layer.cu - ../layers/fully_connected_layer_half.cu - ../layers/fused_fully_connected_layer.cu - ../layers/fused_relu_bias_fully_connected_layer.cu - ../layers/functors/fused_fc_layer_functors.cu - ../layers/functors/fused_gemm_functors.cu - ../layers/mlp_layer.cu - ../layers/interaction_layer.cu - ../layers/relu_layer.cu - ../layers/reshape_layer.cu - ../layers/sigmoid_layer.cu - ../layers/slice_layer.cu - ../layers/fm_order2_layer.cu - ../layers/weight_multiply_layer.cu - ../layers/multi_cross_layer.cu - ../layers/add_layer.cu - ../layers/reduce_sum_layer.cu - ../layers/elementwise_multiply_layer.cu - ../layers/gru_layer.cu - ../layers/matrix_multiply_layer.cu - ../layers/multi_head_attention_layer.cu - ../layers/prelu_dice_layer.cu - ../layers/softmax_layer.cu - ../layers/masked_softmax_layer.cu - ../layers/scale_layer.cu - ../layers/fused_reshape_concat_general_layer.cu - ../layers/fused_reshape_concat_layer.cu - ../layers/sub_layer.cu - ../layers/gather_layer.cu - ../layers/reduce_mean_layer.cu - ../layers/sequence_mask_layer.cu - ../trainable_layer.cpp - ../loss.cu - ../network.cu - ../network.cpp - ../gpu_learning_rate_scheduler.cu - ../metrics.cu - ../optimizers/*.cu - ../optimizer.cpp - ../regularizer.cu - ../regularizers/l1_regularizer.cu - ../regularizers/l2_regularizer.cu - ../regularizers/no_regularizer.cu - ../parsers/solver_parser.cpp - ../parsers/learning_rate_scheduler_parser.cpp - ../parsers/create_optimizer.cpp - ../parsers/create_network.cpp - ../parsers/inference_parser.cpp - ../diagnose.cu - ../pipeline.cpp - embedding_feature_combiner.cu - inference_session.cpp - ../io/filesystem.cpp - ../io/hadoop_filesystem.cpp - ../io/s3_filesystem.cpp - ../io/local_filesystem.cpp - ../io/gcs_filesystem.cpp - ../network_buffer_channels.cpp -) - -add_library(huge_ctr_inference SHARED ${huge_ctr_inference_src}) - -if(ENABLE_HDFS) - target_link_libraries( - huge_ctr_inference - PUBLIC - ${DB_LIB_PATHS}/libhdfs.so # from Hugectr - ) -endif() - -if(ENABLE_S3) - target_link_libraries( - huge_ctr_inference - PUBLIC - ${DB_LIB_PATHS}/libaws-cpp-sdk-core.so ${DB_LIB_PATHS}/libaws-cpp-sdk-s3.so # from Hugectr - ) -endif() - -if(ENABLE_GCS) - target_link_libraries(huge_ctr_inference PUBLIC google_cloud_cpp_storage) -endif() - -target_link_libraries(huge_ctr_inference PUBLIC hugectr_core23) - -target_link_libraries(huge_ctr_inference PUBLIC CUDA::cuda_driver ${CUDART_LIB} CUDA::cublas CUDA::curand cudnn nccl) - -target_link_libraries(huge_ctr_inference PUBLIC cudf) - -target_link_libraries(huge_ctr_inference PUBLIC ${CMAKE_THREAD_LIBS_INIT} numa stdc++fs tbb) - -if(Parquet_FOUND) -target_link_libraries(huge_ctr_inference PUBLIC parquet) -endif() - -target_link_libraries(huge_ctr_inference PUBLIC huge_ctr_hps) - -target_compile_features(huge_ctr_inference PUBLIC cxx_std_17) - - diff --git a/HugeCTR/src/inference/embedding_feature_combiner.cu b/HugeCTR/src/inference/embedding_feature_combiner.cu deleted 
file mode 100644 index cc2dec64a8..0000000000 --- a/HugeCTR/src/inference/embedding_feature_combiner.cu +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace { - -template -__global__ void embedding_feature_combine_kernel(const float* input, TypeEmbedding* output, - const int* row_ptrs, int batch_size, int slot_num, - int embedding_vec_size, - EmbeddingFeatureCombiner_t combiner_type) { - const auto& block = cooperative_groups::this_thread_block(); - // each block partition corresponding to one sample - const int bid = block.group_index().x; - // each thread corresponding to one element in the embedding vector - const int tid = block.thread_rank(); - - if (bid < batch_size && tid < embedding_vec_size) { - for (int i = 0; i < slot_num; i++) { - int feature_row_index = bid * slot_num + i; - int row_offset = row_ptrs[feature_row_index]; // row offset within input - int feature_num = - row_ptrs[feature_row_index + 1] - row_offset; // num of feature vectors in one slot - - float tmp = 0.0f; - // reduce in one slot - for (int j = 0; j < feature_num; j++) - tmp += input[(row_offset + j) * embedding_vec_size + tid]; - - if (combiner_type == EmbeddingFeatureCombiner_t::Mean && feature_num > 1) { - tmp /= feature_num; - } - output[feature_row_index * embedding_vec_size + tid] = tmp; - } // end for - } // end if -} - -template <> -__global__ void embedding_feature_combine_kernel(const float* input, __half* output, - const int* row_ptrs, int batch_size, int slot_num, - int embedding_vec_size, - EmbeddingFeatureCombiner_t combiner_type) { - const auto& block = cooperative_groups::this_thread_block(); - // each block partition corresponding to one sample - const int bid = block.group_index().x; - // each thread corresponding to one element in the embedding vector - const int tid = block.thread_rank(); - - if (bid < batch_size && tid < embedding_vec_size) { - for (int i = 0; i < slot_num; i++) { - int feature_row_index = bid * slot_num + i; - int row_offset = row_ptrs[feature_row_index]; // row offset within input - int feature_num = - row_ptrs[feature_row_index + 1] - row_offset; // num of feature vectors in one slot - - float tmp = 0.0f; - // reduce in one slot - for (int j = 0; j < feature_num; j++) - tmp += input[(row_offset + j) * embedding_vec_size + tid]; - - if (combiner_type == EmbeddingFeatureCombiner_t::Mean && feature_num > 1) { - tmp /= feature_num; - } - output[feature_row_index * embedding_vec_size + tid] = __float2half(tmp); - } // end for - } // end if -} - -template -__global__ void embedding_feature_combine_tiled_kernel(const float* input, TypeEmbedding* output, - const int* row_ptrs, int batch_size, - int slot_num, int embedding_vec_size, - EmbeddingFeatureCombiner_t combiner_type) { - const auto& block = cooperative_groups::this_thread_block(); - const auto& tile = cooperative_groups::tiled_partition(block); - 
// each block partition corresponding to one sample - const int bid = block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); - // each thread corresponding to one element in the embedding vector - const int tid = tile.thread_rank(); - - if (bid < batch_size && tid < embedding_vec_size) { - for (int i = 0; i < slot_num; i++) { - int feature_row_index = bid * slot_num + i; - int row_offset = row_ptrs[feature_row_index]; // row offset within input - int feature_num = - row_ptrs[feature_row_index + 1] - row_offset; // num of feature vectors in one slot - - float tmp = 0.0f; - // reduce in one slot - for (int j = 0; j < feature_num; j++) - tmp += input[(row_offset + j) * embedding_vec_size + tid]; - - if (combiner_type == EmbeddingFeatureCombiner_t::Mean && feature_num > 1) { - tmp /= feature_num; - } - output[feature_row_index * embedding_vec_size + tid] = tmp; - } // end for - } // end if -} - -template -__global__ void embedding_feature_combine_tiled_kernel(const float* input, __half* output, - const int* row_ptrs, int batch_size, - int slot_num, int embedding_vec_size, - EmbeddingFeatureCombiner_t combiner_type) { - const auto& block = cooperative_groups::this_thread_block(); - const auto& tile = cooperative_groups::tiled_partition(block); - // each block partition corresponding to one sample - const int bid = block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); - // each thread corresponding to one element in the embedding vector - const int tid = tile.thread_rank(); - - if (bid < batch_size && tid < embedding_vec_size) { - for (int i = 0; i < slot_num; i++) { - int feature_row_index = bid * slot_num + i; - int row_offset = row_ptrs[feature_row_index]; // row offset within input - int feature_num = - row_ptrs[feature_row_index + 1] - row_offset; // num of feature vectors in one slot - - float tmp = 0.0f; - // reduce in one slot - for (int j = 0; j < feature_num; j++) - tmp += input[(row_offset + j) * embedding_vec_size + tid]; - - if (combiner_type == EmbeddingFeatureCombiner_t::Mean && feature_num > 1) { - tmp /= feature_num; - } - output[feature_row_index * embedding_vec_size + tid] = __float2half(tmp); - } // end for - } // end if -} - -template -void launch_embedding_feature_combine_kernel(const float* input, TypeEmbedding* output, - const int* row_ptrs, int batch_size, int slot_num, - int embedding_vec_size, - EmbeddingFeatureCombiner_t combiner_type, - cudaStream_t stream) { - if (embedding_vec_size <= 2) { - embedding_feature_combine_tiled_kernel - <<<(batch_size - 1) / 32 + 1, 64, 0, stream>>>(input, output, row_ptrs, batch_size, - slot_num, embedding_vec_size, combiner_type); - } else if (embedding_vec_size <= 4) { - embedding_feature_combine_tiled_kernel - <<<(batch_size - 1) / 16 + 1, 64, 0, stream>>>(input, output, row_ptrs, batch_size, - slot_num, embedding_vec_size, combiner_type); - } else if (embedding_vec_size <= 8) { - embedding_feature_combine_tiled_kernel - <<<(batch_size - 1) / 8 + 1, 64, 0, stream>>>(input, output, row_ptrs, batch_size, slot_num, - embedding_vec_size, combiner_type); - } else if (embedding_vec_size <= 16) { - embedding_feature_combine_tiled_kernel - <<<(batch_size - 1) / 4 + 1, 64, 0, stream>>>(input, output, row_ptrs, batch_size, slot_num, - embedding_vec_size, combiner_type); - } else if (embedding_vec_size <= 32) { - embedding_feature_combine_tiled_kernel - <<<(batch_size - 1) / 2 + 1, 64, 0, stream>>>(input, output, row_ptrs, batch_size, slot_num, - embedding_vec_size, combiner_type); - } else { - // each 
thread corresponds to one element in an embedding vector - embedding_feature_combine_kernel<<>>( - input, output, row_ptrs, batch_size, slot_num, embedding_vec_size, combiner_type); - } -} - -} // end of namespace - -template -EmbeddingFeatureCombiner::EmbeddingFeatureCombiner( - const std::shared_ptr& in_tensor, - const std::shared_ptr& row_ptrs_tensor, Tensor2& out_tensor, - int batch_size, int slot_num, EmbeddingFeatureCombiner_t combiner_type, - const std::shared_ptr>& blobs_buff, - const std::shared_ptr& gpu_resource) - : Layer(gpu_resource), - slot_num_(slot_num), - batch_size_(batch_size), - combiner_type_(combiner_type) { - try { - // error input checking - const auto& in_shape = in_tensor->shape(); - const auto& row_ptrs_shape = row_ptrs_tensor->shape(); - - if (in_shape.dims() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "The input tensor must be 2D"); - } - for (int64_t i{0}; i < in_shape.dims(); ++i) { - if (in_shape.size(i) == 0) { - HCTR_OWN_THROW(Error_t::WrongInput, "The input dims can not be 0"); - } - } - - if (row_ptrs_shape.dims() != 1) { - HCTR_OWN_THROW(Error_t::WrongInput, "The row pointers tensor must be 1D"); - } - if (row_ptrs_shape.size(0) != batch_size * slot_num + 1) { - HCTR_OWN_THROW(Error_t::WrongInput, - "The dimension of row pointers tensor mismatch number of samples"); - } - - embedding_vec_size_ = in_shape.size(1); - std::vector out_shape{static_cast(batch_size_), static_cast(slot_num_), - static_cast(embedding_vec_size_)}; - blobs_buff->reserve(out_shape, &out_tensor); - out_tensors_.push_back(out_tensor); - in_tensors_.push_back(in_tensor); - row_ptrs_tensors_.push_back(row_ptrs_tensor); - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -template -void EmbeddingFeatureCombiner::fprop(bool is_train) { - if (is_train) { - HCTR_OWN_THROW(Error_t::IllegalCall, - "The fprop() of EmbeddingFeatureCombiner should only be used for inference"); - } - - CudaDeviceContext context(get_device_id()); - float* input = in_tensors_[0]->data(); - int* row_ptrs = row_ptrs_tensors_[0]->data(); - TypeEmbedding* output = out_tensors_[0].get_ptr(); - - launch_embedding_feature_combine_kernel(input, output, row_ptrs, batch_size_, slot_num_, - embedding_vec_size_, combiner_type_, - get_gpu().get_stream()); -} - -template class EmbeddingFeatureCombiner; -template class EmbeddingFeatureCombiner<__half>; - -} // namespace HugeCTR diff --git a/HugeCTR/src/inference/inference_session.cpp b/HugeCTR/src/inference/inference_session.cpp deleted file mode 100644 index 77587b23cb..0000000000 --- a/HugeCTR/src/inference/inference_session.cpp +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -InferenceSessionBase::~InferenceSessionBase() = default; - -std::shared_ptr InferenceSessionBase::create( - const std::string& model_config_path, const InferenceParams& inference_params, - const std::shared_ptr& embedding_cache) { - return std::make_shared(model_config_path, inference_params, embedding_cache); -} - -InferenceSession::InferenceSession(const std::string& model_config_path, - const InferenceParams& inference_params, - const std::shared_ptr& embedding_cache, - std::shared_ptr resource_manager) - : InferenceSessionBase(), - config_(read_json_file(model_config_path)), - embedding_table_slot_size_({0}), - embedding_cache_(embedding_cache), - inference_parser_(config_), - inference_params_(inference_params) { - try { - HCTR_LOG(WARNING, ROOT, - "InferenceModel and InferenceSession will be deprecated in a future release." - "Please see the alternatives based on TensorRT and TensorFlow:\n" - "\thttps://nvidia-merlin.github.io/HugeCTR/main/hierarchical_parameter_server/" - "hps_tf_user_guide.html\n" - "\thttps://nvidia-merlin.github.io/HugeCTR/main/hierarchical_parameter_server/" - "hps_trt_user_guide.html\n"); - - if (inference_params_.use_gpu_embedding_cache && - embedding_cache->get_device_id() != inference_params_.device_id) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "The device id of inference_params is not consistent with that of embedding cache."); - } - resource_manager_ = resource_manager != nullptr - ? resource_manager - : ResourceManagerCore::create({{inference_params.device_id}}, 0); - HCTR_LOG(TRACE, ROOT, "Create inference session on device: %d\n", inference_params_.device_id); - auto b2s = [](const char val) { return val ? "True" : "False"; }; - HCTR_LOG(INFO, ROOT, "Model name: %s\n", inference_params_.model_name.c_str()); - HCTR_LOG(INFO, ROOT, "Use mixed precision: %s\n", b2s(inference_params.use_mixed_precision)); - HCTR_LOG(INFO, ROOT, "Use cuda graph: %s\n", b2s(inference_params.use_cuda_graph)); - HCTR_LOG(INFO, ROOT, "Max batchsize: %lu\n", inference_params.max_batchsize); - HCTR_LOG(INFO, ROOT, "Use I64 input key: %s\n", b2s(inference_params.i64_input_key)); - Network* network_ptr; - inference_parser_.create_pipeline(inference_params_, dense_input_tensorbag_, row_ptrs_tensors_, - embedding_features_tensors_, embedding_table_slot_size_, - &embedding_feature_combiners_, &network_ptr, - inference_tensor_entries_, resource_manager_); - auto dense_network_feedforward = - std::make_shared([=] { network_->predict(); }); - predict_network_pipeline_ = Pipeline( - "default", resource_manager_->get_local_gpu_from_device_id(inference_params.device_id), - {dense_network_feedforward}); - - network_ = std::move(std::unique_ptr(network_ptr)); - network_->initialize(false); - if (inference_params.use_algorithm_search) { - network_->search_algorithm(); - } - if (inference_params_.dense_model_file.size() > 0) { - network_->upload_params_to_device_inference(inference_params_.dense_model_file); - } - if (inference_params_.non_trainable_params_file.size() > 0) { - network_->upload_non_trainable_params_to_device_inference( - inference_params_.non_trainable_params_file); - } - CudaDeviceContext context(inference_params_.device_id); - for (size_t idx = 0; idx < inference_params_.sparse_model_files.size(); ++idx) { - cudaStream_t stream; - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - streams_.push_back(stream); - } - h_row_ptrs_ = (int*)malloc((inference_params_.max_batchsize * 
inference_parser_.slot_num + - inference_parser_.num_embedding_tables) * - sizeof(int)); - // h_keys_ is a void pointer, which serves key types of both long long and unsigned int - h_keys_ = malloc(inference_params_.max_batchsize * - inference_parser_.max_feature_num_per_sample * sizeof(long long)); - - cudaMallocManaged(&d_keys_, inference_params_.max_batchsize * - inference_parser_.max_feature_num_per_sample * - sizeof(long long)); - HCTR_LIB_THROW(cudaMalloc((void**)&d_embedding_vectors_, - inference_params_.max_batchsize * - inference_parser_.max_embedding_vector_size_per_sample * - sizeof(float))); - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } - return; -} - -InferenceSession::~InferenceSession() { - CudaDeviceContext context(inference_params_.device_id); - cudaFree(d_embedding_vectors_); - free(h_keys_); - free(h_row_ptrs_); - cudaFree(d_keys_); - for (auto stream : streams_) cudaStreamDestroy(stream); -} - -void InferenceSession::predict_impl(float* d_dense, void* keys, bool key_on_device, int* d_row_ptrs, - float* d_output, int num_samples, int num_embedding_tables, - bool table_major_key_layout) { - CudaDeviceContext context( - resource_manager_->get_local_gpu_from_device_id(inference_params_.device_id) - ->get_device_id()); - // embedding_cache lookup - size_t acc_vectors_offset{0}; - size_t acc_row_ptrs_offset{0}; - size_t acc_keys_offset{0}; - size_t num_keys{0}; - for (size_t i = 0; i < num_embedding_tables; ++i) { - acc_row_ptrs_offset += num_samples * inference_parser_.slot_num_for_tables[i] + 1; - num_keys = h_row_ptrs_[acc_row_ptrs_offset - 1]; - if (inference_params_.i64_input_key) { - if (key_on_device) { - embedding_cache_->lookup_from_device(i, d_embedding_vectors_ + acc_vectors_offset, - static_cast(keys) + acc_keys_offset, - num_keys, inference_params_.hit_rate_threshold, - streams_[i]); - } else { - embedding_cache_->lookup(i, d_embedding_vectors_ + acc_vectors_offset, - static_cast(keys) + acc_keys_offset, num_keys, - inference_params_.hit_rate_threshold, streams_[i]); - } - } else { - if (key_on_device) { - embedding_cache_->lookup_from_device( - i, d_embedding_vectors_ + acc_vectors_offset, - static_cast(keys) + acc_keys_offset, num_keys, - inference_params_.hit_rate_threshold, streams_[i]); - } else { - embedding_cache_->lookup(i, d_embedding_vectors_ + acc_vectors_offset, - static_cast(keys) + acc_keys_offset, num_keys, - inference_params_.hit_rate_threshold, streams_[i]); - } - } - acc_keys_offset += num_keys; - acc_vectors_offset += inference_params_.max_batchsize * - inference_parser_.max_feature_num_for_tables[i] * - inference_parser_.embed_vec_size_for_tables[i]; - } - for (size_t i = 0; i < num_embedding_tables; ++i) { - HCTR_LIB_THROW(cudaStreamSynchronize(streams_[i])); - } - - // convert dense input to dense tensor - // auto dense_dims = dense_input_tensorbag_.get_dimensions(); - // for (auto dim : dense_dims) { - // dense_size *= dim; - // } - size_t dense_size = dense_input_tensorbag_.num_elements(); - if (inference_params_.use_mixed_precision) { - convert_array_on_device( - dense_input_tensorbag_.data<__half>(), d_dense, dense_size, - resource_manager_->get_local_gpu_from_device_id(inference_params_.device_id)->get_stream()); - - } else { - convert_array_on_device( - dense_input_tensorbag_.data(), d_dense, dense_size, - resource_manager_->get_local_gpu_from_device_id(inference_params_.device_id)->get_stream()); - } - - acc_vectors_offset = 0; - acc_row_ptrs_offset = 0; - for 
(size_t i = 0; i < num_embedding_tables; ++i) { - // bind row ptrs input to row ptrs tensor - (*row_ptrs_tensors_[i]) = - core23::Tensor::bind(d_row_ptrs + acc_row_ptrs_offset, row_ptrs_tensors_[i]->shape(), - row_ptrs_tensors_[i]->data_type(), row_ptrs_tensors_[i]->device()); - acc_row_ptrs_offset += num_samples * inference_parser_.slot_num_for_tables[i] + 1; - - // bind embedding vectors from looking up to embedding features tensor - (*embedding_features_tensors_[i]) = core23::Tensor::bind( - d_embedding_vectors_ + acc_vectors_offset, embedding_features_tensors_[i]->shape(), - embedding_features_tensors_[i]->data_type(), embedding_features_tensors_[i]->device()); - acc_vectors_offset += inference_params_.max_batchsize * - inference_parser_.max_feature_num_for_tables[i] * - inference_parser_.embed_vec_size_for_tables[i]; - // feature combiner feedforward - embedding_feature_combiners_[i]->fprop(false); - } - - // dense network feedforward - - if (inference_params_.use_cuda_graph) { - predict_network_pipeline_.run_graph(); - } else { - predict_network_pipeline_.run(); - } - - // convert the prediction result to output - if (inference_params_.use_mixed_precision) { - convert_array_on_device( - d_output, network_->get_pred_tensor_half().get_ptr(), - network_->get_pred_tensor_half().get_num_elements(), - resource_manager_->get_local_gpu_from_device_id(inference_params_.device_id)->get_stream()); - } else { - convert_array_on_device( - d_output, network_->get_pred_tensor().get_ptr(), - network_->get_pred_tensor().get_num_elements(), - resource_manager_->get_local_gpu_from_device_id(inference_params_.device_id)->get_stream()); - } - HCTR_LIB_THROW(cudaStreamSynchronize( - resource_manager_->get_local_gpu_from_device_id(inference_params_.device_id)->get_stream())); -} - -void InferenceSession::predict(float* d_dense, void* h_embeddingcolumns, int* d_row_ptrs, - float* d_output, int num_samples, bool table_major_key_layout) { - size_t num_embedding_tables = inference_parser_.num_embedding_tables; - if (num_embedding_tables != row_ptrs_tensors_.size() || - num_embedding_tables != embedding_features_tensors_.size() || - num_embedding_tables != embedding_feature_combiners_.size()) { - HCTR_OWN_THROW(Error_t::IllegalCall, "embedding feature combiner inconsistent"); - } - // Copy row_ptrs to host - HCTR_LIB_THROW(cudaMemcpy( - h_row_ptrs_, d_row_ptrs, - (num_samples * inference_parser_.slot_num + inference_parser_.num_embedding_tables) * - sizeof(int), - cudaMemcpyDeviceToHost)); - - // Redistribute keys :from sample first to table first - if (!table_major_key_layout) { - // HCTR_LOG_S(INFO, ROOT) << "Redistribute keys from sample first to table first" << std::endl; - if (inference_params_.i64_input_key) { - distribute_keys_per_table(static_cast(h_keys_), - static_cast(h_embeddingcolumns), h_row_ptrs_, - num_samples, inference_parser_.slot_num_for_tables); - } else { - distribute_keys_per_table(static_cast(h_keys_), - static_cast(h_embeddingcolumns), h_row_ptrs_, - num_samples, inference_parser_.slot_num_for_tables); - } - } - void* h_keys_for_ec = table_major_key_layout ? 
h_embeddingcolumns : h_keys_; - predict_impl(d_dense, h_keys_for_ec, false, d_row_ptrs, d_output, num_samples, - num_embedding_tables, table_major_key_layout); -} - -void InferenceSession::predict_from_device(float* d_dense, void* d_embeddingcolumns, - int* d_row_ptrs, float* d_output, int num_samples, - bool table_major_key_layout) { - size_t num_embedding_tables = inference_parser_.num_embedding_tables; - if (num_embedding_tables != row_ptrs_tensors_.size() || - num_embedding_tables != embedding_features_tensors_.size() || - num_embedding_tables != embedding_feature_combiners_.size()) { - HCTR_OWN_THROW(Error_t::IllegalCall, "embedding feature combiner inconsistent"); - } - CudaDeviceContext context( - resource_manager_->get_local_gpu_from_device_id(inference_params_.device_id) - ->get_device_id()); - - cudaStream_t stream = - resource_manager_->get_local_gpu_from_device_id(inference_params_.device_id)->get_stream(); - - // Copy row_ptrs to host - HCTR_LIB_THROW(cudaMemcpy( - h_row_ptrs_, d_row_ptrs, - (num_samples * inference_parser_.slot_num + inference_parser_.num_embedding_tables) * - sizeof(int), - cudaMemcpyDeviceToHost)); - // Redistribute keys :from sample first to table first - if (!table_major_key_layout) { - // HCTR_LOG_S(INFO, ROOT) << "Redistribute keys from sample first to table first" << std::endl; - if (inference_params_.i64_input_key) { - distribute_keys_per_table_on_device( - static_cast(d_keys_), static_cast(d_embeddingcolumns), d_row_ptrs, - num_samples, inference_parser_.slot_num_for_tables, stream); - } else { - distribute_keys_per_table_on_device( - static_cast(d_keys_), static_cast(d_embeddingcolumns), - d_row_ptrs, num_samples, inference_parser_.slot_num_for_tables, stream); - } - } - - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - void* d_keys_for_ec = table_major_key_layout ? 
d_embeddingcolumns : d_keys_; - predict_impl(d_dense, d_keys_for_ec, true, d_row_ptrs, d_output, num_samples, - num_embedding_tables, table_major_key_layout); -} - -} // namespace HugeCTR diff --git a/HugeCTR/src/inference_benchmark/metrics.cpp b/HugeCTR/src/inference_benchmark/metrics.cpp index 981ef06bb2..f0fd4b1513 100644 --- a/HugeCTR/src/inference_benchmark/metrics.cpp +++ b/HugeCTR/src/inference_benchmark/metrics.cpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/HugeCTR/src/network.cpp b/HugeCTR/src/network.cpp index 0076237127..5efc09b6df 100644 --- a/HugeCTR/src/network.cpp +++ b/HugeCTR/src/network.cpp @@ -177,49 +177,6 @@ void Network::upload_params_to_device(const std::string& model_file) { return; } -void Network::upload_params_to_device_inference(const std::string& model_file) { - auto fs = FileSystemBuilder::build_unique_by_path(model_file); - CudaDeviceContext context(get_device_id()); - - std::unique_ptr params(new char[evaluate_weight_tensor_.get_size_in_bytes()]); - fs->read(model_file, params.get(), evaluate_weight_tensor_.get_size_in_bytes(), 0); - HCTR_LIB_THROW(cudaMemcpyAsync(evaluate_weight_tensor_.get_ptr(), params.get(), - evaluate_weight_tensor_.get_size_in_bytes(), - cudaMemcpyHostToDevice, gpu_resource_->get_stream())); - if (use_mixed_precision_) { - conv_weight_(evaluate_weight_tensor_half_, evaluate_weight_tensor_); - } - return; -} - -void Network::upload_non_trainable_params_to_device_inference(const std::string& model_file) { - HCTR_LOG(INFO, ROOT, "Upload non-trainable parameters from JSON file to inference layers\n"); - const nlohmann::json& params_json(read_json_file(model_file)); - const nlohmann::json& params_for_layers = get_json(params_json, "layers"); - size_t counter = 0; - CudaDeviceContext context(get_device_id()); - for (size_t i{0}; i < evaluate_layers_.size(); ++i) { - auto params_tensors = evaluate_layers_[i]->get_tensors_for_non_trainable_params(); - if (params_tensors.size() > 1) { - const nlohmann::json& params = params_for_layers[counter]; - std::string layer_type = get_value_from_json(params, "type"); - if (layer_type == "BatchNorm") { - std::vector running_mean = get_json(params, "mean"); - std::vector running_variance = get_json(params, "var"); - HCTR_LIB_THROW(cudaMemcpyAsync(params_tensors[0].get_ptr(), running_mean.data(), - params_tensors[0].get_size_in_bytes(), - cudaMemcpyHostToDevice, gpu_resource_->get_stream())); - HCTR_LIB_THROW(cudaMemcpyAsync(params_tensors[1].get_ptr(), running_variance.data(), - params_tensors[1].get_size_in_bytes(), - cudaMemcpyHostToDevice, gpu_resource_->get_stream())); - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "Only BatchNorm layer has non-trainable parameters"); - } - ++counter; - } - } -} - void Network::download_params_to_host(float* weight) { CudaDeviceContext context(get_device_id()); diff --git a/HugeCTR/src/optimizers/sparse_optimizer.cu b/HugeCTR/src/optimizers/sparse_optimizer.cu index 337fda0926..a8d698a294 100644 --- a/HugeCTR/src/optimizers/sparse_optimizer.cu +++ b/HugeCTR/src/optimizers/sparse_optimizer.cu @@ -28,7 +28,6 @@ EmbeddingOptimizer::EmbeddingOptimizer( const std::shared_ptr> &buf) : param(param) { // new optimizer params used by update_params - // should be match with HugeCTR/src/parsers/create_embedding.cpp // should be match with HugeCTR/src/pybind/model.cpp switch (param.opt_params.optimizer) { case Optimizer_t::Ftrl: diff --git a/HugeCTR/src/parsers/create_datareader.cpp b/HugeCTR/src/parsers/create_datareader.cpp 
deleted file mode 100644 index 86d02da7b0..0000000000 --- a/HugeCTR/src/parsers/create_datareader.cpp +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -#ifdef ENABLE_MPI -#include -#endif - -namespace HugeCTR { -// Create data reader for InferenceSession (internal use) -template -void create_datareader::operator()( - const InferenceParams& inference_params, const InferenceParser& inference_parser, - std::shared_ptr& data_reader, - const std::shared_ptr resource_manager, - std::map>& sparse_input_map, - std::map& label_dense_map, const std::string& source, - const DataReaderType_t data_reader_type, const Check_t check_type, - const std::vector& slot_size_array, const bool repeat_dataset, - const long long num_samples, const DataSourceParams& data_source_params) { - // TO DO:support multi-hot - long long slot_sum = 0; - std::vector slot_offset; - for (auto slot_size : slot_size_array) { - slot_offset.push_back(slot_sum); - slot_sum += slot_size; - } - - std::vector data_reader_sparse_param_array; - for (size_t i = 0; i < inference_parser.slot_num_for_tables.size(); i++) { - data_reader_sparse_param_array.emplace_back(inference_parser.sparse_names[i], - inference_parser.max_nnz_for_tables[i], false, - inference_parser.slot_num_for_tables[i]); - } - - for (unsigned int i = 0; i < inference_parser.sparse_names.size(); i++) { - DataReaderSparseParam param = data_reader_sparse_param_array[i]; - std::string sparse_name = inference_parser.sparse_names[i]; - core23_reader::SparseInput sparse_input(param.slot_num, param.max_feature_num); - sparse_input_map.emplace(sparse_name, sparse_input); - } - - core23_reader::DataReader* data_reader_tk = new core23_reader::DataReader( - inference_params.max_batchsize, inference_parser.label_dim, inference_parser.dense_dim, - data_reader_sparse_param_array, resource_manager, true, 1, false, data_source_params); - data_reader.reset(data_reader_tk); - - switch (data_reader_type) { - case DataReaderType_t::Norm: { - bool start_right_now = repeat_dataset; - data_reader->create_drwg_norm(source, check_type, start_right_now); - break; - } - case DataReaderType_t::Raw: { - data_reader->create_drwg_raw(source, num_samples, false, false, true); - break; - } - case DataReaderType_t::Parquet: { -#ifdef DISABLE_CUDF - HCTR_OWN_THROW(Error_t::WrongInput, "Parquet is not supported under DISABLE_CUDF"); -#else - std::shared_ptr parquet_meta = std::make_shared(); - auto get_meta_path = [&](std::string one_parquet_file_path) -> std::string { - std::size_t found = one_parquet_file_path.find_last_of("/\\"); - std::string metadata_path = one_parquet_file_path.substr(0, found); - metadata_path.append("/_metadata.json"); - return metadata_path; - }; - std::string first_file_name, buff; - std::string metadata_path; - std::ifstream read_stream(source, std::ifstream::in); - if (!read_stream.is_open()) { - 
HCTR_OWN_THROW(Error_t::FileCannotOpen, "file list open failed: " + source); - } - std::getline(read_stream, buff); - int num_of_files = std::stoi(buff); - if (num_of_files) { - std::getline(read_stream, first_file_name); - metadata_path = get_meta_path(first_file_name); - } - parquet_meta->reset_metadata(metadata_path); - auto parquet_eval_max_row_group_size = parquet_meta->get_max_row_group(); - auto parquet_label_cols = parquet_meta->get_label_names().size(); - auto parquet_dense_cols = parquet_meta->get_cont_names().size(); - read_stream.close(); - HCTR_LOG(INFO, WORLD, "parquet_eval_max_row_group_size %lld\n", - parquet_eval_max_row_group_size); - data_reader->create_drwg_parquet(source, false, slot_offset, true, - parquet_eval_max_row_group_size, - parquet_dense_cols + parquet_label_cols, - inference_parser.dense_dim + inference_parser.label_dim); -#endif - break; - } - default: { - assert(!"Error: no such option && should never get here!"); - } - } - - label_dense_map.emplace(inference_parser.label_name, data_reader_tk->get_label_tensor23s()[0]); - label_dense_map.emplace(inference_parser.dense_name, data_reader_tk->get_dense_tensor23s()[0]); - - for (unsigned int i = 0; i < inference_parser.sparse_names.size(); i++) { - const std::string& sparse_name = inference_parser.sparse_names[i]; - const auto& sparse_input = sparse_input_map.find(sparse_name); - - sparse_input->second.evaluate_sparse_tensors = - data_reader_tk->get_sparse_tensor23s(sparse_name); - } -} - -// Create data reader for InferenceModel (multi-GPU offline inference use) -template -void create_datareader::operator()( - const InferenceParams& inference_params, const InferenceParser& inference_parser, - std::shared_ptr& data_reader, - const std::shared_ptr resource_manager, - std::map>& sparse_input_map, - std::vector& label_tensor_list, std::vector& dense_tensor_list, - const std::string& source, const DataReaderType_t data_reader_type, const Check_t check_type, - const std::vector& slot_size_array, const bool repeat_dataset, - const DataSourceParams& data_source_params, bool read_file_seq) { - HCTR_CHECK_HINT(label_tensor_list.size() == 0, - "label tensor list should be empty before creating data reader"); - HCTR_CHECK_HINT(dense_tensor_list.size() == 0, - "dense tensor list should be empty before creating data reader"); - HCTR_CHECK_HINT(repeat_dataset, "repeat dataset should be true for inference"); - HCTR_LOG_S(INFO, ROOT) << "Create inference data reader on " - << resource_manager->get_local_gpu_count() << " GPU(s)" << std::endl; - long long slot_sum = 0; - std::vector slot_offset; - for (auto slot_size : slot_size_array) { - slot_offset.push_back(slot_sum); - slot_sum += slot_size; - } - - std::vector data_reader_sparse_param_array; - for (size_t i = 0; i < inference_parser.slot_num_for_tables.size(); i++) { - data_reader_sparse_param_array.emplace_back(inference_parser.sparse_names[i], - inference_parser.max_nnz_for_tables[i], false, - inference_parser.slot_num_for_tables[i]); - } - - for (unsigned int i = 0; i < inference_parser.sparse_names.size(); i++) { - DataReaderSparseParam param = data_reader_sparse_param_array[i]; - std::string sparse_name = inference_parser.sparse_names[i]; - core23_reader::SparseInput sparse_input(param.slot_num, param.max_feature_num); - sparse_input_map.emplace(sparse_name, sparse_input); - } - - // For Norm, there should be only one worker to ensure the correct prediction order - const int num_workers = - data_reader_type == DataReaderType_t::Parquet ? 
resource_manager->get_local_gpu_count() : 1; - HCTR_LOG_S(INFO, ROOT) << "num of DataReader workers: " << num_workers << std::endl; - - core23_reader::DataReader* data_reader_tk = new core23_reader::DataReader( - inference_params.max_batchsize, inference_parser.label_dim, inference_parser.dense_dim, - data_reader_sparse_param_array, resource_manager, repeat_dataset, num_workers, false, - data_source_params); // use_mixed_precision = false - data_reader.reset(data_reader_tk); - - switch (data_reader_type) { - case DataReaderType_t::Norm: { - bool start_right_now = repeat_dataset; - data_reader->create_drwg_norm(source, check_type, start_right_now); - break; - } - case DataReaderType_t::Parquet: { -#ifdef DISABLE_CUDF - HCTR_OWN_THROW(Error_t::WrongInput, "Parquet is not supported under DISABLE_CUDF"); -#else - std::shared_ptr parquet_meta = std::make_shared(); - auto get_meta_path = [&](std::string one_parquet_file_path) -> std::string { - std::size_t found = one_parquet_file_path.find_last_of("/\\"); - std::string metadata_path = one_parquet_file_path.substr(0, found); - metadata_path.append("/_metadata.json"); - return metadata_path; - }; - std::string first_file_name, buff; - std::string metadata_path; - std::ifstream read_stream(source, std::ifstream::in); - if (!read_stream.is_open()) { - HCTR_OWN_THROW(Error_t::FileCannotOpen, "file list open failed: " + source); - } - std::getline(read_stream, buff); - int num_of_files = std::stoi(buff); - if (num_of_files) { - std::getline(read_stream, first_file_name); - metadata_path = get_meta_path(first_file_name); - } - parquet_meta->reset_metadata(metadata_path); - auto parquet_eval_max_row_group_size = parquet_meta->get_max_row_group(); - auto parquet_label_cols = parquet_meta->get_label_names().size(); - auto parquet_dense_cols = parquet_meta->get_cont_names().size(); - read_stream.close(); - HCTR_LOG(INFO, WORLD, "parquet_eval_max_row_group_size %lld\n", - parquet_eval_max_row_group_size); - data_reader->create_drwg_parquet(source, read_file_seq, slot_offset, true, - parquet_eval_max_row_group_size, - parquet_dense_cols + parquet_label_cols, - inference_parser.dense_dim + inference_parser.label_dim); -#endif - break; - } - default: { - assert(!"Error: no such option && should never get here!"); - } - } - - for (size_t i = 0; i < resource_manager->get_local_gpu_count(); i++) { - label_tensor_list.push_back(data_reader_tk->get_label_tensor23s()[i]); - dense_tensor_list.push_back(data_reader_tk->get_dense_tensor23s()[i]); - } - - for (unsigned int i = 0; i < inference_parser.sparse_names.size(); i++) { - const std::string& sparse_name = inference_parser.sparse_names[i]; - const auto& sparse_input = sparse_input_map.find(sparse_name); - sparse_input->second.evaluate_sparse_tensors = - data_reader_tk->get_sparse_tensor23s(sparse_name); - } -} - -template struct create_datareader; -template struct create_datareader; - -} // namespace HugeCTR diff --git a/HugeCTR/src/parsers/create_network.cpp b/HugeCTR/src/parsers/create_network.cpp deleted file mode 100644 index e3835e1e63..0000000000 --- a/HugeCTR/src/parsers/create_network.cpp +++ /dev/null @@ -1,1696 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
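As the comment above notes, the worker count depends on the reader type: Norm keeps a single DataReader worker so predictions come back in file order, while Parquet fans out to one worker per local GPU. Condensed sketch of that decision:

    #include <cstdio>

    enum class DataReaderType_t { Norm, Raw, Parquet };

    int num_data_reader_workers(DataReaderType_t type, int local_gpu_count) {
      // Norm must stay single-worker to preserve prediction order;
      // Parquet can scale out across the local GPUs.
      return type == DataReaderType_t::Parquet ? local_gpu_count : 1;
    }

    int main() {
      std::printf("%d\n", num_data_reader_workers(DataReaderType_t::Parquet, 4));  // 4
      std::printf("%d\n", num_data_reader_workers(DataReaderType_t::Norm, 4));     // 1
    }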
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef ENABLE_MPI -#include -#endif - -namespace HugeCTR { -struct InputOutputInfo { - std::vector inputs; - std::vector output_names; -}; - -static bool get_tensor_from_entries(const std::vector tensor_entries, - const std::string& name, TensorBag2* bag) { - for (const TensorEntry& entry : tensor_entries) { - if (entry.name == name) { - *bag = entry.bag; - return true; - } - } - return false; -} - -static InputOutputInfo get_input_tensor_and_output_name( - const nlohmann::json& json, const std::vector& tensor_entries) { - auto bottom = get_json(json, "bottom"); - auto top = get_json(json, "top"); - - std::vector bottom_names = get_layer_names(bottom); - std::vector top_names = get_layer_names(top); - - std::vector bottom_bags; - - for (auto& bottom_name : bottom_names) { - for (auto& top_name : top_names) { - if (bottom_name == top_name) { - HCTR_OWN_THROW(Error_t::WrongInput, "bottom and top include a same layer name"); - } - } - TensorBag2 bag; - if (!get_tensor_from_entries(tensor_entries, bottom_name, &bag)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such bottom: " + bottom_name); - } - bottom_bags.push_back(bag); - } - return {bottom_bags, top_names}; -} - -template -static std::shared_ptr> create_regularizer( - const nlohmann::json& j, const Tensor2& weight_buff, const Tensor2& wgrad_buff, - const int batch_size, const std::shared_ptr& gpu_resource) { - std::shared_ptr> reg( - new NoRegularizer(weight_buff, wgrad_buff, batch_size, gpu_resource)); - auto reg_it = j.find("regularizer"); - if (reg_it != j.end()) { - Regularizer_t reg_type = Regularizer_t::None; - auto reg_name = reg_it->get(); - if (!find_item_in_map(reg_type, reg_name, REGULARIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such regularizer: " + reg_name); - } - switch (reg_type) { - case Regularizer_t::L1: { - const auto lambda = get_value_from_json(j, "lambda"); - reg.reset(new L1Regularizer(weight_buff, wgrad_buff, batch_size, lambda, gpu_resource)); - break; - } - case Regularizer_t::L2: { - const auto lambda = get_value_from_json(j, "lambda"); - reg.reset(new L2Regularizer(weight_buff, wgrad_buff, batch_size, lambda, gpu_resource)); - break; - } - default: { - assert(!"Error: no such regularizer!"); - } - } - } - return reg; -} - -void create_layers(const nlohmann::json& j_array, std::vector& tensor_entries, - const std::shared_ptr>& blobs_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& weight_buff_half, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& wgrad_buff_half, - std::map>& loss_tensors, - const std::shared_ptr& gpu_resource, bool use_mixed_precision, - bool enable_tf32_compute, int 
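Further down, create_regularizer is a small factory: it returns a no-op NoRegularizer unless the layer's JSON carries a "regularizer" key, in which case an L1 or L2 regularizer with the given lambda replaces it. A dependency-free sketch of the same pattern (the classes here are stand-ins, not the HugeCTR types):

    #include <cmath>
    #include <iostream>
    #include <memory>
    #include <string>

    struct Regularizer {
      virtual float term(float weight_norm) const { return 0.0f; }  // no-op default
      virtual ~Regularizer() = default;
    };
    struct L1Regularizer : Regularizer {
      float lambda;
      explicit L1Regularizer(float l) : lambda(l) {}
      float term(float w) const override { return lambda * std::fabs(w); }
    };
    struct L2Regularizer : Regularizer {
      float lambda;
      explicit L2Regularizer(float l) : lambda(l) {}
      float term(float w) const override { return 0.5f * lambda * w * w; }
    };

    std::unique_ptr<Regularizer> create_regularizer(const std::string& reg_name, float lambda) {
      std::unique_ptr<Regularizer> reg(new Regularizer());  // default: no regularization
      if (reg_name == "L1") reg.reset(new L1Regularizer(lambda));
      else if (reg_name == "L2") reg.reset(new L2Regularizer(lambda));
      return reg;
    }

    int main() { std::cout << create_regularizer("L2", 1e-4f)->term(3.0f) << '\n'; }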
num_networks_in_global, float scaler, - bool inference_flag, std::vector>& layers, - std::map>& losses, - metrics::MultiLossMetricMap* raw_metrics, - std::vector* top_layers = nullptr, - std::vector* bottom_layers = nullptr) { - std::vector multi_task_output_tensor_entries; - - bool skip_dgrad = true; - bool is_bottom_mlp = true; - - auto emplaceback_layer = [&is_bottom_mlp, &layers, &bottom_layers, &top_layers](Layer* layer) { - if (is_bottom_mlp) { - if (bottom_layers) { - bottom_layers->emplace_back(layer); - } - } else { - if (top_layers) { - top_layers->emplace_back(layer); - } - } - layers.emplace_back(layer); - }; - - for (unsigned int i = 1; i < j_array.size(); i++) { - const nlohmann::json& j = j_array[i]; - const auto layer_type_name = get_value_from_json(j, "type"); - Layer_t layer_type; - - const auto& layer_map = use_mixed_precision ? LAYER_TYPE_MAP_MP : LAYER_TYPE_MAP; - - if (!find_item_in_map(layer_type, layer_type_name, layer_map)) { - Embedding_t embedding_type; - if (!find_item_in_map(embedding_type, layer_type_name, EMBEDDING_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such layer: " + layer_type_name); - } - continue; - } - - // TODO: to make it generalized, we should not assume that the bottom name - // includes "embedding". We need a better way to analyze such dependencies. - auto bottom = get_json(j, "bottom"); - std::vector bottom_strs = get_layer_names(bottom); - for (const std::string& str : bottom_strs) { - if (str.find("embedding") != std::string::npos) { - is_bottom_mlp = false; - } - } - - std::vector output_tensor_entries; - auto input_output_info = get_input_tensor_and_output_name(j, tensor_entries); - switch (layer_type) { - case Layer_t::BatchNorm: { - // get BN params - auto j_bn_hparam = get_json(j, "bn_param"); - auto factor = get_value_from_json(j_bn_hparam, "factor"); - auto eps = get_value_from_json(j_bn_hparam, "eps"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_bn_hparam, "gamma_init")) { - const auto gamma_init_name = get_value_from_json(j_bn_hparam, "gamma_init"); - Initializer_t gamma_init_type; - if (!find_item_in_map(gamma_init_type, gamma_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + gamma_init_name); - } else { - initializer_types[0] = gamma_init_type; - } - } - if (has_key_(j_bn_hparam, "beta_init")) { - const auto beta_init_name = get_value_from_json(j_bn_hparam, "beta_init"); - Initializer_t beta_init_type; - if (!find_item_in_map(beta_init_type, beta_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + beta_init_name); - } else { - initializer_types[1] = beta_init_type; - } - } - - if (use_mixed_precision) { - Tensor2<__half> bn_in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - // establish out tensor - Tensor2<__half> bn_out_tensor; - blobs_buff->reserve(bn_in_tensor.get_dimensions(), &bn_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], bn_out_tensor.shrink()}); - - BatchNormLayer<__half>::Params params = {factor, eps}; - emplaceback_layer(new BatchNormLayer<__half>(weight_buff, weight_buff, wgrad_buff, - blobs_buff, bn_in_tensor, bn_out_tensor, - params, gpu_resource, initializer_types)); - } else { - Tensor2 bn_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - // establish out tensor - Tensor2 bn_out_tensor; - blobs_buff->reserve(bn_in_tensor.get_dimensions(), &bn_out_tensor); - 
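The emplaceback_layer lambda above routes each new layer into the bottom-MLP or top-MLP list based on is_bottom_mlp, which flips once any bottom tensor name contains "embedding" (the heuristic the TODO comment wants to generalize). Standalone sketch with illustrative tensor names:

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      bool is_bottom_mlp = true;
      // hypothetical bottom-tensor names for four successive layers
      const std::vector<std::vector<std::string>> layer_bottoms{
          {"dense"}, {"fc1"}, {"fc2", "sparse_embedding1"}, {"interaction1"}};
      for (const auto& bottoms : layer_bottoms) {
        for (const std::string& str : bottoms) {
          if (str.find("embedding") != std::string::npos) is_bottom_mlp = false;
        }
        std::cout << (is_bottom_mlp ? "bottom" : "top") << " MLP layer\n";
        // prints: bottom, bottom, top, top
      }
    }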
output_tensor_entries.push_back( - {input_output_info.output_names[0], bn_out_tensor.shrink()}); - - BatchNormLayer::Params params = {factor, eps}; - emplaceback_layer(new BatchNormLayer(weight_buff, weight_buff, wgrad_buff, - blobs_buff, bn_in_tensor, bn_out_tensor, - params, gpu_resource, initializer_types)); - } - - break; - } - case Layer_t::LayerNorm: { - // get LN params - auto j_ln_hparam = get_json(j, "ln_param"); - auto eps = get_value_from_json(j_ln_hparam, "eps"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_ln_hparam, "gamma_init")) { - const auto gamma_init_name = get_value_from_json(j_ln_hparam, "gamma_init"); - Initializer_t gamma_init_type; - if (!find_item_in_map(gamma_init_type, gamma_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + gamma_init_name); - } else { - initializer_types[0] = gamma_init_type; - } - } - if (has_key_(j_ln_hparam, "beta_init")) { - const auto beta_init_name = get_value_from_json(j_ln_hparam, "beta_init"); - Initializer_t beta_init_type; - if (!find_item_in_map(beta_init_type, beta_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + beta_init_name); - } else { - initializer_types[1] = beta_init_type; - } - } - - if (use_mixed_precision) { - Tensor2<__half> ln_in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - // establish out tensor - Tensor2<__half> ln_out_tensor; - blobs_buff->reserve(ln_in_tensor.get_dimensions(), &ln_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], ln_out_tensor.shrink()}); - - LayerNormLayer<__half>::Params params = {eps}; - emplaceback_layer(new LayerNormLayer<__half>( - weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, ln_in_tensor, - ln_out_tensor, params, gpu_resource, initializer_types)); - } else { - Tensor2 ln_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - // establish out tensor - Tensor2 ln_out_tensor; - blobs_buff->reserve(ln_in_tensor.get_dimensions(), &ln_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], ln_out_tensor.shrink()}); - - LayerNormLayer::Params params = {eps}; - emplaceback_layer(new LayerNormLayer(weight_buff, weight_buff, wgrad_buff, - blobs_buff, ln_in_tensor, ln_out_tensor, - params, gpu_resource, initializer_types)); - } - - break; - } - case Layer_t::BinaryCrossEntropyLoss: { - if (input_output_info.inputs.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "bottom of BinaryCrossEntropyLoss must be two dim"); - } - if (inference_flag) { - HCTR_LOG( - INFO, ROOT, - "Inference stage skip BinaryCrossEntropyLoss layer, replaced by Sigmoid layer\n"); - if (use_mixed_precision) { - Tensor2<__half> sigmoid_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> sigmoid_out_tensor; - blobs_buff->reserve(sigmoid_in_tensor.get_dimensions(), &sigmoid_out_tensor); - emplaceback_layer( - new SigmoidLayer<__half>(sigmoid_in_tensor, sigmoid_out_tensor, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - multi_task_output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - } else { - // establish out tensor - Tensor2 sigmoid_in_tensor = - Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 sigmoid_out_tensor; - blobs_buff->reserve(sigmoid_in_tensor.get_dimensions(), 
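The gamma_init / beta_init handling above relies on find_item_in_map to resolve a config string into an enum, throwing on unknown names. A sketch of that helper's apparent contract (the enum values here are stand-ins):

    #include <iostream>
    #include <map>
    #include <string>

    enum class Initializer_t { Default, XavierNorm, Zero };

    template <typename T>
    bool find_item_in_map(T& item, const std::string& name, const std::map<std::string, T>& m) {
      auto it = m.find(name);
      if (it == m.end()) return false;  // caller throws "No such initializer"
      item = it->second;
      return true;
    }

    int main() {
      const std::map<std::string, Initializer_t> INITIALIZER_TYPE_MAP{
          {"XavierNorm", Initializer_t::XavierNorm}, {"Zero", Initializer_t::Zero}};
      Initializer_t gamma_init_type;
      std::cout << (find_item_in_map(gamma_init_type, "Zero", INITIALIZER_TYPE_MAP)
                        ? "resolved\n" : "No such initializer\n");
    }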
&sigmoid_out_tensor); - emplaceback_layer( - new SigmoidLayer(sigmoid_in_tensor, sigmoid_out_tensor, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - multi_task_output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - } - break; - } - Tensor2 label_tensor = Tensor2::stretch_from(input_output_info.inputs[1]); - - // create new loss tensor - auto name = input_output_info.output_names[0]; - Tensor2 new_loss_tensor; - blobs_buff->reserve({1, 1}, &new_loss_tensor); - - // create new loss item - std::unique_ptr new_loss; - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - - new_loss.reset(new BinaryCrossEntropyLoss<__half>( - label_tensor, in_tensor, new_loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff_half->as_tensor(), - in_tensor.get_dimensions()[0], gpu_resource), - gpu_resource, num_networks_in_global, scaler)); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - - new_loss.reset(new BinaryCrossEntropyLoss( - label_tensor, in_tensor, new_loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff->as_tensor(), - in_tensor.get_dimensions()[0], gpu_resource), - gpu_resource, num_networks_in_global, scaler)); - } - loss_tensors.insert(std::pair(name, new_loss_tensor)); - losses.insert(std::pair(name, std::move(new_loss))); - break; - } - case Layer_t::Concat: { - auto axis_it = j.find("axis"); - auto axis = (axis_it != j.end()) ? axis_it->get() : 1; - if (use_mixed_precision) { - Tensors2<__half> in_tensors; - for (const TensorBag2& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2<__half>::stretch_from(bag)); - } - Tensor2<__half> out_tensor; - if (in_tensors[0].get_dimensions().size() == 2) { - emplaceback_layer( - new ConcatLayer<__half>(in_tensors, out_tensor, blobs_buff, gpu_resource)); - } - if (in_tensors[0].get_dimensions().size() == 3) { - emplaceback_layer( - new Concat3DLayer<__half>(in_tensors, out_tensor, blobs_buff, axis, gpu_resource)); - } - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensors2 in_tensors; - for (const TensorBag2& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2::stretch_from(bag)); - } - Tensor2 out_tensor; - if (in_tensors[0].get_dimensions().size() == 2) { - emplaceback_layer( - new ConcatLayer(in_tensors, out_tensor, blobs_buff, gpu_resource)); - } - if (in_tensors[0].get_dimensions().size() == 3) { - emplaceback_layer( - new Concat3DLayer(in_tensors, out_tensor, blobs_buff, axis, gpu_resource)); - } - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } - break; - } - case Layer_t::CrossEntropyLoss: { - if (input_output_info.inputs.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "bottom of CrossEntropyLoss must be two dim"); - } - if (inference_flag) { - HCTR_LOG(INFO, ROOT, - "Inference stage skip CrossEntropyLoss layer, replaced by Softmax layer\n"); - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - emplaceback_layer( - new SoftmaxLayer<__half>(in_tensor, out_tensor, blobs_buff, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - 
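The swap above works because the training-time BinaryCrossEntropyLoss consumes raw logits, so at inference only a sigmoid head is needed to turn the final layer's output into probabilities, which is consistent with the log message. Scalar sketch of that activation with illustrative logits:

    #include <cmath>
    #include <cstdio>

    float sigmoid(float logit) { return 1.0f / (1.0f + std::exp(-logit)); }

    int main() {
      // illustrative logits from the final InnerProduct layer
      for (float logit : {-2.0f, 0.0f, 2.0f}) std::printf("%g -> %f\n", logit, sigmoid(logit));
    }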
multi_task_output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - emplaceback_layer( - new SoftmaxLayer(in_tensor, out_tensor, blobs_buff, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - multi_task_output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } - break; - } - Tensor2 label_tensor = Tensor2::stretch_from(input_output_info.inputs[1]); - // create new loss tensor - auto name = input_output_info.output_names[0]; - Tensor2 new_loss_tensor; - blobs_buff->reserve({1, 1}, &new_loss_tensor); - - // create new loss item - std::unique_ptr new_loss; - - if (use_mixed_precision) { - Tensor2<__half> cross_entropy_loss_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - - new_loss.reset(new CrossEntropyLoss<__half>( - label_tensor, cross_entropy_loss_in_tensor, new_loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff_half->as_tensor(), - cross_entropy_loss_in_tensor.get_dimensions()[0], gpu_resource), - gpu_resource, num_networks_in_global, scaler)); - } else { - Tensor2 cross_entropy_loss_in_tensor = - Tensor2::stretch_from(input_output_info.inputs[0]); - - new_loss.reset(new CrossEntropyLoss( - label_tensor, cross_entropy_loss_in_tensor, new_loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff->as_tensor(), - cross_entropy_loss_in_tensor.get_dimensions()[0], gpu_resource), - gpu_resource, num_networks_in_global, scaler)); - } - loss_tensors.insert(std::pair(name, new_loss_tensor)); - losses.insert(std::pair(name, std::move(new_loss))); - break; - } - case Layer_t::Dropout: { - if (use_mixed_precision) { - Tensor2<__half> do_in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - // establish out tensor - Tensor2<__half> do_out_tensor; - blobs_buff->reserve(do_in_tensor.get_dimensions(), &do_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], do_out_tensor.shrink()}); - // get ELU params - auto rate_it = j.find("rate"); - auto rate = (rate_it != j.end()) ? rate_it->get() : 0.5f; - emplaceback_layer(new DropoutLayer<__half>(do_in_tensor, do_out_tensor, blobs_buff, rate, - gpu_resource)); - } else { - // establish out tensor - Tensor2 do_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 do_out_tensor; - blobs_buff->reserve(do_in_tensor.get_dimensions(), &do_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], do_out_tensor.shrink()}); - // get ELU params - auto rate_it = j.find("rate"); - auto rate = (rate_it != j.end()) ? 
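The rate lookup in the Dropout case above is the recurring optional-key pattern in these parsers: probe the JSON with find and fall back to a default when the key is absent. Sketch using nlohmann::json, the library these files parse configs with:

    #include <iostream>
    #include <nlohmann/json.hpp>

    int main() {
      nlohmann::json j = nlohmann::json::parse(R"({"type": "Dropout"})");
      auto rate_it = j.find("rate");
      float rate = (rate_it != j.end()) ? rate_it->get<float>() : 0.5f;  // default 0.5
      std::cout << rate << '\n';  // prints 0.5, since "rate" is absent
    }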
rate_it->get() : 0.5f; - emplaceback_layer( - new DropoutLayer(do_in_tensor, do_out_tensor, blobs_buff, rate, gpu_resource)); - } - - break; - } - case Layer_t::SequenceMask: { - if (use_mixed_precision) { - Tensors2<__half> smask_in_tensors; - Tensor2<__half> smask_from_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - smask_in_tensors.push_back(smask_from_tensor); - Tensor2<__half> smask_to_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[1]); - smask_in_tensors.push_back(smask_to_tensor); - Tensor2<__half> smask_out_tensor; - auto max_sequence_len_from = get_json(j, "max_sequence_len_from"); - auto max_sequence_len_to = get_json(j, "max_sequence_len_to"); - blobs_buff->reserve({smask_from_tensor.get_dimensions()[0], 1, max_sequence_len_from, - max_sequence_len_to}, - &smask_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], smask_out_tensor.shrink()}); - emplaceback_layer(new SequenceMaskLayer<__half>( - smask_in_tensors, smask_out_tensor, max_sequence_len_from, max_sequence_len_to, - blobs_buff, gpu_resource)); - } else { - Tensors2 smask_in_tensors; - Tensor2 smask_from_tensor = - Tensor2::stretch_from(input_output_info.inputs[0]); - smask_in_tensors.push_back(smask_from_tensor); - Tensor2 smask_to_tensor = - Tensor2::stretch_from(input_output_info.inputs[1]); - smask_in_tensors.push_back(smask_to_tensor); - Tensor2 smask_out_tensor; - auto max_sequence_len_from = get_json(j, "max_sequence_len_from"); - auto max_sequence_len_to = get_json(j, "max_sequence_len_to"); - blobs_buff->reserve({smask_from_tensor.get_dimensions()[0], 1, max_sequence_len_from, - max_sequence_len_to}, - &smask_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], smask_out_tensor.shrink()}); - emplaceback_layer(new SequenceMaskLayer(smask_in_tensors, smask_out_tensor, - max_sequence_len_from, max_sequence_len_to, - blobs_buff, gpu_resource)); - } - break; - } - case Layer_t::ELU: { - if (use_mixed_precision) { - Tensor2<__half> elu_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - - // establish out tensor - Tensor2<__half> elu_out_tensor; - blobs_buff->reserve(elu_in_tensor.get_dimensions(), &elu_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], elu_out_tensor.shrink()}); - // get ELU params - auto j_elu_hparam = get_json(j, "elu_param"); - auto alpha = get_value_from_json(j_elu_hparam, "alpha"); - emplaceback_layer( - new EluLayer<__half>(elu_in_tensor, elu_out_tensor, alpha, gpu_resource)); - - } else { - Tensor2 elu_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - - // establish out tensor - Tensor2 elu_out_tensor; - blobs_buff->reserve(elu_in_tensor.get_dimensions(), &elu_out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], elu_out_tensor.shrink()}); - // get ELU params - auto j_elu_hparam = get_json(j, "elu_param"); - auto alpha = get_value_from_json(j_elu_hparam, "alpha"); - emplaceback_layer( - new EluLayer(elu_in_tensor, elu_out_tensor, alpha, gpu_resource)); - } - break; - } - - case Layer_t::FusedInnerProduct: { - auto j_fc_param = get_json(j, "fc_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_fc_param, "weight_init")) { - const auto weight_init_name = get_value_from_json(j_fc_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - 
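Scalar sketch of the ELU activation configured below via elu_param.alpha: identity for positive inputs, alpha * (exp(x) - 1) otherwise.

    #include <cmath>
    #include <cstdio>

    float elu(float x, float alpha) { return x > 0.0f ? x : alpha * std::expm1(x); }

    int main() {
      const float alpha = 1.0f;  // illustrative value for elu_param.alpha
      for (float x : {-2.0f, 0.0f, 1.5f}) std::printf("elu(%g) = %g\n", x, elu(x, alpha));
    }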
HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_fc_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_fc_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - - // check the position of this layer - FcPosition_t pos_type = FcPosition_t::None; - int input_size = input_output_info.inputs.size(); - int output_size = input_output_info.output_names.size(); - if (has_key_(j, "position")) { - auto pos_str = get_value_from_json(j, "position"); - if (!find_item_in_map(pos_type, pos_str, FCPOSITION_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such position: " + pos_str); - } else if (pos_type == FcPosition_t::Head && input_size == 1 && output_size == 4) { - } else if (pos_type == FcPosition_t::Body && input_size == 4 && output_size == 4) { - } else if (pos_type == FcPosition_t::Tail && input_size == 4 && output_size == 1) { - } else if (pos_type == FcPosition_t::Isolated && input_size == 1 && output_size == 1) { - } else { - HCTR_OWN_THROW( - Error_t::WrongInput, - "The position and dimension of bottom and top layer aren't compatible: " + - layer_type_name); - } - } - - // check the activation function of this layer - Activation_t act_type = Activation_t::Relu; - if (has_key_(j, "activation")) { - auto act_name = get_value_from_json(j, "activation"); - if (!find_item_in_map(act_type, act_name, ACTIVATION_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such activation: " + act_name); - } - if (act_type == Activation_t::None && pos_type != FcPosition_t::Tail) - HCTR_OWN_THROW(Error_t::WrongInput, - "The layer without activation function must be the last layer in MLP."); - } - - // establish out tensor - auto output = get_value_from_json(j_fc_param, "num_output"); - if (use_mixed_precision) { - Tensor2<__half> train_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> mask_in_tensor, dRelu_in_tensor, db_in_tensor; - if (pos_type == FcPosition_t::Body || pos_type == FcPosition_t::Tail) { - mask_in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[1]); - dRelu_in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[2]); - db_in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[3]); - } - Tensor2<__half> train_out_tensor, mask_out_tensor, dRelu_out_tensor, db_out_tensor; - blobs_buff->reserve({(train_in_tensor.get_dimensions())[0], output}, &train_out_tensor); - blobs_buff->reserve({(train_in_tensor.get_dimensions())[0], output}, &mask_out_tensor); - blobs_buff->reserve({(train_in_tensor.get_dimensions())[0], output}, &dRelu_out_tensor); - - // establish layer - if (pos_type == FcPosition_t::None) { - emplaceback_layer(new FusedFullyConnectedLayer( - weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, train_in_tensor, - train_out_tensor, gpu_resource, initializer_types)); - } else { - emplaceback_layer(new FusedReluBiasFullyConnectedLayer( - weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, train_in_tensor, - mask_in_tensor, dRelu_in_tensor, db_in_tensor, train_out_tensor, mask_out_tensor, - dRelu_out_tensor, db_out_tensor, gpu_resource, pos_type, act_type, skip_dgrad, - initializer_types)); - } - - if (pos_type == FcPosition_t::Tail || 
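The position validation above encodes a fixed arity per fused-MLP position: Head takes 1 input and produces 4 outputs (output plus the mask, dRelu, and db tensors handed to the next layer), Body is 4-in/4-out, Tail 4-in/1-out, and Isolated 1-in/1-out. Condensed sketch of the check:

    #include <stdexcept>

    enum class FcPosition_t { None, Head, Body, Tail, Isolated };

    void check_position(FcPosition_t pos, int input_size, int output_size) {
      if ((pos == FcPosition_t::Head && input_size == 1 && output_size == 4) ||
          (pos == FcPosition_t::Body && input_size == 4 && output_size == 4) ||
          (pos == FcPosition_t::Tail && input_size == 4 && output_size == 1) ||
          (pos == FcPosition_t::Isolated && input_size == 1 && output_size == 1))
        return;
      throw std::invalid_argument("position and bottom/top dimensions aren't compatible");
    }

    int main() { check_position(FcPosition_t::Head, 1, 4); }  // Body(4,4), Tail(4,1) also pass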
pos_type == FcPosition_t::Isolated || - pos_type == FcPosition_t::None) - output_tensor_entries.push_back( - {input_output_info.output_names[0], train_out_tensor.shrink()}); - else { - output_tensor_entries.push_back( - {input_output_info.output_names[0], train_out_tensor.shrink()}); - output_tensor_entries.push_back( - {input_output_info.output_names[1], mask_out_tensor.shrink()}); - output_tensor_entries.push_back( - {input_output_info.output_names[2], dRelu_out_tensor.shrink()}); - output_tensor_entries.push_back( - {input_output_info.output_names[3], db_out_tensor.shrink()}); - } - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "FusedInnerProduct support half only"); - } - break; - } - - case Layer_t::MLP: { - auto j_mlp_param = get_json(j, "mlp_param"); - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_mlp_param, "weight_init")) { - const auto weight_init_name = - get_value_from_json(j_mlp_param, "weight_init"); - Initializer_t weight_init_type; - if (find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - initializer_types[0] = weight_init_type; - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } - } - if (has_key_(j_mlp_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_mlp_param, "bias_init"); - Initializer_t bias_init_type; - if (find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - initializer_types[1] = bias_init_type; - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } - } - std::vector num_outputs; - if (has_key_(j_mlp_param, "num_outputs")) { - auto nums = get_json(j_mlp_param, "num_outputs"); - assert(nums.is_array()); - for (auto num : nums) { - num_outputs.emplace_back(num.get()); - } - } - bool use_bias = true; - if (has_key_(j_mlp_param, "use_bias")) { - use_bias = get_value_from_json(j_mlp_param, "use_bias"); - } - std::vector biases; - if (has_key_(j_mlp_param, "biases")) { - auto j_biases = get_json(j_mlp_param, "biases"); - assert(j_biases.is_array()); - for (auto bias : j_biases) { - biases.emplace_back(bias.get()); - } - } - if (biases.empty()) { - biases.resize(num_outputs.size(), use_bias); - } - Activation_t act_type = Activation_t::Relu; - if (has_key_(j_mlp_param, "activation")) { - const auto act_name = get_value_from_json(j_mlp_param, "activation"); - if (find_item_in_map(act_type, act_name, ACTIVATION_TYPE_MAP)) { - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "No such activation: " + act_name); - } - } - std::vector acts; - if (has_key_(j_mlp_param, "activations")) { - auto j_acts = get_json(j_mlp_param, "activations"); - assert(j_acts.is_array()); - for (const auto& j_act : j_acts) { - auto act_name = j_act.get(); - Activation_t act_type; - if (find_item_in_map(act_type, act_name, ACTIVATION_TYPE_MAP)) { - acts.emplace_back(act_type); - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "No such activation: " + act_name); - } - } - } - if (acts.empty()) { - acts.resize(num_outputs.size(), act_type); - } - - auto add_mlp = [&](auto type) { - using T = decltype(type); - int input_size = input_output_info.inputs.size(); - int output_size = input_output_info.output_names.size(); - std::vector> in_tensors; - Tensor2 train_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - in_tensors.push_back(train_in_tensor); - if (input_size == 2) { - Tensor2 mask_in_tensor; - mask_in_tensor = Tensor2::stretch_from(input_output_info.inputs[1]); - 
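The biases / acts handling above lets one scalar use_bias or activation value fan out across every layer of the MLP whenever the per-layer "biases" / "activations" arrays are omitted. Sketch with illustrative sizes (vector<int> stands in for the per-layer bias flags):

    #include <vector>

    enum class Activation_t { Relu, None };

    int main() {
      const std::vector<int> num_outputs{512, 256, 1};  // illustrative mlp_param.num_outputs
      bool use_bias = true;                             // scalar config value
      Activation_t act_type = Activation_t::Relu;

      std::vector<int> biases;         // per-layer "biases" array absent in config
      std::vector<Activation_t> acts;  // per-layer "activations" array absent too
      if (biases.empty()) biases.resize(num_outputs.size(), use_bias);
      if (acts.empty()) acts.resize(num_outputs.size(), act_type);
      // biases == {1, 1, 1}; acts == {Relu, Relu, Relu}
    }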
in_tensors.push_back(mask_in_tensor); - } - Tensors2 train_out_tensors; - size_t batch_size = train_in_tensor.get_dimensions()[0]; - size_t output_dim = *num_outputs.rbegin(); - if (output_size == 1) { - Tensor2 tensor; - blobs_buff->reserve({batch_size, output_dim}, &tensor); - train_out_tensors.push_back(tensor); - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "MLP layer can only have one output."); - } - if constexpr (std::is_same::value) { - emplaceback_layer(new MLPLayer(weight_buff, weight_buff_half, wgrad_buff_half, - blobs_buff, in_tensors, train_out_tensors, num_outputs, - gpu_resource, acts, biases, initializer_types, - skip_dgrad, false, false, enable_tf32_compute)); - } else if constexpr (std::is_same::value) { - emplaceback_layer(new MLPLayer(weight_buff, weight_buff, wgrad_buff, blobs_buff, - in_tensors, train_out_tensors, num_outputs, gpu_resource, - acts, biases, initializer_types, skip_dgrad, false, - false, enable_tf32_compute)); - } - if (output_size == 1) { - output_tensor_entries.push_back( - {input_output_info.output_names[0], train_out_tensors[0].shrink()}); - } - }; - - if (use_mixed_precision) { - __half type{}; - add_mlp(type); - } else { - float type{}; - add_mlp(type); - } - break; - } - - case Layer_t::Cast: { - if (use_mixed_precision) { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - emplaceback_layer(new CastLayer(in_tensor, out_tensor, gpu_resource)); - } else { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - emplaceback_layer(new CastLayer<__half, float>(in_tensor, out_tensor, gpu_resource)); - } - break; - } - - case Layer_t::InnerProduct: { - auto j_fc_param = get_json(j, "fc_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_fc_param, "weight_init")) { - const auto weight_init_name = get_value_from_json(j_fc_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_fc_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_fc_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - - // establish out tensor - auto output = get_value_from_json(j_fc_param, "num_output"); - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> fc_out_tensor; - if (in_tensor.get_dimensions().size() == 2) { - blobs_buff->reserve({in_tensor.get_dimensions()[0], output}, &fc_out_tensor); - } else if (in_tensor.get_dimensions().size() == 3) { - blobs_buff->reserve( - {in_tensor.get_dimensions()[0], in_tensor.get_dimensions()[1], output}, - &fc_out_tensor); - } - - // establish layer - emplaceback_layer(new 
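The add_mlp lambda above uses a generic parameter plus if constexpr so a single body can build either precision; the caller picks the instantiation by passing a dummy value of the desired type. Host-side sketch of the trick (double stands in for __half, which needs CUDA headers):

    #include <iostream>
    #include <type_traits>

    int main() {
      auto add_mlp = [](auto type) {
        using T = decltype(type);
        if constexpr (std::is_same<T, float>::value)
          std::cout << "building FP32 MLPLayer\n";
        else
          std::cout << "building mixed-precision MLPLayer\n";
      };
      bool use_mixed_precision = true;  // illustrative flag
      if (use_mixed_precision) {
        double type{};  // stand-in for __half
        add_mlp(type);
      } else {
        float type{};
        add_mlp(type);
      }
    }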
FullyConnectedLayer<__half>( - weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, in_tensor, fc_out_tensor, - gpu_resource, initializer_types)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], fc_out_tensor.shrink()}); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 fc_out_tensor; - - if (in_tensor.get_dimensions().size() == 2) { - blobs_buff->reserve({in_tensor.get_dimensions()[0], output}, &fc_out_tensor); - } else if (in_tensor.get_dimensions().size() == 3) { - blobs_buff->reserve( - {in_tensor.get_dimensions()[0], in_tensor.get_dimensions()[1], output}, - &fc_out_tensor); - } - // establish layer - emplaceback_layer(new FullyConnectedLayer( - weight_buff, wgrad_buff, in_tensor, fc_out_tensor, gpu_resource, use_mixed_precision, - enable_tf32_compute, initializer_types)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], fc_out_tensor.shrink()}); - } - break; - } - case Layer_t::MultiHeadAttention: { - if (input_output_info.inputs.size() < 2) { - HCTR_OWN_THROW(Error_t::WrongInput, - "MultiHeadAttentionLayer needs at least two input tensors "); - } - auto num_heads_it = j.find("num_attention_heads"); - auto num_attention_heads = (num_heads_it != j.end()) ? num_heads_it->get() : 1; - auto transpose_b_it = j.find("transpose_b"); - auto transpose_b = (transpose_b_it != j.end()) ? transpose_b_it->get() : 1; - if (use_mixed_precision) { - Tensors2<__half> in_tensors; - for (const TensorBag2& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2<__half>::stretch_from(bag)); - } - Tensors2<__half> out_tensors; - layers.emplace_back(new MultiHeadAttentionLayer<__half>( - in_tensors, out_tensors, blobs_buff, num_attention_heads, transpose_b, gpu_resource, - use_mixed_precision, enable_tf32_compute)); - for (size_t i = 0; i < out_tensors.size(); i++) { - output_tensor_entries.push_back( - {input_output_info.output_names[i], out_tensors[i].shrink()}); - } - } else { - Tensors2 in_tensors; - for (const auto& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2::stretch_from(bag)); - } - Tensors2 out_tensors; - layers.emplace_back(new MultiHeadAttentionLayer( - in_tensors, out_tensors, blobs_buff, num_attention_heads, transpose_b, gpu_resource, - use_mixed_precision, enable_tf32_compute)); - for (size_t i = 0; i < out_tensors.size(); i++) { - output_tensor_entries.push_back( - {input_output_info.output_names[i], out_tensors[i].shrink()}); - } - } - break; - } - case Layer_t::Interaction: { - // TODO: lambda template could be a better solution here, but there's not support in c++11 - if (use_mixed_precision) { - if (gpu_resource->get_cc_major() < 7) { - std::ostringstream os; - os << "InteractionLayer<__half> is not supported in SM " << gpu_resource->get_cc_major() - << '.' 
<< gpu_resource->get_cc_minor(); - HCTR_OWN_THROW(Error_t::WrongInput, os.str()); - } - - Tensor2<__half> in_mlp_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> in_emb_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[1]); - Tensor2<__half> out_tensor; - - emplaceback_layer(new InteractionLayer<__half>( - in_mlp_tensor, in_emb_tensor, out_tensor, - blobs_buff, // todo cannot use this blobs_buff here need half - gpu_resource, use_mixed_precision, enable_tf32_compute)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - - } else { - Tensor2 in_mlp_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 in_emb_tensor = Tensor2::stretch_from(input_output_info.inputs[1]); - Tensor2 out_tensor; - emplaceback_layer(new InteractionLayer(in_mlp_tensor, in_emb_tensor, out_tensor, - blobs_buff, gpu_resource, - use_mixed_precision, enable_tf32_compute)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - } - - break; - } - case Layer_t::MultiCross: { - auto j_mc_param = get_json(j, "mc_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_mc_param, "weight_init")) { - const auto weight_init_name = get_value_from_json(j_mc_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_mc_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_mc_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - - // establish out tensor - auto num_layers = get_value_from_json(j_mc_param, "num_layers"); - auto projection_dim = 0; - try { - projection_dim = get_value_from_json(j_mc_param, "projection_dim"); - } catch (const core23::RuntimeError& rt_err) { - HCTR_LOG(INFO, WORLD, "No projection_dim given, degrade to DCNv1\n"); - } - if (use_mixed_precision) { - Tensor2<__half> mc_in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - blobs_buff->reserve(mc_in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - // establish layer - emplaceback_layer(new MultiCrossLayer<__half>( - weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, mc_in_tensor, out_tensor, - gpu_resource, num_layers, projection_dim, initializer_types)); - } else { - Tensor2 mc_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - blobs_buff->reserve(mc_in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - // establish layer - emplaceback_layer(new MultiCrossLayer( - weight_buff, weight_buff, wgrad_buff, blobs_buff, mc_in_tensor, out_tensor, - gpu_resource, num_layers, projection_dim, initializer_types)); - } - break; - } - - case Layer_t::MultiCrossEntropyLoss: { - if (input_output_info.inputs.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "bottom of MultiCrossEntropyLoss must be two dim"); - } - if 
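The guard above rejects the half-precision InteractionLayer on GPUs older than compute capability 7.0 (pre-Volta), presumably because the FP16 path leans on tensor-core GEMMs. A CUDA-free mock of the check:

    #include <sstream>
    #include <stdexcept>

    void check_interaction_cc(int cc_major, int cc_minor) {
      if (cc_major < 7) {
        std::ostringstream os;
        os << "InteractionLayer<__half> is not supported in SM " << cc_major << '.' << cc_minor;
        throw std::runtime_error(os.str());
      }
    }

    int main() {
      check_interaction_cc(8, 0);  // Ampere passes
      // check_interaction_cc(6, 1) would throw for Pascal
    }

The MultiCross case just above uses a similar guarded default: a missing "projection_dim" leaves the value 0, which, per the log message, degrades the layer to the original DCNv1 cross form instead of the low-rank DCNv2 form.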
(inference_flag) { - HCTR_LOG(INFO, ROOT, - "Inference stage skip MultiCrossEntropyLoss layer, replaced by Sigmoid layer\n"); - if (use_mixed_precision) { - Tensor2<__half> sigmoid_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> sigmoid_out_tensor; - blobs_buff->reserve(sigmoid_in_tensor.get_dimensions(), &sigmoid_out_tensor); - emplaceback_layer( - new SigmoidLayer<__half>(sigmoid_in_tensor, sigmoid_out_tensor, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - multi_task_output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - } else { - // establish out tensor - Tensor2 sigmoid_in_tensor = - Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 sigmoid_out_tensor; - blobs_buff->reserve(sigmoid_in_tensor.get_dimensions(), &sigmoid_out_tensor); - emplaceback_layer( - new SigmoidLayer(sigmoid_in_tensor, sigmoid_out_tensor, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - multi_task_output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - } - break; - } - auto tweight = get_json(j, "target_weight"); - std::vector target_weight_vec; - for (auto tweight_tmp : tweight) { - float tweight_val = tweight_tmp.get(); - target_weight_vec.push_back(tweight_val); - } - - Tensor2 label_tensor = Tensor2::stretch_from(input_output_info.inputs[1]); - // create new loss tensor - auto name = input_output_info.output_names[0]; - Tensor2 new_loss_tensor; - blobs_buff->reserve({1, 1}, &new_loss_tensor); - - // create new loss item - std::unique_ptr new_loss; - - if (use_mixed_precision) { - Tensor2<__half> multi_cross_entropy_loss_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - new_loss.reset(new MultiCrossEntropyLoss<__half>( - label_tensor, multi_cross_entropy_loss_in_tensor, new_loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff_half->as_tensor(), - multi_cross_entropy_loss_in_tensor.get_dimensions()[0], - gpu_resource), - target_weight_vec, gpu_resource, num_networks_in_global, scaler)); - } else { - Tensor2 multi_cross_entropy_loss_in_tensor = - Tensor2::stretch_from(input_output_info.inputs[0]); - new_loss.reset(new MultiCrossEntropyLoss( - label_tensor, multi_cross_entropy_loss_in_tensor, new_loss_tensor, - create_regularizer(j, weight_buff->as_tensor(), wgrad_buff->as_tensor(), - multi_cross_entropy_loss_in_tensor.get_dimensions()[0], - gpu_resource), - target_weight_vec, gpu_resource, num_networks_in_global, scaler)); - } - loss_tensors.insert(std::pair(name, new_loss_tensor)); - losses.insert(std::pair(name, std::move(new_loss))); - break; - } - case Layer_t::ReLU: { - if (use_mixed_precision) { - Tensor2<__half> relu_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> relu_out_tensor; - blobs_buff->reserve(relu_in_tensor.get_dimensions(), &relu_out_tensor); - emplaceback_layer(new ReluLayer<__half>(relu_in_tensor, relu_out_tensor, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], relu_out_tensor.shrink()}); - } else { - // establish out tensor - Tensor2 relu_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 relu_out_tensor; - blobs_buff->reserve(relu_in_tensor.get_dimensions(), &relu_out_tensor); - emplaceback_layer(new ReluLayer(relu_in_tensor, relu_out_tensor, 
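The target_weight array above apparently gives MultiCrossEntropyLoss one weight per output/label column, letting individual targets be emphasized or muted. Parsing sketch with nlohmann::json and illustrative weights:

    #include <iostream>
    #include <nlohmann/json.hpp>
    #include <vector>

    int main() {
      auto j = nlohmann::json::parse(R"({"target_weight": [1.0, 0.5, 0.5]})");
      std::vector<float> target_weight_vec;
      for (const auto& tweight_tmp : j["target_weight"])
        target_weight_vec.push_back(tweight_tmp.get<float>());
      std::cout << target_weight_vec.size() << " per-target weights\n";
    }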
gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], relu_out_tensor.shrink()}); - } - - break; - } - case Layer_t::ReduceMean: { - int axis = get_json(j, "axis").get(); - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - emplaceback_layer( - new ReduceMeanLayer(in_tensor, out_tensor, blobs_buff, axis, gpu_resource)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - break; - } - case Layer_t::Sub: { - Tensors2 in_tensors; - for (const auto& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2::stretch_from(bag)); - } - Tensor2 out_tensor; - blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); - emplaceback_layer(new SubLayer(in_tensors, out_tensor, blobs_buff, gpu_resource)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - break; - } - case Layer_t::Gather: { - std::vector indices; - auto j_indices = get_json(j, "indices"); - assert(j_indices.is_array()); - for (auto j_index : j_indices) { - indices.emplace_back(int(j_index)); - } - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - emplaceback_layer( - new GatherLayer(in_tensor, out_tensor, blobs_buff, indices, gpu_resource)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - - break; - } - case Layer_t::GRU: { - auto j_gru_param = get_json(j, "gru_param"); - // establish initializer - std::vector initializer_types(2, Initializer_t::Default); - if (has_key_(j_gru_param, "weight_init")) { - const auto weight_init_name = - get_value_from_json(j_gru_param, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - if (has_key_(j_gru_param, "bias_init")) { - const auto bias_init_name = get_value_from_json(j_gru_param, "bias_init"); - Initializer_t bias_init_type; - if (!find_item_in_map(bias_init_type, bias_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + bias_init_name); - } else { - initializer_types[1] = bias_init_type; - } - } - - // establish out tensor - auto output = get_value_from_json(j_gru_param, "num_output"); - auto batchsize = get_value_from_json(j_gru_param, "batchsize"); - auto SeqLength = get_value_from_json(j_gru_param, "SeqLength"); - auto embedding_vec_size = get_value_from_json(j_gru_param, "vector_size"); - - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 gru_out_tensor; - blobs_buff->reserve({in_tensor.get_dimensions()[0], output}, &gru_out_tensor); - // establish layer - emplaceback_layer(new GRULayer(weight_buff, wgrad_buff, in_tensor, gru_out_tensor, - output, batchsize, SeqLength, embedding_vec_size, - gpu_resource, initializer_types)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], gru_out_tensor.shrink()}); - - break; - } - case Layer_t::MatrixMultiply: { - Tensors2 in_tensors; - for (const auto& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2::stretch_from(bag)); - } - Tensor2 out_tensor; - blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor); - layers.emplace_back( - new MatrixMultiplyLayer(in_tensors, out_tensor, blobs_buff, gpu_resource)); - 
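A scalar sketch of what the GatherLayer above appears to do: select the rows named by the "indices" JSON array from the input tensor (row-gather semantics assumed here; shapes and values are illustrative):

    #include <cstdio>
    #include <vector>

    int main() {
      const std::vector<int> indices{0, 2};  // from the "indices" JSON array
      const std::vector<std::vector<float>> in{{1, 2}, {3, 4}, {5, 6}};
      for (int idx : indices) std::printf("row %d: %g %g\n", idx, in[idx][0], in[idx][1]);
      // prints rows 0 and 2 only
    }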
output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - break; - } - case Layer_t::Softmax: { - if (use_mixed_precision) { - HCTR_OWN_THROW(Error_t::WrongInput, "Softmax layer does not support fp16"); - } else { - if (input_output_info.inputs.size() != 2) { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - emplaceback_layer( - new SoftmaxLayer(in_tensor, out_tensor, blobs_buff, gpu_resource)); - } else if (input_output_info.inputs.size() == 2) { - auto scale_factor = get_value_from_json(j, "factor"); - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 mask_tensor = Tensor2::stretch_from(input_output_info.inputs[1]); - Tensors2 in_tensors; - in_tensors.push_back(in_tensor); - in_tensors.push_back(mask_tensor); - Tensor2 out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - emplaceback_layer(new MaskedSoftmaxLayer(in_tensors, out_tensor, scale_factor, - blobs_buff, gpu_resource)); - } - } - break; - } - case Layer_t::PReLU_Dice: { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - blobs_buff->reserve(in_tensor.get_dimensions(), &out_tensor); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - // get PReLU_Dice params - auto j_prelu_dice_param = get_json(j, "prelu_dice_param"); - auto alpha = get_value_from_json(j_prelu_dice_param, "alpha"); - auto epsilon = get_value_from_json(j_prelu_dice_param, "eps"); - emplaceback_layer(new PRelu_Dice_Layer(in_tensor, out_tensor, blobs_buff, alpha, - epsilon, gpu_resource)); - break; - } - case Layer_t::Scale: { - Tensor2 scale_in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 scale_out_tensor; - // get Scale params - auto j_scale_param = get_json(j, "scale_param"); - auto axis = get_value_from_json(j_scale_param, "axis"); - auto factor = get_value_from_json(j_scale_param, "factor"); - emplaceback_layer(new ScaleLayer(scale_in_tensor, scale_out_tensor, blobs_buff, axis, - factor, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], scale_out_tensor.shrink()}); - break; - } - case Layer_t::FusedReshapeConcat: { - Tensors2 in_tensors; - for (const auto& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2::stretch_from(bag)); - } - Tensors2 out_tensors; - emplaceback_layer( - new FusedReshapeConcatLayer(in_tensors, out_tensors, blobs_buff, gpu_resource)); - for (size_t i = 0; i < out_tensors.size(); i++) { - output_tensor_entries.push_back( - {input_output_info.output_names[i], out_tensors[i].shrink()}); - } - break; - } - case Layer_t::FusedReshapeConcatGeneral: { - Tensors2 in_tensors; - for (const auto& bag : input_output_info.inputs) { - in_tensors.push_back(Tensor2::stretch_from(bag)); - } - Tensor2 out_tensor; - emplaceback_layer(new FusedReshapeConcatGeneralLayer(in_tensors, out_tensor, - blobs_buff, gpu_resource)); - output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()}); - break; - } - case Layer_t::Reshape: { - auto selected_it = j.find("selected"); - // selective reshape - if (selected_it != j.end()) { - std::vector selected; - nlohmann::json j_selected = 
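When a mask tensor is present, the Softmax case above builds a MaskedSoftmaxLayer with a "factor" read from the config. A scalar sketch of a scaled softmax under the assumption that the factor scales the logits before normalization, as in attention (mask handling is omitted; the exact semantics live in MaskedSoftmaxLayer):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<float> logits{1.0f, 2.0f, 3.0f};
      const float scale_factor = 0.5f;            // illustrative "factor" value
      for (float& v : logits) v *= scale_factor;  // assumed pre-softmax scaling
      const float m = *std::max_element(logits.begin(), logits.end());
      float denom = 0.0f;
      std::vector<float> out(logits.size());
      for (std::size_t i = 0; i < logits.size(); ++i)
        denom += (out[i] = std::exp(logits[i] - m));  // max-shift for stability
      for (float& v : out) v /= denom;
      for (float v : out) std::printf("%f ", v);  // sums to 1
      std::printf("\n");
    }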
(selected_it.value()); - for (auto slot_obj : j_selected) { - int slot_id = slot_obj.get(); - if (slot_id < 0) { - HCTR_OWN_THROW(Error_t::WrongInput, "slot_id < 0"); - } - selected.push_back(slot_id); - } - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - emplaceback_layer(new ReshapeLayer<__half>(in_tensor, out_tensor, blobs_buff, selected, - gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - emplaceback_layer( - new ReshapeLayer(in_tensor, out_tensor, blobs_buff, selected, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } - } - // general purpose reshape - else { - auto leading_dim_it = j.find("leading_dim"); - auto j_time_step = j.find("time_step"); - // if leading_dim is not specified, default leading_dim = n_slots * vector_length - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - const auto& in_dims = in_tensor.get_dimensions(); - size_t leading_dim = (leading_dim_it != j.end()) - ? (*leading_dim_it).get() - : in_tensor.get_num_elements() / in_dims[0]; - size_t time_step = (j_time_step != j.end()) ? (*j_time_step).get() : 0; - if (time_step == 0) { // 2D output - blobs_buff->reserve({in_tensor.get_num_elements() / leading_dim, leading_dim}, - &out_tensor); - } else { // 3D output - size_t batch_size = in_tensor.get_num_elements() / leading_dim / time_step; - blobs_buff->reserve({batch_size, time_step, leading_dim}, &out_tensor); - } - emplaceback_layer( - new ReshapeLayer<__half>(in_tensor, out_tensor, blobs_buff, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 out_tensor; - const auto& in_dims = in_tensor.get_dimensions(); - size_t leading_dim = (leading_dim_it != j.end()) - ? (*leading_dim_it).get() - : in_tensor.get_num_elements() / in_dims[0]; - size_t time_step = (j_time_step != j.end()) ? 
(*j_time_step).get() : 0; - if (time_step == 0) { // 2D output - blobs_buff->reserve({in_tensor.get_num_elements() / leading_dim, leading_dim}, - &out_tensor); - } else { // 3D output - size_t batch_size = in_tensor.get_num_elements() / leading_dim / time_step; - blobs_buff->reserve({batch_size, time_step, leading_dim}, &out_tensor); - } - emplaceback_layer( - new ReshapeLayer(in_tensor, out_tensor, blobs_buff, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], out_tensor.shrink()}); - } - } - break; - } - case Layer_t::Sigmoid: { - if (use_mixed_precision) { - Tensor2<__half> sigmoid_in_tensor = - Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> sigmoid_out_tensor; - blobs_buff->reserve(sigmoid_in_tensor.get_dimensions(), &sigmoid_out_tensor); - emplaceback_layer( - new SigmoidLayer<__half>(sigmoid_in_tensor, sigmoid_out_tensor, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - } else { - // establish out tensor - Tensor2 sigmoid_in_tensor = - Tensor2::stretch_from(input_output_info.inputs[0]); - Tensor2 sigmoid_out_tensor; - blobs_buff->reserve(sigmoid_in_tensor.get_dimensions(), &sigmoid_out_tensor); - emplaceback_layer( - new SigmoidLayer(sigmoid_in_tensor, sigmoid_out_tensor, gpu_resource)); - output_tensor_entries.push_back( - {input_output_info.output_names[0], sigmoid_out_tensor.shrink()}); - } - break; - } - case Layer_t::Slice: { - std::vector> ranges; - auto j_ranges = get_json(j, "ranges"); - assert(j_ranges.is_array()); - for (auto j_range : j_ranges) { - assert(j_range.is_array()); - ranges.emplace_back(std::make_pair(j_range[0].get(), j_range[1].get())); - } - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensors2<__half> out_tensors; - emplaceback_layer( - new SliceLayer<__half>(in_tensor, out_tensors, blobs_buff, ranges, gpu_resource)); - for (size_t i = 0; i < out_tensors.size(); i++) { - output_tensor_entries.push_back( - {input_output_info.output_names[i], out_tensors[i].shrink()}); - } - } else { - Tensor2 in_tensor = Tensor2::stretch_from(input_output_info.inputs[0]); - Tensors2 out_tensors; - emplaceback_layer( - new SliceLayer(in_tensor, out_tensors, blobs_buff, ranges, gpu_resource)); - for (size_t i = 0; i < out_tensors.size(); i++) { - output_tensor_entries.push_back( - {input_output_info.output_names[i], out_tensors[i].shrink()}); - } - } - break; - } - case Layer_t::WeightMultiply: { - std::vector weight_dims; - auto dims = get_json(j, "weight_dims"); - assert(dims.is_array()); - for (auto dim : dims) { - weight_dims.emplace_back(dim.get()); - } - - // establish initializer - std::vector initializer_types(1, Initializer_t::Default); - if (has_key_(j, "weight_init")) { - const auto weight_init_name = get_value_from_json(j, "weight_init"); - Initializer_t weight_init_type; - if (!find_item_in_map(weight_init_type, weight_init_name, INITIALIZER_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such initializer: " + weight_init_name); - } else { - initializer_types[0] = weight_init_type; - } - } - - if (use_mixed_precision) { - Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]); - Tensor2<__half> out_tensor; - emplaceback_layer(new WeightMultiplyLayer<__half>( - weight_buff, weight_buff_half, wgrad_buff_half, blobs_buff, in_tensor, out_tensor, - weight_dims, gpu_resource, initializer_types)); - 
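The general-purpose Reshape above defaults leading_dim to the per-sample element count (flattening all non-batch dimensions) and switches to a 3D {batch, time_step, leading_dim} output when "time_step" is given. Shape-only sketch with made-up dimensions:

    #include <cstddef>
    #include <cstdio>

    int main() {
      const std::size_t in_dims[] = {32, 10, 64};      // illustrative input shape
      const std::size_t num_elements = 32 * 10 * 64;
      std::size_t leading_dim = num_elements / in_dims[0];  // default: 10 * 64 = 640
      std::size_t time_step = 0;                            // no "time_step" key in config
      if (time_step == 0) {  // 2D output
        std::printf("out: {%zu, %zu}\n", num_elements / leading_dim, leading_dim);
      } else {               // 3D output
        std::size_t batch_size = num_elements / leading_dim / time_step;
        std::printf("out: {%zu, %zu, %zu}\n", batch_size, time_step, leading_dim);
      }
    }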
-      case Layer_t::FmOrder2: {
-        auto out_dim = get_json(j, "out_dim").get<size_t>();
-
-        if (use_mixed_precision) {
-          Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]);
-          Tensor2<__half> out_tensor;
-          blobs_buff->reserve({in_tensor.get_dimensions()[0], out_dim}, &out_tensor);
-
-          emplaceback_layer(new FmOrder2Layer<__half>(in_tensor, out_tensor, gpu_resource));
-          output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()});
-        } else {
-          Tensor2<float> in_tensor = Tensor2<float>::stretch_from(input_output_info.inputs[0]);
-          Tensor2<float> out_tensor;
-          blobs_buff->reserve({in_tensor.get_dimensions()[0], out_dim}, &out_tensor);
-
-          emplaceback_layer(new FmOrder2Layer<float>(in_tensor, out_tensor, gpu_resource));
-          output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()});
-        }
-        break;
-      }
-      case Layer_t::Add: {
-        if (use_mixed_precision) {
-          Tensors2<__half> in_tensors;
-          for (const auto& bag : input_output_info.inputs) {
-            in_tensors.push_back(Tensor2<__half>::stretch_from(bag));
-          }
-          Tensor2<__half> out_tensor;
-          blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor);
-          emplaceback_layer(new AddLayer<__half>(in_tensors, out_tensor, blobs_buff, gpu_resource));
-          output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()});
-        } else {
-          Tensors2<float> in_tensors;
-          for (const auto& bag : input_output_info.inputs) {
-            in_tensors.push_back(Tensor2<float>::stretch_from(bag));
-          }
-          Tensor2<float> out_tensor;
-          blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor);
-          emplaceback_layer(new AddLayer<float>(in_tensors, out_tensor, blobs_buff, gpu_resource));
-          output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()});
-        }
-        break;
-      }
-      case Layer_t::ReduceSum: {
-        int axis = get_json(j, "axis").get<int>();
-
-        if (use_mixed_precision) {
-          Tensor2<__half> in_tensor = Tensor2<__half>::stretch_from(input_output_info.inputs[0]);
-          Tensor2<__half> out_tensor;
-          emplaceback_layer(
-              new ReduceSumLayer<__half>(in_tensor, out_tensor, blobs_buff, axis, gpu_resource));
-          output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()});
-        } else {
-          Tensor2<float> in_tensor = Tensor2<float>::stretch_from(input_output_info.inputs[0]);
-          Tensor2<float> out_tensor;
-          emplaceback_layer(
-              new ReduceSumLayer<float>(in_tensor, out_tensor, blobs_buff, axis, gpu_resource));
-          output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()});
-        }
-        break;
-      }
-      case Layer_t::ElementwiseMultiply: {
-        if (use_mixed_precision) {
-          Tensors2<__half> in_tensors;
-          for (const auto& bag : input_output_info.inputs) {
-            in_tensors.push_back(Tensor2<__half>::stretch_from(bag));
-          }
-          Tensor2<__half> out_tensor;
-          blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor);
-          emplaceback_layer(new ElementwiseMultiplyLayer<__half>(in_tensors, out_tensor, blobs_buff,
-                                                                 gpu_resource));
-          output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()});
-        } else {
-          Tensors2<float> in_tensors;
-          for (const auto& bag : input_output_info.inputs) {
-            in_tensors.push_back(Tensor2<float>::stretch_from(bag));
-          }
-          Tensor2<float> out_tensor;
-          blobs_buff->reserve(in_tensors[0].get_dimensions(), &out_tensor);
-          emplaceback_layer(new ElementwiseMultiplyLayer<float>(in_tensors, out_tensor, blobs_buff,
-                                                                gpu_resource));
-          output_tensor_entries.push_back({input_output_info.output_names[0], out_tensor.shrink()});
-        }
-        break;
-      }
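A hedged sketch of the axis semantics used by the ReduceSum case, under the assumption that the reduced dimension collapses to size 1, with hypothetical sizes:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    std::vector<std::size_t> reduce_sum_dims(std::vector<std::size_t> dims, int axis) {
      dims[axis] = 1;  // assumption: the reduced axis is kept with extent 1
      return dims;
    }

    int main() {
      auto out = reduce_sum_dims({64, 26, 16}, 1);  // sum over the slot axis
      assert(out[0] == 64 && out[1] == 1 && out[2] == 16);
      return 0;
    }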
-      default:
-        assert(!"Error: no such layer && should never get here!");
-    }  // end of switch
-    if (!inference_flag &&
-        (layer_type == Layer_t::CrossEntropyLoss || layer_type == Layer_t::BinaryCrossEntropyLoss ||
-         layer_type == Layer_t::MultiCrossEntropyLoss)) {
-      if (raw_metrics) {
-        std::string name = get_layer_names(bottom)[1];
-        Tensor2<float> lookup_loss_tensor = loss_tensors.find(name)->second;
-
-        metrics::RawMetricMap new_map;
-        new_map.insert(std::make_pair(metrics::RawType::Loss, lookup_loss_tensor.shrink()));
-        new_map.insert(std::make_pair(metrics::RawType::Pred, input_output_info.inputs[0]));
-        new_map.insert(std::make_pair(metrics::RawType::Label, input_output_info.inputs[1]));
-      }
-    } else {
-      for (auto& output_tensor_entry : output_tensor_entries) {
-        tensor_entries.push_back(output_tensor_entry);
-      }
-    }
-
-    skip_dgrad = false;
-  }  // for layers
-
-  for (const auto& entry : multi_task_output_tensor_entries) {
-    if (entry.bag.get_dimensions().size() != 2) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "the prediction tensor for each task must be two dim");
-    }
-  }
-
-  if (inference_flag && multi_task_output_tensor_entries.size() > 1) {
-    if (use_mixed_precision) {
-      Tensors2<__half> in_tensors;
-      for (const auto& entry : multi_task_output_tensor_entries) {
-        in_tensors.push_back(Tensor2<__half>::stretch_from(entry.bag));
-      }
-      Tensor2<__half> out_tensor;
-      emplaceback_layer(new ConcatLayer<__half>(in_tensors, out_tensor, blobs_buff, gpu_resource));
-      tensor_entries.push_back({"multi_task_combined_pred", out_tensor.shrink()});
-    } else {
-      Tensors2<float> in_tensors;
-      for (const auto& entry : multi_task_output_tensor_entries) {
-        in_tensors.push_back(Tensor2<float>::stretch_from(entry.bag));
-      }
-      Tensor2<float> out_tensor;
-      emplaceback_layer(new ConcatLayer<float>(in_tensors, out_tensor, blobs_buff, gpu_resource));
-      tensor_entries.push_back({"multi_task_combined_pred", out_tensor.shrink()});
-    }
-  }
-}
-
-/*
- * Create single network
- *
- */
-Network* Network::create_network(const nlohmann::json& j_array, const nlohmann::json& j_optimizer,
-                                 std::vector<TensorEntry>& train_tensor_entries,
-                                 std::vector<TensorEntry>& evaluate_tensor_entries,
-                                 int num_networks_in_global,
-                                 std::shared_ptr<ExchangeWgrad>& exchange_wgrad,
-                                 const std::shared_ptr<CPUResource>& cpu_resource,
-                                 const std::shared_ptr<GPUResource>& gpu_resource,
-                                 bool use_mixed_precision, bool enable_tf32_compute, float scaler,
-                                 bool use_algorithm_search, bool inference_flag,
-                                 bool grouped_all_reduce) {
-  Network* network = new Network(cpu_resource, gpu_resource, use_mixed_precision);
-
-  auto& train_layers = network->train_layers_;
-  auto* bottom_layers = &network->bottom_layers_;
-  auto* top_layers = &network->top_layers_;
-  auto& evaluate_layers = network->evaluate_layers_;
-  auto& train_loss_tensors = network->train_loss_tensors_;
-  auto& evaluate_loss_tensors = network->evaluate_loss_tensors_;
-  auto& train_losses = network->train_losses_;
-  auto& evaluate_losses = network->evaluate_losses_;
-  auto& raw_metrics = network->raw_metrics_;
-
-  std::shared_ptr<GeneralBuffer2<CudaAllocator>> blobs_buff =
-      GeneralBuffer2<CudaAllocator>::create();
-
-  std::shared_ptr<BufferBlock2<float>> train_weight_buff = blobs_buff->create_block<float>();
-  std::shared_ptr<BufferBlock2<__half>> train_weight_buff_half =
-      blobs_buff->create_block<__half>();
-  std::shared_ptr<BufferBlock2<float>> wgrad_buff = nullptr;
-  std::shared_ptr<BufferBlock2<__half>> wgrad_buff_half = nullptr;
-
-  if (!inference_flag) {
-    if (use_mixed_precision) {
-      auto id = gpu_resource->get_local_id();
-      wgrad_buff_half =
-          (grouped_all_reduce)
-              ? std::dynamic_pointer_cast<GroupedExchangeWgrad<__half>>(exchange_wgrad)
-                    ->get_network_wgrad_buffs()[id]
-              : std::dynamic_pointer_cast<NetworkExchangeWgrad<__half>>(exchange_wgrad)
-                    ->get_network_wgrad_buffs()[id];
-      wgrad_buff = blobs_buff->create_block<float>();  // placeholder
-    } else {
-      auto id = gpu_resource->get_local_id();
-      wgrad_buff = (grouped_all_reduce)
-                       ? std::dynamic_pointer_cast<GroupedExchangeWgrad<float>>(exchange_wgrad)
-                             ->get_network_wgrad_buffs()[id]
-                       : std::dynamic_pointer_cast<NetworkExchangeWgrad<float>>(exchange_wgrad)
-                             ->get_network_wgrad_buffs()[id];
-      wgrad_buff_half = blobs_buff->create_block<__half>();  // placeholder
-    }
-  } else {
-    wgrad_buff = blobs_buff->create_block<float>();
-    wgrad_buff_half = blobs_buff->create_block<__half>();
-  }
-
-  std::shared_ptr<BufferBlock2<float>> evaluate_weight_buff = blobs_buff->create_block<float>();
-  std::shared_ptr<BufferBlock2<__half>> evaluate_weight_buff_half =
-      blobs_buff->create_block<__half>();
-  std::shared_ptr<BufferBlock2<float>> wgrad_buff_placeholder = blobs_buff->create_block<float>();
-  std::shared_ptr<BufferBlock2<__half>> wgrad_buff_half_placeholder =
-      blobs_buff->create_block<__half>();
-
-  std::shared_ptr<BufferBlock2<float>> opt_buff = blobs_buff->create_block<float>();
-  std::shared_ptr<BufferBlock2<__half>> opt_buff_half = blobs_buff->create_block<__half>();
-
-  // TODO: implement multiple loss support in create_layers
-  if (!inference_flag) {
-    // create train layers
-    create_layers(j_array, train_tensor_entries, blobs_buff, train_weight_buff,
-                  train_weight_buff_half, wgrad_buff, wgrad_buff_half, train_loss_tensors,
-                  gpu_resource, use_mixed_precision, enable_tf32_compute, num_networks_in_global,
-                  scaler, inference_flag, train_layers, train_losses, nullptr, top_layers,
-                  bottom_layers);
-  }
-
-  // create evaluate layers
-  create_layers(j_array, evaluate_tensor_entries, blobs_buff, evaluate_weight_buff,
-                evaluate_weight_buff_half, wgrad_buff_placeholder, wgrad_buff_half_placeholder,
-                evaluate_loss_tensors, gpu_resource, use_mixed_precision, enable_tf32_compute,
-                num_networks_in_global, scaler, inference_flag, evaluate_layers, evaluate_losses,
-                &raw_metrics);
-
-  // create optimizer
-  if (!inference_flag) {
-    if (use_mixed_precision) {
-      auto opt_param = get_optimizer_param(j_optimizer);
-
-      network->optimizer_ = std::move(Optimizer::Create(
-          opt_param, train_weight_buff->as_tensor(), train_weight_buff_half->as_tensor(),
-          wgrad_buff_half->as_tensor(), scaler, opt_buff, gpu_resource, use_mixed_precision));
-    } else {
-      auto opt_param = get_optimizer_param(j_optimizer);
-
-      network->optimizer_ = std::move(Optimizer::Create(
-          opt_param, train_weight_buff->as_tensor(), train_weight_buff_half->as_tensor(),
-          wgrad_buff->as_tensor(), scaler, opt_buff, gpu_resource, use_mixed_precision));
-    }
-  } else {
-    try {
-      TensorEntry pred_tensor_entry = evaluate_tensor_entries.back();
-      if (use_mixed_precision) {
-        network->pred_tensor_half_ = Tensor2<__half>::stretch_from(pred_tensor_entry.bag);
-      } else {
-        network->pred_tensor_ = Tensor2<float>::stretch_from(pred_tensor_entry.bag);
-      }
-    } catch (const std::runtime_error& rt_err) {
-      HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl;
-      throw;
-    }
-  }
-
-  network->train_weight_tensor_ = train_weight_buff->as_tensor();
-  network->train_weight_tensor_half_ = train_weight_buff_half->as_tensor();
-  network->wgrad_tensor_ = wgrad_buff->as_tensor();
-  network->wgrad_tensor_half_ = wgrad_buff_half->as_tensor();
-  network->evaluate_weight_tensor_ = evaluate_weight_buff->as_tensor();
-  network->evaluate_weight_tensor_half_ = evaluate_weight_buff_half->as_tensor();
-  network->opt_tensor_ = opt_buff->as_tensor();
-  network->opt_tensor_half_ = opt_buff_half->as_tensor();
-
-  CudaDeviceContext context(gpu_resource->get_device_id());
-  blobs_buff->allocate();
-
-  return network;
-}
-
-}  // namespace HugeCTR
diff --git a/HugeCTR/src/parsers/create_optimizer.cpp b/HugeCTR/src/parsers/create_optimizer.cpp
deleted file mode 100644
index 5d588377a4..0000000000
--- a/HugeCTR/src/parsers/create_optimizer.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#ifdef ENABLE_MPI
-#include
-#endif
-
-namespace HugeCTR {
-
-OptParams get_optimizer_param(const nlohmann::json& j_optimizer) {
-  // create optimizer
-  auto optimizer_name = get_value_from_json<std::string>(j_optimizer, "type");
-  Optimizer_t optimizer_type = Optimizer_t::DEFAULT;
-  if (!find_item_in_map(optimizer_type, optimizer_name, OPTIMIZER_TYPE_MAP)) {
-    HCTR_OWN_THROW(Error_t::WrongInput, "No such optimizer: " + optimizer_name);
-  }
-
-  OptHyperParams opt_hyper_params;
-  OptParams opt_params;
-
-  Update_t update_type = Update_t::Local;
-  if (has_key_(j_optimizer, "update_type")) {
-    std::string update_name = get_value_from_json<std::string>(j_optimizer, "update_type");
-    if (!find_item_in_map(update_type, update_name, UPDATE_TYPE_MAP)) {
-      HCTR_OWN_THROW(Error_t::WrongInput, "No such update type: " + update_name);
-    }
-  } else if (has_key_(j_optimizer, "global_update")) {
-    bool global_update = get_value_from_json<bool>(j_optimizer, "global_update");
-    if (global_update) update_type = Update_t::Global;
-  } else {
-    HCTR_LOG(INFO, ROOT, "update_type is not specified, using default: local\n");
-  }
-
-  switch (optimizer_type) {
-    case Optimizer_t::Ftrl: {
-      auto j_hparam = get_json(j_optimizer, "ftrl_hparam");
-      const float learning_rate = get_value_from_json<float>(j_hparam, "learning_rate");
-      const float beta = get_value_from_json<float>(j_hparam, "beta");
-      const float lambda1 = get_value_from_json<float>(j_hparam, "lambda1");
-      const float lambda2 = get_value_from_json<float>(j_hparam, "lambda2");
-      opt_hyper_params.ftrl.beta = beta;
-      opt_hyper_params.ftrl.lambda1 = lambda1;
-      opt_hyper_params.ftrl.lambda2 = lambda2;
-      opt_params = {Optimizer_t::Ftrl, learning_rate, opt_hyper_params, update_type};
-    } break;
-
-    case Optimizer_t::Adam: {
-      auto j_hparam = get_json(j_optimizer, "adam_hparam");
-      float learning_rate = get_value_from_json<float>(j_hparam, "learning_rate");
-      float beta1 = get_value_from_json<float>(j_hparam, "beta1");
-      float beta2 = get_value_from_json<float>(j_hparam, "beta2");
-      float epsilon = get_value_from_json<float>(j_hparam, "epsilon");
-      opt_hyper_params.adam.beta1 = beta1;
-      opt_hyper_params.adam.beta2 = beta2;
-      opt_hyper_params.adam.epsilon = epsilon;
-      opt_params = {Optimizer_t::Adam, learning_rate, opt_hyper_params, update_type};
-    } break;
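For orientation, the clause-per-optimizer pattern above can be reproduced with nlohmann::json directly. A minimal sketch with made-up hyperparameter values; plain json calls stand in for HugeCTR's get_json/get_value_from_json helpers:

    #include <nlohmann/json.hpp>
    #include <iostream>

    int main() {
      auto j_optimizer = nlohmann::json::parse(R"({
        "type": "Adam",
        "update_type": "Global",
        "adam_hparam": {"learning_rate": 0.001, "beta1": 0.9, "beta2": 0.999, "epsilon": 1e-7}
      })");
      const auto& h = j_optimizer.at("adam_hparam");  // the per-optimizer hparam clause
      float lr = h.at("learning_rate").get<float>();
      float beta1 = h.at("beta1").get<float>();
      std::cout << "Adam lr=" << lr << " beta1=" << beta1 << '\n';
      return 0;
    }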
auto j_hparam = get_json(j_optimizer, "adagrad_hparam"); - float learning_rate = get_value_from_json(j_hparam, "learning_rate"); - float initial_accu_value = get_value_from_json(j_hparam, "initial_accu_value"); - float epsilon = get_value_from_json(j_hparam, "epsilon"); - opt_hyper_params.adagrad.initial_accu_value = initial_accu_value; - opt_hyper_params.adagrad.epsilon = epsilon; - opt_params = {Optimizer_t::AdaGrad, learning_rate, opt_hyper_params, update_type}; - } break; - - case Optimizer_t::MomentumSGD: { - auto j_hparam = get_json(j_optimizer, "momentum_sgd_hparam"); - float learning_rate = get_value_from_json(j_hparam, "learning_rate"); - float momentum_factor = get_value_from_json(j_hparam, "momentum_factor"); - opt_hyper_params.momentum.factor = momentum_factor; - opt_params = {Optimizer_t::MomentumSGD, learning_rate, opt_hyper_params, update_type}; - } break; - - case Optimizer_t::Nesterov: { - auto j_hparam = get_json(j_optimizer, "nesterov_hparam"); - float learning_rate = get_value_from_json(j_hparam, "learning_rate"); - float momentum_factor = get_value_from_json(j_hparam, "momentum_factor"); - opt_hyper_params.nesterov.mu = momentum_factor; - opt_params = {Optimizer_t::Nesterov, learning_rate, opt_hyper_params, update_type}; - } break; - - case Optimizer_t::SGD: { - auto j_hparam = get_json(j_optimizer, "sgd_hparam"); - auto learning_rate = get_value_from_json(j_hparam, "learning_rate"); - if (has_key_(j_hparam, "atomic_update")) { - opt_hyper_params.sgd.atomic_update = get_value_from_json(j_hparam, "atomic_update"); - } - opt_params = {Optimizer_t::SGD, learning_rate, opt_hyper_params, update_type}; - } break; - - default: - assert(!"Error: no such optimizer && should never get here!"); - } - return opt_params; -} - -} // namespace HugeCTR diff --git a/HugeCTR/src/parsers/inference_parser.cpp b/HugeCTR/src/parsers/inference_parser.cpp deleted file mode 100644 index 897b98ab89..0000000000 --- a/HugeCTR/src/parsers/inference_parser.cpp +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include
-#include
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-core23::BufferChannel GetInferenceBufferChannel() {
-  static auto name = core23::GetRandomBufferChannelName();
-  return core23::BufferChannel(name);
-}
-
-//***** create_pipeline_inference with new tensor
-template <typename TypeEmbeddingComp>
-void InferenceParser::create_pipeline_inference(
-    const InferenceParams& inference_params, core23::Tensor& dense_input_bag,
-    std::vector<std::shared_ptr<core23::Tensor>>& rows,
-    std::vector<std::shared_ptr<core23::Tensor>>& embeddingvecs,
-    std::vector<size_t>& embedding_table_slot_size, std::vector<std::shared_ptr<Layer>>* embeddings,
-    Network** network, std::vector<TensorEntry>& inference_tensor_entries,
-    const std::shared_ptr<ResourceManager> resource_manager) {
-  // Not used, required as an argument by Network::create_network
-  core23::Device device_gpu(core23::DeviceType::GPU, inference_params.device_id);
-  core23::TensorParams tensor_params = core23::TensorParams()
-                                           .device(device_gpu)
-                                           .buffer_channel(HugeCTR::GetInferenceBufferChannel());
-  std::vector<TensorEntry> train_tensor_entries;
-  auto j_layers_array = get_json(config_, "layers");
-  check_graph(tensor_active_, j_layers_array);
-  auto input_buffer = GeneralBuffer2<CudaAllocator>::create();
-  {
-    const nlohmann::json& j_data = j_layers_array[0];
-    auto j_dense = get_json(j_data, "dense");
-
-    auto top_strs_dense = get_value_from_json<std::string>(j_dense, "top");
-    auto dense_dim = get_value_from_json<size_t>(j_dense, "dense_dim");
-    core23::Tensor dense_input(
-        tensor_params.shape({(int64_t)inference_params.max_batchsize, (int64_t)dense_dim})
-            .buffer_channel(core23::GetRandomBufferChannel())
-            .data_type(core23::ToScalarType<TypeEmbeddingComp>::value));
-    // input_buffer->reserve({inference_params.max_batchsize, dense_dim}, &dense_input);
-    inference_tensor_entries.push_back(
-        {top_strs_dense,
-         core_helper::convert_core23_tensor_to_tensorbag2(dense_input)});
-    dense_input_bag = dense_input;
-
-    auto j_label = get_json(j_data, "label");
-    auto label_name_arr = get_json(j_label, "top");
-    auto label_dim_arr = get_json(j_label, "label_dim");
-    std::string top_strs_label;
-    size_t label_dim;
-    if (label_name_arr.is_array()) {
-      for (int i = 0; i < label_dim_arr.size(); ++i) {
-        label_dim = label_dim_arr[i].get<size_t>();
-        top_strs_label = label_name_arr[i].get<std::string>();
-        core23::Tensor label_input(
-            tensor_params.shape({(int64_t)inference_params.max_batchsize, (int64_t)label_dim})
-                .data_type(core23::ToScalarType<float>::value));
-        inference_tensor_entries.push_back(
-            {top_strs_label, core_helper::convert_core23_tensor_to_tensorbag2(label_input)});
-      }
-    } else {
-      top_strs_label = get_value_from_json<std::string>(j_label, "top");
-      label_dim = get_value_from_json<size_t>(j_label, "label_dim");
-      core23::Tensor label_input(
-          tensor_params.shape({(int64_t)inference_params.max_batchsize, (int64_t)label_dim})
-              .data_type(core23::ToScalarType<float>::value));
-      inference_tensor_entries.push_back(
-          {top_strs_label, core_helper::convert_core23_tensor_to_tensorbag2(label_input)});
-    }
-  }
-  create_embedding()(
-      inference_params, j_layers_array, rows, embeddingvecs, embedding_table_slot_size,
-      &inference_tensor_entries, embeddings,
-      resource_manager->get_local_gpu_from_device_id(inference_params.device_id), input_buffer);
-
-  CudaDeviceContext context(
-      resource_manager->get_local_gpu_from_device_id(inference_params.device_id)->get_device_id());
-  input_buffer->allocate();
-  // TODO: perhaps it is better to make a wrapper of this function for the inference
-  // rather than passing unused parameters here.
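GetInferenceBufferChannel() above relies on a function-local static, so every caller sees a channel built from one randomly generated name. A tiny sketch of the same pattern; the fixed name is a stand-in for GetRandomBufferChannelName():

    #include <iostream>
    #include <string>

    static const std::string& channel_name() {
      static const std::string name = "buffer_channel_42";  // generated once, reused forever
      return name;
    }

    int main() {
      std::cout << (channel_name() == channel_name()) << '\n';  // prints 1: same channel every time
      return 0;
    }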
-  std::shared_ptr<ExchangeWgrad> exchange_wgrad_dummy;
-  *network = Network::create_network(
-      j_layers_array, "", train_tensor_entries, inference_tensor_entries, 1, exchange_wgrad_dummy,
-      resource_manager->get_local_cpu(),
-      resource_manager->get_local_gpu_from_device_id(inference_params.device_id),
-      inference_params.use_mixed_precision, false, inference_params.scaler, false, true, false);
-}
-
-//****create_pipeline with new tensor
-void InferenceParser::create_pipeline(const InferenceParams& inference_params,
-                                      core23::Tensor& dense_input_bag,
-                                      std::vector<std::shared_ptr<core23::Tensor>>& rows,
-                                      std::vector<std::shared_ptr<core23::Tensor>>& embeddingvecs,
-                                      std::vector<size_t>& embedding_table_slot_size,
-                                      std::vector<std::shared_ptr<Layer>>* embeddings,
-                                      Network** network,
-                                      std::vector<TensorEntry>& inference_tensor_entries,
-                                      const std::shared_ptr<ResourceManager> resource_manager) {
-  if (inference_params.use_mixed_precision) {
-    create_pipeline_inference<__half>(inference_params, dense_input_bag, rows, embeddingvecs,
-                                      embedding_table_slot_size, embeddings, network,
-                                      inference_tensor_entries, resource_manager);
-  } else {
-    create_pipeline_inference<float>(inference_params, dense_input_bag, rows, embeddingvecs,
-                                     embedding_table_slot_size, embeddings, network,
-                                     inference_tensor_entries, resource_manager);
-  }
-}
-
-InferenceParser::InferenceParser(const nlohmann::json& config) : config_(config) {
-  auto j_layers_array = get_json(config, "layers");
-  const nlohmann::json& j_data = j_layers_array[0];
-  auto j_label_data = get_json(j_data, "label");
-  auto j_dense_data = get_json(j_data, "dense");
-  auto j_sparse_data = get_json(j_data, "sparse");
-  dense_name = get_value_from_json<std::string>(j_dense_data, "top");
-  dense_dim = get_value_from_json<size_t>(j_dense_data, "dense_dim");
-
-  auto label_name_arr = get_json(j_label_data, "top");
-  auto label_dim_arr = get_json(j_label_data, "label_dim");
-  if (label_name_arr.is_array()) {
-    label_name = "combined_multi_label";
-    label_dim = 0;
-    for (int i = 0; i < label_dim_arr.size(); ++i) {
-      label_dim += label_dim_arr[i].get<size_t>();
-    }
-  } else {
-    label_name = get_value_from_json<std::string>(j_label_data, "top");
-    label_dim = get_value_from_json<size_t>(j_label_data, "label_dim");
-  }
-
-  num_embedding_tables = j_sparse_data.size();
-  slot_num = 0;
-  for (size_t i = 0; i < num_embedding_tables; i++) {
-    const nlohmann::json& j = j_sparse_data[i];
-    const size_t max_feature_num_per_sample_per_table =
-        get_max_feature_num_per_sample_from_nnz_per_slot(j);
-    auto current_slot_num = get_value_from_json<size_t>(j, "slot_num");
-    int current_max_nnz = get_max_nnz_from_nnz_per_slot(j);
-    auto sparse_name = get_value_from_json<std::string>(j, "top");
-    max_feature_num_for_tables.push_back(max_feature_num_per_sample_per_table);
-    slot_num_for_tables.push_back(current_slot_num);
-    max_nnz_for_tables.push_back(current_max_nnz);
-    sparse_names.push_back(sparse_name);
-    slot_num += current_slot_num;
-  }
-
-  // get embedding params
-  for (size_t i = 1; i < j_layers_array.size(); i++) {
-    // if not embedding then break
-    const nlohmann::json& j = j_layers_array[i];
-    auto embedding_name = get_value_from_json<std::string>(j, "type");
-    if (embedding_name.compare("DistributedSlotSparseEmbeddingHash") != 0 &&
-        embedding_name.compare("LocalizedSlotSparseEmbeddingHash") != 0 &&
-        embedding_name.compare("LocalizedSlotSparseEmbeddingOneHot") != 0) {
-      break;
-    }
-    auto j_embed_params = get_json(j, "sparse_embedding_hparam");
-    auto embedding_vec_size = get_value_from_json<size_t>(j_embed_params, "embedding_vec_size");
-    embed_vec_size_for_tables.push_back(embedding_vec_size);
-  }
-
-  max_embedding_vector_size_per_sample = 0;
-  max_feature_num_per_sample = 0;
-  for (size_t i = 0; i < num_embedding_tables; i++) {
-    max_embedding_vector_size_per_sample +=
-        (max_feature_num_for_tables[i] * embed_vec_size_for_tables[i]);
-    max_feature_num_per_sample += max_feature_num_for_tables[i];
-  }
-}
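A worked example of the two per-sample maxima accumulated at the end of the constructor above, with hypothetical table sizes:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main() {
      std::vector<std::size_t> max_feature_num_for_tables{39, 13};  // hypothetical two tables
      std::vector<std::size_t> embed_vec_size_for_tables{16, 128};  // per-table vector sizes
      std::size_t max_vec_size_per_sample = 0, max_feature_num_per_sample = 0;
      for (std::size_t i = 0; i < max_feature_num_for_tables.size(); i++) {
        max_vec_size_per_sample += max_feature_num_for_tables[i] * embed_vec_size_for_tables[i];
        max_feature_num_per_sample += max_feature_num_for_tables[i];
      }
      assert(max_vec_size_per_sample == 39 * 16 + 13 * 128);  // 2288 floats per sample
      assert(max_feature_num_per_sample == 52);
      return 0;
    }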
-
-//******** create_embedding with new tensor
-template <typename TypeKey, typename TypeFP>
-void create_embedding<TypeKey, TypeFP>::operator()(
-    const InferenceParams& inference_params, const nlohmann::json& j_layers_array,
-    std::vector<std::shared_ptr<core23::Tensor>>& rows,
-    std::vector<std::shared_ptr<core23::Tensor>>& embeddingvecs,
-    std::vector<size_t>& embedding_table_slot_size, std::vector<TensorEntry>* tensor_entries,
-    std::vector<std::shared_ptr<Layer>>* embeddings,
-    const std::shared_ptr<GPUResource> gpu_resource,
-    std::shared_ptr<GeneralBuffer2<CudaAllocator>>& blobs_buff) {
-  HCTR_LOG(INFO, ROOT, "start create embedding for inference\n");
-  auto j_data = j_layers_array[0];
-  if (!has_key_(j_data, "sparse")) {
-    HCTR_LOG(INFO, ROOT, "no sparse data input\n");
-    return;
-  }
-
-  auto j_sparse_input = get_json(j_data, "sparse");
-  std::unordered_map<std::string, std::pair<int, int>> slot_nums_map;
-  for (unsigned int i = 0; i < j_sparse_input.size(); ++i) {
-    auto top = get_value_from_json<std::string>(j_sparse_input[i], "top");
-    auto slot_num = get_value_from_json<int>(j_sparse_input[i], "slot_num");
-    int max_feature_num_per_sample =
-        get_max_feature_num_per_sample_from_nnz_per_slot(j_sparse_input[i]);
-    HCTR_LOG_S(INFO, ROOT) << "sparse_input name " << top << std::endl;
-    slot_nums_map[top] = std::make_pair(slot_num, max_feature_num_per_sample);
-  }
-  if (j_layers_array.size() < 1) {
-    HCTR_OWN_THROW(Error_t::WrongInput, "layer not defined in config");
-  }
-  for (unsigned int i = 1; i < j_layers_array.size(); i++) {
-    const nlohmann::json& j = j_layers_array[i];
-    auto bottom_array = get_json(j, "bottom");
-    if (bottom_array.is_array()) {
-      continue;
-    }
-    std::string bottom = bottom_array.get<std::string>();
-    auto slot_nums_map_iter = slot_nums_map.find(bottom);
-    if (slot_nums_map_iter == slot_nums_map.end()) {
-      continue;
-    }
-    const std::string layer_top = get_value_from_json<std::string>(j, "top");
-    int slot_num = slot_nums_map_iter->second.first;
-    int max_feature_num_per_sample = slot_nums_map_iter->second.second;
-    auto j_hparam = get_json(j, "sparse_embedding_hparam");
-    auto combiner_str = get_value_from_json<std::string>(j_hparam, "combiner");
-    EmbeddingFeatureCombiner_t feature_combiner_type;
-    if (combiner_str == "sum") {
-      feature_combiner_type = EmbeddingFeatureCombiner_t::Sum;
-    } else if (combiner_str == "mean") {
-      feature_combiner_type = EmbeddingFeatureCombiner_t::Mean;
-    } else {
-      HCTR_OWN_THROW(Error_t::WrongInput, "combiner need to be sum or mean");
-    }
-    size_t embedding_vec_size = get_value_from_json<size_t>(j_hparam, "embedding_vec_size");
-
-    size_t prefix_slot_num = embedding_table_slot_size.back();
-    embedding_table_slot_size.push_back(prefix_slot_num + slot_num);
-
-    core23::Device device_gpu(core23::DeviceType::GPU, inference_params.device_id);
-    core23::TensorParams tensor_params = core23::TensorParams()
-                                             .device(device_gpu)
-                                             .buffer_channel(HugeCTR::GetInferenceBufferChannel());
-    std::shared_ptr<core23::Tensor> row_tensor_new = std::make_shared<core23::Tensor>(
-        tensor_params.shape({static_cast<int64_t>(inference_params.max_batchsize * slot_num + 1)})
-            .data_type(core23::ScalarType::Int32));
-    std::shared_ptr<core23::Tensor> embeddingvecs_tensor_new = std::make_shared<core23::Tensor>(
-        tensor_params
-            .shape(
-                {static_cast<int64_t>(inference_params.max_batchsize * max_feature_num_per_sample),
-                 static_cast<int64_t>(embedding_vec_size)})
-            .data_type(core23::ScalarType::Float));
-    rows.push_back(row_tensor_new);
-    embeddingvecs.push_back(embeddingvecs_tensor_new);
-    Tensor2<TypeFP> embedding_output;
-    embeddings->push_back(std::make_shared<EmbeddingFeatureCombiner<TypeFP>>(
-        embeddingvecs.back(), rows.back(), embedding_output, inference_params.max_batchsize,
-        slot_num, feature_combiner_type, blobs_buff, gpu_resource));
-    tensor_entries->push_back({layer_top, embedding_output.shrink()});
-  }
-
-  HCTR_LOG(INFO, ROOT, "create embedding for inference success\n");
-}
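The tensors reserved here follow CSR-style sizing: one int32 row offset per (sample, slot) pair plus a trailing sentinel, and a worst-case value buffer of max_batchsize * max_feature_num_per_sample embedding vectors. A small sketch with illustrative numbers:

    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t max_batchsize = 64, slot_num = 26, max_feature_num_per_sample = 39;
      int64_t row_offsets = max_batchsize * slot_num + 1;              // 1665 int32 entries
      int64_t value_rows = max_batchsize * max_feature_num_per_sample; // 2496 embedding vectors
      assert(row_offsets == 1665 && value_rows == 2496);
      return 0;
    }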
-
-template struct create_embedding<unsigned int, float>;
-template struct create_embedding<unsigned int, __half>;
-template struct create_embedding<long long, float>;
-template struct create_embedding<long long, __half>;
-}  // namespace HugeCTR
diff --git a/HugeCTR/src/pybind/inference_model.cpp b/HugeCTR/src/pybind/inference_model.cpp
deleted file mode 100644
index a771c9112a..0000000000
--- a/HugeCTR/src/pybind/inference_model.cpp
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-core23::BufferChannel GetInferenceModelBufferChannel() {
-  static auto name = core23::GetRandomBufferChannelName();
-  return core23::BufferChannel(name);
-}
-
-InferenceModel::InferenceModel(const std::string& model_config_path,
-                               const InferenceParams& inference_params)
-    : inference_params_(inference_params),
-      inference_parser_(read_json_file(model_config_path)),
-      resource_manager_(ResourceManagerCore::create({inference_params.deployed_devices}, 0)),
-      global_max_batch_size_(inference_params_.max_batchsize) {
-  HCTR_CHECK_HINT(resource_manager_->get_local_gpu_count() > 0, "deployed_devices cannot be empty");
-  HCTR_CHECK_HINT(global_max_batch_size_ % resource_manager_->get_local_gpu_count() == 0,
-                  "max_batchsize should be divisible by the number of deployed_devices");
-  inference_params_.max_batchsize =
-      global_max_batch_size_ / resource_manager_->get_local_gpu_count();
-  std::vector<std::string> model_config_path_array{model_config_path};
-  std::vector<InferenceParams> inference_params_array{inference_params_};
-  parameter_server_config ps_config{model_config_path_array, inference_params_array};
-  parameter_server_ = HierParameterServerBase::create(ps_config);
-
-  for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) {
-    inference_params_.device_id = resource_manager_->get_local_gpu(i)->get_device_id();
-    CudaDeviceContext context(inference_params_.device_id);
-    auto embedding_cache = parameter_server_->get_embedding_cache(inference_params_.model_name,
-                                                                  inference_params_.device_id);
-    inference_sessions_.emplace_back(new InferenceSession(model_config_path, inference_params_,
-                                                          embedding_cache, resource_manager_));
-    inference_tensor_entries_list_.push_back(
-        inference_sessions_.back()->get_inference_tensor_entries());
-  }
-
-  inference_params_.max_batchsize = global_max_batch_size_;
-
-  size_t batch_size_per_gpu = global_max_batch_size_ / resource_manager_->get_local_gpu_count();
-  old_pred_tensor_list_.resize(resource_manager_->get_local_gpu_count());
-  for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); ++i) {
-    core23::Device gpu(core23::DeviceType::GPU,
-                       resource_manager_->get_local_gpu(i)->get_device_id());
-    core23::TensorParams tensor_params = core23::TensorParams().device(gpu).buffer_channel(
-        HugeCTR::GetInferenceModelBufferChannel());
-    pred_tensor_list_.emplace_back(tensor_params
-                                       .shape({static_cast<int64_t>(batch_size_per_gpu),
-                                               static_cast<int64_t>(inference_parser_.label_dim)})
-                                       .data_type(core23::ScalarType::Float));
-    rowoffset_tensor_list_.emplace_back(
-        tensor_params
-            .shape({static_cast<int64_t>(batch_size_per_gpu * inference_parser_.slot_num +
-                                         inference_parser_.num_embedding_tables)})
-            .data_type(core23::ScalarType::Int32));
-    if (inference_params_.i64_input_key) {
-      key_tensor_list_.emplace_back(
-          tensor_params
-              .shape({static_cast<int64_t>(batch_size_per_gpu *
-                                           inference_parser_.max_feature_num_per_sample)})
-              .data_type(core23::ScalarType::Int64));
-    } else {
-      key_tensor_list_.emplace_back(
-          tensor_params
-              .shape({static_cast<int64_t>(batch_size_per_gpu *
-                                           inference_parser_.max_feature_num_per_sample)})
-              .data_type(core23::ScalarType::UInt32));
-    }
-  }
-}
-
-InferenceModel::~InferenceModel() {
-  for (auto device : resource_manager_->get_local_gpu_device_id_list()) {
-    CudaDeviceContext context(device);
-    HCTR_LIB_CHECK_(cudaDeviceSynchronize());
-  }
-}
-
-void InferenceModel::reset_reader_tensor_list() {
-  reader_label_tensor_list_.clear();
-  reader_dense_tensor_list_.clear();
-  sparse_input_map_32_.clear();
-  sparse_input_map_64_.clear();
-}
-
-void InferenceModel::predict(float* pred_output, const size_t num_batches,
-                             const std::string& source, const DataReaderType_t data_reader_type,
-                             const Check_t check_type,
-                             const std::vector<long long>& slot_size_array,
-                             const DataSourceParams& data_source_params, bool reading_file_seq) {
-  reset_reader_tensor_list();
-  if (inference_params_.i64_input_key) {
-    create_datareader<long long>()(
-        inference_params_, inference_parser_, data_reader_, resource_manager_, sparse_input_map_64_,
-        reader_label_tensor_list_, reader_dense_tensor_list_, source, data_reader_type, check_type,
-        slot_size_array, true, data_source_params, reading_file_seq);  // repeat dataset
-  } else {
-    create_datareader<unsigned int>()(
-        inference_params_, inference_parser_, data_reader_, resource_manager_, sparse_input_map_32_,
-        reader_label_tensor_list_, reader_dense_tensor_list_, source, data_reader_type, check_type,
-        slot_size_array, true, data_source_params, reading_file_seq);  // repeat dataset
-  }
-  tqdm bar;
-  timer_infer.start();
-  for (size_t batch = 0; batch < num_batches; batch++) {
-    current_batch_size_ = data_reader_->read_a_batch_to_device();
-    HCTR_CHECK_HINT(current_batch_size_ == global_max_batch_size_,
-                    "there should not be incomplete batch under the repeat mode");
-    if (inference_params_.i64_input_key) {
-      parse_input_from_data_reader<long long>(sparse_input_map_64_, key_tensor_list_,
-                                              rowoffset_tensor_list_);
-    } else {
-      parse_input_from_data_reader<unsigned int>(sparse_input_map_32_, key_tensor_list_,
-                                                 rowoffset_tensor_list_);
-    }
-
-#pragma omp parallel num_threads(resource_manager_->get_local_gpu_count())
-    {
-      size_t i = omp_get_thread_num();
-      CudaDeviceContext context(resource_manager_->get_local_gpu(i)->get_device_id());
-      long long current_batchsize_per_device =
-          current_batch_size_ / resource_manager_->get_local_gpu_count();
-      if (inference_params_.i64_input_key) {
-        inference_sessions_[i]->predict_from_device(
-            reader_dense_tensor_list_[i].data<float>(), key_tensor_list_[i].data<long long>(),
-            rowoffset_tensor_list_[i].data<int>(), pred_tensor_list_[i].data<float>(),
-            current_batchsize_per_device, true);
-      } else {
-        inference_sessions_[i]->predict_from_device(
-            reader_dense_tensor_list_[i].data<float>(), key_tensor_list_[i].data<unsigned int>(),
-            rowoffset_tensor_list_[i].data<int>(), pred_tensor_list_[i].data<float>(),
-            current_batchsize_per_device, true);
-      }
-      size_t pred_output_offset = (batch * current_batch_size_ + i * current_batchsize_per_device) *
-                                  inference_parser_.label_dim;
-      HCTR_LIB_THROW(cudaMemcpyAsync(
-          pred_output + pred_output_offset, pred_tensor_list_[i].data<float>(),
-          current_batchsize_per_device * inference_parser_.label_dim * sizeof(float),
-          cudaMemcpyDeviceToHost, resource_manager_->get_local_gpu(i)->get_stream()));
-    }
-    bar.progress(batch, num_batches);
-  }
-  for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) {
-    CudaDeviceContext context(resource_manager_->get_local_gpu(i)->get_device_id());
-    HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(i)->get_stream()));
-  }
-  timer_infer.stop();
-  bar.finish();
-  HCTR_LOG_S(INFO, ROOT) << "Inference time for " << num_batches
-                         << " batches: " << timer_infer.elapsedSeconds() << std::endl;
-}
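A worked example, with hypothetical sizes, of the host-side offset computed in predict() above, which lays predictions out batch-major and then GPU-major:

    #include <cassert>
    #include <cstddef>

    int main() {
      std::size_t batch = 3, gpu = 1, global_batch_size = 64, gpus = 2, label_dim = 1;
      std::size_t per_device = global_batch_size / gpus;  // 32 samples per GPU
      std::size_t offset = (batch * global_batch_size + gpu * per_device) * label_dim;
      assert(offset == 3 * 64 + 32);  // GPU 1's slice of the 4th batch starts at element 224
      return 0;
    }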
-
-float InferenceModel::evaluate(const size_t num_batches, const std::string& source,
-                               const DataReaderType_t data_reader_type, const Check_t check_type,
-                               const std::vector<long long>& slot_size_array,
-                               const DataSourceParams& data_source_params, bool reading_file_seq) {
-  auto print_class_aucs = [](const std::vector<float>& class_aucs) {
-    if (class_aucs.size() > 1) {
-      HCTR_LOG_S(INFO, ROOT) << "Evaluation, AUC: {";
-      for (size_t i = 0; i < class_aucs.size(); i++) {
-        if (i > 0) {
-          HCTR_PRINT(INFO, ", ");
-        }
-        HCTR_PRINT(INFO, "%f", class_aucs[i]);
-      }
-      HCTR_PRINT(INFO, "}\n");
-    }
-  };
-
-  size_t batch_size_per_gpu = global_max_batch_size_ / resource_manager_->get_local_gpu_count();
-  metric_.reset(new metrics::AUC<float>(batch_size_per_gpu, num_batches,
                                        inference_parser_.label_dim, resource_manager_));
-
-  reset_reader_tensor_list();
-  if (inference_params_.i64_input_key) {
-    create_datareader<long long>()(
-        inference_params_, inference_parser_, data_reader_, resource_manager_, sparse_input_map_64_,
-        reader_label_tensor_list_, reader_dense_tensor_list_, source, data_reader_type, check_type,
-        slot_size_array, true, data_source_params, reading_file_seq);  // repeat dataset
-  } else {
-    create_datareader<unsigned int>()(
-        inference_params_, inference_parser_, data_reader_, resource_manager_, sparse_input_map_32_,
-        reader_label_tensor_list_, reader_dense_tensor_list_, source, data_reader_type, check_type,
-        slot_size_array, true, data_source_params, reading_file_seq);  // repeat dataset
-  }
-
-  for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) {
-    std::vector<size_t> pred_tensor_shape(pred_tensor_list_[i].shape().dims(), 0);
-    for (size_t j{0}; j < pred_tensor_shape.size(); ++j) {
-      pred_tensor_shape[j] = pred_tensor_list_[i].shape().size(j);
-    }
-    std::shared_ptr<TensorBuffer2> pred_buffer =
-        PreallocatedBuffer2<float>::create(pred_tensor_list_[i].data(), pred_tensor_shape);
-    bind_tensor_to_buffer(pred_tensor_shape, pred_buffer, old_pred_tensor_list_[i]);
-    raw_metrics_map_list_.push_back(
-        {{metrics::RawType::Pred, old_pred_tensor_list_[i]->shrink()},
-         {metrics::RawType::Label,
-          core_helper::convert_core23_tensor_to_tensorbag2(reader_label_tensor_list_[i])}});
-  }
-
-  tqdm bar;
-  timer_infer.start();
-  for (size_t batch = 0; batch < num_batches; batch++) {
-    current_batch_size_ = data_reader_->read_a_batch_to_device();
-    HCTR_CHECK_HINT(current_batch_size_ == global_max_batch_size_,
-                    "there should not be incomplete batch under the repeat mode");
-    metric_->set_current_batch_size(current_batch_size_);
-    if (inference_params_.i64_input_key) {
-      parse_input_from_data_reader<long long>(sparse_input_map_64_, key_tensor_list_,
-                                              rowoffset_tensor_list_);
-    } else {
-      parse_input_from_data_reader<unsigned int>(sparse_input_map_32_, key_tensor_list_,
-                                                 rowoffset_tensor_list_);
-    }
-
-#pragma omp parallel num_threads(resource_manager_->get_local_gpu_count())
-    {
-      size_t i = omp_get_thread_num();
-      CudaDeviceContext context(resource_manager_->get_local_gpu(i)->get_device_id());
-      long long current_batchsize_per_device =
-          current_batch_size_ / resource_manager_->get_local_gpu_count();
-      if (inference_params_.i64_input_key) {
-        inference_sessions_[i]->predict_from_device(
-            reader_dense_tensor_list_[i].data<float>(), key_tensor_list_[i].data<long long>(),
-            rowoffset_tensor_list_[i].data<int>(), pred_tensor_list_[i].data<float>(),
-            current_batchsize_per_device, true);
-      } else {
-        inference_sessions_[i]->predict_from_device(
-            reader_dense_tensor_list_[i].data<float>(), key_tensor_list_[i].data<unsigned int>(),
-            rowoffset_tensor_list_[i].data<int>(), pred_tensor_list_[i].data<float>(),
-            current_batchsize_per_device, true);
-      }
-      metric_->local_reduce(i, raw_metrics_map_list_[i]);
-    }
-    metric_->global_reduce(resource_manager_->get_local_gpu_count());
-    bar.progress(batch, num_batches);
-  }
-  float auc = metric_->finalize_metric();
-  timer_infer.stop();
-  bar.finish();
-  HCTR_LOG_S(INFO, ROOT) << "Inference time for " << num_batches
-                         << " batches: " << timer_infer.elapsedSeconds() << std::endl;
-  print_class_aucs(metric_->get_per_class_metric());
-  return auc;
-}
-
-template <typename TypeKey>
-void InferenceModel::parse_input_from_data_reader(
-    const std::map<std::string, core23_reader::SparseInput<TypeKey>>& sparse_input_map,
-    std::vector<core23::Tensor>& key_tensor_list,
-    std::vector<core23::Tensor>& rowoffset_tensor_list) {
-#pragma omp parallel num_threads(resource_manager_->get_local_gpu_count())
-  {
-    size_t i = omp_get_thread_num();
-    CudaDeviceContext context(resource_manager_->get_local_gpu(i)->get_device_id());
-    size_t current_batch_size_per_gpu =
-        current_batch_size_ / resource_manager_->get_local_gpu_count();
-
-    std::vector<std::vector<TypeKey>> h_reader_rowoffset_list;
-    size_t value_stride = 0;
-    size_t rowoffset_stride = 0;
-    for (size_t j = 0; j < inference_parser_.num_embedding_tables; j++) {
-      size_t rowoffset_start =
-          i * current_batch_size_per_gpu * inference_parser_.slot_num_for_tables[j];
-      size_t rowoffset_length =
-          current_batch_size_per_gpu * inference_parser_.slot_num_for_tables[j] + 1;
-
-      core23_reader::SparseInput<TypeKey> sparse_input;
-      if (!find_item_in_map(sparse_input, inference_parser_.sparse_names[j], sparse_input_map)) {
-        HCTR_OWN_THROW(Error_t::WrongInput, "Cannot find " + inference_parser_.sparse_names[j]);
-      }
-      core23::Tensor& value_tensor = sparse_input.evaluate_sparse_tensors[i].get_value_tensor();
-      core23::Tensor& rowoffset_tensor =
-          sparse_input.evaluate_sparse_tensors[i].get_rowoffset_tensor();
-
-      std::vector<TypeKey> h_reader_rowoffset(rowoffset_length);
-      std::vector<int> h_reader_rowoffset_int(rowoffset_length);
-
-      HCTR_LIB_THROW(cudaMemcpyAsync(h_reader_rowoffset.data(),
                                     rowoffset_tensor.data<TypeKey>() + rowoffset_start,
                                     rowoffset_length * sizeof(TypeKey), cudaMemcpyDeviceToHost,
                                     resource_manager_->get_local_gpu(i)->get_stream()));
-      HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(i)->get_stream()));
-      size_t num_keys = h_reader_rowoffset.back() - h_reader_rowoffset.front();
-      if (inference_params_.i64_input_key) {
-        HCTR_LIB_THROW(cudaMemcpyAsync(key_tensor_list[i].data<TypeKey>() + value_stride,
                                       value_tensor.data<TypeKey>() + h_reader_rowoffset.front(),
                                       num_keys * sizeof(TypeKey), cudaMemcpyDeviceToDevice,
                                       resource_manager_->get_local_gpu(i)->get_stream()));
-      } else {
-        HCTR_LIB_THROW(
            cudaMemcpyAsync(key_tensor_list[i].data<TypeKey>() + value_stride,
                            value_tensor.data<TypeKey>() + h_reader_rowoffset.front(),
                            num_keys * sizeof(TypeKey), cudaMemcpyDeviceToDevice,
                            resource_manager_->get_local_gpu(i)->get_stream()));
-      }
-      TypeKey tmp = h_reader_rowoffset.front();
-      for (auto& entry : h_reader_rowoffset) {
-        entry -= tmp;
-      }
-      h_reader_rowoffset_list.push_back(h_reader_rowoffset);
-      std::transform(h_reader_rowoffset.begin(), h_reader_rowoffset.end(),
                     h_reader_rowoffset_int.begin(),
                     [](int x) { return static_cast<int>(x); });
-      HCTR_LIB_THROW(cudaMemcpyAsync(rowoffset_tensor_list[i].data<int>() + rowoffset_stride,
                                     h_reader_rowoffset_int.data(), rowoffset_length * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     resource_manager_->get_local_gpu(i)->get_stream()));
-      value_stride += num_keys;
-      rowoffset_stride += rowoffset_length;
-    }
-    HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(i)->get_stream()));
-  }
-}
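The re-basing step above exists because a per-GPU slice of the reader's row-offset array generally does not start at zero. A compact sketch:

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<long long> h_reader_rowoffset{100, 103, 103, 107};  // hypothetical slice
      long long tmp = h_reader_rowoffset.front();
      for (auto& entry : h_reader_rowoffset) entry -= tmp;  // re-base to zero
      assert(h_reader_rowoffset.front() == 0 && h_reader_rowoffset.back() == 7);
      return 0;
    }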
-
-std::tuple<size_t, size_t, std::vector<size_t>, int> InferenceModel::get_tensor_info_by_name(
-    const std::string& tensor_name) {
-  auto fn = [](const std::string& tensor_name, const std::vector<TensorEntry>& tensor_entries) {
-    for (int i{0}; i < static_cast<int>(tensor_entries.size()); i++) {
-      if (tensor_entries[i].name == tensor_name) {
-        return i;
-      }
-    }
-    return -1;
-  };
-  const int index = fn(tensor_name, inference_tensor_entries_list_[0]);
-  HCTR_CHECK_HINT(index != -1, "Cannot find tensor with name ", tensor_name);
-
-  size_t local_gpu_count = resource_manager_->get_local_gpu_count();
-  size_t tensor_size_in_bytes = inference_tensor_entries_list_[0][index].bag.get_size_in_bytes();
-  size_t tensor_num_of_elements = get_num_elements_from_dimensions(
      inference_tensor_entries_list_[0][index].bag.get_dimensions());
-  auto dimensions = inference_tensor_entries_list_[0][index].bag.get_dimensions();
-  dimensions[0] *= local_gpu_count;
-  return std::make_tuple(local_gpu_count * tensor_size_in_bytes,
                         local_gpu_count * tensor_num_of_elements, dimensions, index);
-}
-
-void InferenceModel::check_out_tensor(int index, float* local_result) {
-  const int local_gpu_count = resource_manager_->get_local_gpu_count();
-  size_t tensor_size_in_bytes = inference_tensor_entries_list_[0][index].bag.get_size_in_bytes();
-  size_t tensor_num_of_elements = get_num_elements_from_dimensions(
      inference_tensor_entries_list_[0][index].bag.get_dimensions());
-  size_t bytes_per_element = tensor_size_in_bytes / tensor_num_of_elements;
-
-  if (bytes_per_element == 4) {
-    for (int local_gpu_id{}; local_gpu_id < local_gpu_count; ++local_gpu_id) {
-      HCTR_LIB_THROW(cudaMemcpy(local_result + local_gpu_id * tensor_num_of_elements,
                                inference_tensor_entries_list_[local_gpu_id][index].bag.get_ptr(),
                                tensor_size_in_bytes, cudaMemcpyDeviceToHost));
-    }
-  } else {
-    std::unique_ptr<__half[]> local_result_half(
        new __half[local_gpu_count * tensor_num_of_elements]);
-    for (int local_gpu_id{}; local_gpu_id < local_gpu_count; ++local_gpu_id) {
-      HCTR_LIB_THROW(cudaMemcpy(local_result_half.get() + local_gpu_id * tensor_num_of_elements,
                                inference_tensor_entries_list_[local_gpu_id][index].bag.get_ptr(),
                                tensor_size_in_bytes, cudaMemcpyDeviceToHost));
-    }
-    auto transform = [](float* dst_ptr, const __half* src_ptr, size_t num_of_elements) {
-      for (size_t i{0}; i < num_of_elements; ++i) {
-        dst_ptr[i] = static_cast<float>(src_ptr[i]);
-      }
-    };
-    transform(local_result, local_result_half.get(), local_gpu_count * tensor_num_of_elements);
-  }
-}
-
-}  // namespace HugeCTR
diff --git a/HugeCTR/src/pybind/model.cpp b/HugeCTR/src/pybind/model.cpp
index 2cf79de6f3..47323c2e59 100644
--- a/HugeCTR/src/pybind/model.cpp
+++ b/HugeCTR/src/pybind/model.cpp
@@ -1865,8 +1865,8 @@ void Model::fit(int num_epochs, int max_iter, int display, int eval_interval, in
       HCTR_LOG_S(INFO, ROOT) << "Eval Time for " << solver_.max_eval_batches
                              << " iters: " << timer_eval.elapsedSeconds() << "s" << std::endl;
     }
-    if (snapshot > 0 && iter % snapshot == 0 && iter != 0) {
-      this->download_params_to_files(snapshot_prefix, iter);
+    if (snapshot > 0 && (iter + 1) % snapshot == 0 && iter != 0) {
+      this->download_params_to_files(snapshot_prefix, iter + 1);
     }
     iter++;
   } while (data_reader_train_status_);
@@ -2092,8 +2092,8 @@ void Model::fit(int num_epochs, int max_iter, int display, int eval_interval, in
                    float(iter) / max_iter);  // use iteration to calculate it's in which epoch
       }
     }
-    if (snapshot > 0 && iter % snapshot == 0 && iter != 0) {
-      this->download_params_to_files(snapshot_prefix, iter);
+    if (snapshot > 0 && (iter + 1) % snapshot == 0 && iter != 0) {
+      this->download_params_to_files(snapshot_prefix, iter + 1);
     }
   }  // end for iter
   for (auto tc : training_callbacks_) {
diff --git a/HugeCTR/src/pybind/module_main.cpp b/HugeCTR/src/pybind/module_main.cpp
index 30ae9467f6..84adb25a8d 100644
--- a/HugeCTR/src/pybind/module_main.cpp
+++ b/HugeCTR/src/pybind/module_main.cpp
@@ -23,7 +23,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -43,7 +42,6 @@ PYBIND11_MODULE(hugectr, m) {
   LearningRateSchedulerPybind(m);
   OptimizerPybind(m);
   ModelPybind(m);
-  InferencePybind(m);
   EmbeddingCollectionPybind(m);
   HPSPybind(m);
   TrainingCallbackPybind(m);
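The only behavioral change in model.cpp is the snapshot boundary: with a 0-based iteration counter, testing (iter + 1) % snapshot fires after a full snapshot-interval of iterations has completed and names the dump accordingly. A tiny sketch of the effect:

    #include <iostream>

    int main() {
      int snapshot = 1000;
      for (int iter = 0; iter < 3000; iter++) {
        if (snapshot > 0 && (iter + 1) % snapshot == 0 && iter != 0) {
          std::cout << "dump at iteration " << (iter + 1) << '\n';  // 1000, 2000, 3000
        }
      }
      return 0;
    }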
diff --git a/ci/benchmark/hps_memory_check/test.sh b/ci/benchmark/hps_memory_check/test.sh
index 8307c6f716..043169b951 100644
--- a/ci/benchmark/hps_memory_check/test.sh
+++ b/ci/benchmark/hps_memory_check/test.sh
@@ -2,7 +2,7 @@
 python3 ${WORKDIR}/ci/common/generate_inference_config.py --config_template ${WORKDIR}/ci/common/config_pbtxt_template.txt --ps_template ${WORKDIR}/ci/common/ps_template.json --batchsize 64 --mixed_precision false --ec_type dynamic --config_output /model/dlrm/config.pbtxt --ps_output /model/ps.json
-tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hugectr,ps=/model/ps.json > /dev/null 2> /dev/null &
+tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/model/ps.json > /dev/null 2> /dev/null &
 echo > /logs/cpu_dynamic_mem.log
 while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do
   (top -d 1 -n 10 -b | grep triton) >> /logs/cpu_dynamic_mem.log
@@ -11,7 +11,7 @@ kill -s 9 `pgrep tritonserver`
 sleep 10;
 python3 ${WORKDIR}/ci/common/generate_inference_config.py --config_template ${WORKDIR}/ci/common/config_pbtxt_template.txt --ps_template ${WORKDIR}/ci/common/ps_template.json --batchsize 64 --mixed_precision false --ec_type uvm --config_output /model/dlrm/config.pbtxt --ps_output /model/ps.json
-tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hugectr,ps=/model/ps.json > /dev/null 2> /dev/null &
+tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/model/ps.json > /dev/null 2> /dev/null &
 echo > /logs/cpu_uvm_mem.log
 while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do
   (top -d 1 -n 10 -b | grep triton) >> /logs/cpu_uvm_mem.log
@@ -20,7 +20,7 @@ kill -s 9 `pgrep tritonserver`
 sleep 10;
 python3 ${WORKDIR}/ci/common/generate_inference_config.py --config_template ${WORKDIR}/ci/common/config_pbtxt_template.txt --ps_template ${WORKDIR}/ci/common/ps_template.json --batchsize 64 --mixed_precision false --ec_type static --config_output /model/dlrm/config.pbtxt --ps_output /model/ps.json
-tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hugectr,ps=/model/ps.json > /dev/null 2> /dev/null &
+tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/model/ps.json > /dev/null 2> /dev/null &
 echo > /logs/cpu_static_mem.log
 while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do
   (top -d 1 -n 10 -b | grep triton) >> /logs/cpu_static_mem.log
diff --git a/ci/benchmark/inference_benchmark/ci.yml b/ci/benchmark/inference_benchmark/ci.yml
index 6399bfeca2..790b142876 100644
--- a/ci/benchmark/inference_benchmark/ci.yml
+++ b/ci/benchmark/inference_benchmark/ci.yml
@@ -2,7 +2,7 @@
 infernece--256xFP32:
   extends: .inference_benchmark
 infernece--1024xFP32:
   extends: .inference_benchmark
-infernece--8192xFP16:
-  extends: .inference_benchmark
-infernece--131072xFP16:
-  extends: .inference_benchmark
+#infernece--8192xFP16:
+#  extends: .inference_benchmark
+#infernece--131072xFP16:
+#  extends: .inference_benchmark
diff --git a/ci/benchmark/inference_benchmark/test.sh b/ci/benchmark/inference_benchmark/test.sh
index 12d641a5aa..b24f473eea 100644
--- a/ci/benchmark/inference_benchmark/test.sh
+++ b/ci/benchmark/inference_benchmark/test.sh
@@ -2,10 +2,10 @@
 python3 ${WORKDIR}/ci/common/generate_inference_config.py --config_template ${WORKDIR}/ci/common/config_pbtxt_template.txt --ps_template ${WORKDIR}/ci/common/ps_template.json --batchsize ${BZ} --mixed_precision ${MIXED_PRECISION} --config_output /model/dlrm/config.pbtxt --ps_output /model/ps.json
-tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hugectr,ps=/model/ps.json > /dev/null 2> /dev/null &
+tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hps,ps=/model/ps.json > /dev/null 2> /dev/null &
 #tritonserver --model-repository=/model/ --load-model=dlrm --log-verbose=1 --model-control-mode=explicit --backend-directory=/usr/local/hugectr/backends --backend-config=hugectr,ps=/model/ps.json &
 while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do sleep 10; done
-perf_analyzer -m dlrm -u localhost:8000 --input-data /perf_data/${BZ}.json --shape CATCOLUMN:${CATCOLUMN} --shape DES:${DES} --shape ROWINDEX:${ROWINDEX}
+perf_analyzer -m dlrm -u localhost:8000 --input-data /perf_data/${BZ}.json --shape KEYS:${CATCOLUMN} --shape NUMKEYS:1
diff --git a/ci/common/config_pbtxt_template.txt b/ci/common/config_pbtxt_template.txt
index 3d3489c22d..a08fed7538 100644
--- a/ci/common/config_pbtxt_template.txt
+++ b/ci/common/config_pbtxt_template.txt
@@ -1,19 +1,14 @@
 name: "dlrm"
-backend: "hugectr"
+backend: "hps"
 max_batch_size:%%batchsize,
 input [
- { - name: "DES" - data_type: TYPE_FP32 - dims: [ -1 ] - }, { - name: "CATCOLUMN" + name: "KEYS" data_type: TYPE_INT64 dims: [ -1 ] }, { - name: "ROWINDEX" + name: "NUMKEYS" data_type: TYPE_INT32 dims: [ -1 ] } diff --git a/ci/common/generate_inference_config.py b/ci/common/generate_inference_config.py index d5db9c5787..28991fad44 100644 --- a/ci/common/generate_inference_config.py +++ b/ci/common/generate_inference_config.py @@ -29,7 +29,7 @@ def str2bool(v): return v.lower() in ("true") -ps_json_template["models"][0]["max_batch_size"] = args.batchsize +ps_json_template["models"][0]["max_batch_size"] = int(args.batchsize) ps_json_template["models"][0]["mixed_precision"] = str2bool(args.mixed_precision) ps_json_template["models"][0]["embedding_cache_type"] = args.ec_type diff --git a/ci/common/ps_template.json b/ci/common/ps_template.json index ceffb666f7..7743b23773 100644 --- a/ci/common/ps_template.json +++ b/ci/common/ps_template.json @@ -21,18 +21,19 @@ "sparse_files":["/model/dlrm/1/dlrm0_sparse_20000.model"], "dense_file":"/model/dlrm/1/dlrm_dense_20000.model", "network_file":"/model/dlrm/1/dlrm.json", - "num_of_worker_buffer_in_pool": "6", - "num_of_refresher_buffer_in_pool":"1", - "deployed_device_list":["0"], - "max_batch_size":"1", + "num_of_worker_buffer_in_pool": 6, + "num_of_refresher_buffer_in_pool":1, + "deployed_device_list":[0], + "max_batch_size":1, "mixed_precision":true, - "default_value_for_each_table":["0.0"], - "cache_refresh_percentage_per_iteration":"0.2", - "hit_rate_threshold":"1.1", - "gpucacheper":"0.5", + "default_value_for_each_table":[0.0], + "cache_refresh_percentage_per_iteration":0.2, + "hit_rate_threshold":1.1, + "gpucacheper":0.5, "maxnum_catfeature_query_per_table_per_sample":[26], "embedding_vecsize_per_table":[128], - "gpucache":"true" + "gpucache":true } ] } + diff --git a/ci/dracorno/ci.yml b/ci/dracorno/ci.yml index 08242b9141..b96a8ba492 100644 --- a/ci/dracorno/ci.yml +++ b/ci/dracorno/ci.yml @@ -41,18 +41,6 @@ utests_layer_2: # MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT},${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/inference:/hugectr/test/utest/ # TEST_CMD: ./ci/utest/utest_embedding.sub -utests_inference: - extends: .dracorno_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_inference - variables: - GPFSFOLDER: $DRACO_LOGDIR/utests_inference - CONT: $TRAIN_INFER_IMAGE_VERSIONED - MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT},${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/inference:/hugectr/test/utest/ - DGXNNODES: 1 - TEST_CMD: ./ci/utest/utest_inference.sub - criteo: extends: .dracorno_test_job needs: diff --git a/ci/integration_test/inference/inference_model.sub b/ci/integration_test/inference/inference_model.sub deleted file mode 100644 index 893176ae8f..0000000000 --- a/ci/integration_test/inference/inference_model.sub +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - mkdir -p /dump_infer && - cd ${NEW_CRITEO_MOUNT} && - mpirun -np 1 --allow-run-as-root python3 /workdir/test/inference/inference_model/cross_entropy_loss.py && - mpirun -np 1 --allow-run-as-root python3 /workdir/test/inference/inference_model/multi_cross_entropy_loss.py && - mpirun -np 1 --allow-run-as-root python3 /workdir/test/inference/inference_model/dcn_one_hot.py && - mpirun -np 1 --allow-run-as-root python3 /workdir/test/inference/inference_model/wdl_one_hot.py && - mpirun -np 1 
--allow-run-as-root python3 /workdir/test/inference/inference_model/dlrm_mlp_one_hot.py" -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - mkdir -p /dump_infer && - cd ${NEW_CRITEO_MOUNT} && - mpirun -np 1 --allow-run-as-root python3 /workdir/test/inference/inference_model/dcn_multi_hot.py && - mpirun -np 1 --allow-run-as-root python3 /workdir/test/inference/inference_model/wdl_multi_hot.py && - mpirun -np 1 --allow-run-as-root python3 /workdir/test/inference/inference_model/synthetic_multi_hot.py" diff --git a/ci/integration_test/inference/inference_session.sub b/ci/integration_test/inference/inference_session.sub deleted file mode 100755 index a13e33751b..0000000000 --- a/ci/integration_test/inference/inference_session.sub +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /dataset/criteo_kaggle/dcn && \ - python3 /workdir/test/inference/inference_session/dcn_inference.py /workdir/test/scripts/dcn_inference.json DCN /hugectr/test/utest/dcn_csr.txt && \ - python3 /workdir/test/inference/inference_session/wdl_multitable_test.py wdl /hugectr/test/utest/wdl_test_files/wdl_infer.new.json /hugectr/test/utest/wdl_test_files/wdl_dense_2000.model /hugectr/test/utest/wdl_test_files/wdl0_sparse_2000.model,/hugectr/test/utest/wdl_test_files/wdl1_sparse_2000.model /hugectr/test/utest/wdl_test_files/first_ten.csv && \ - python3 /workdir/test/inference/inference_session/movielens_nodense_test.py movielens_hugectr /hugectr/test/utest/movie_test_files/movielens.json /hugectr/test/utest/movie_test_files/_dense_1900.model /hugectr/test/utest/movie_test_files/0_sparse_1900.model /hugectr/test/utest/movie_test_files/test.parquet" diff --git a/ci/selene/ci.yml b/ci/selene/ci.yml index 1ae43a980b..a6ed5d0948 100644 --- a/ci/selene/ci.yml +++ b/ci/selene/ci.yml @@ -75,17 +75,6 @@ utests_hybrid_e2e: MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/ TEST_CMD: ./ci/utest/utest_hybrid_e2e.sub -utests_inference: - extends: .selene_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_inference - variables: - GPFSFOLDER: $LOGDIR/utests_inference - CONT: $TRAIN_INFER_IMAGE_VERSIONED - MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/ - TEST_CMD: ./ci/utest/utest_inference.sub - utests_hps: extends: .selene_test_job needs: @@ -216,18 +205,6 @@ mlperf_generalization: WALLTIME: "00:15:00" TEST_CMD: ./ci/integration_test/mlperf_generalization/overlapped_pipeline.sub -# python interface inference -inference_session: - extends: .selene_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - GPFSFOLDER: $LOGDIR/inference_session - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: ${DATASET}:${DATASET_MOUNT},/lustre/fsw/devtech/hpc-hugectr/inference/:/hugectr/test/utest/ - TEST_CMD: ./ci/integration_test/inference/inference_session.sub - inference_hps: extends: .selene_test_job needs: @@ -312,18 +289,6 @@ ebc_utest_multi_node: DGXNNODES: 2 TEST_CMD: ./ci/integration_test/ebc/utest.multinode.sub -# hugectr inference correctness test -inference_model: - extends: .selene_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - GPFSFOLDER: $LOGDIR/inference_model - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT} - TEST_CMD: 
 # hugectr to onnx converter test
 hugectr2onnx:
   extends: .selene_test_job
diff --git a/ci/template.yml b/ci/template.yml
index 8fc51f28f1..aa60179d87 100644
--- a/ci/template.yml
+++ b/ci/template.yml
@@ -169,9 +169,9 @@ stages:
     - if [[ "$BUILD_HUGECTR2ONNX" == 1 ]]; then
         echo "RUN cd /workdir/onnx_converter && python3 setup.py install" >> ${JOB_DOCKERFILE};
       fi
-    - if [[ "$BUILD_HUGECTR_BACKEND" == 1 ]]; then
-        echo "RUN git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/dl/hugectr/hugectr_inference_backend.git hugectr_inference_backend && cd hugectr_inference_backend && git checkout hugectr_performance_test && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=/usr/local/hugectr -DTRITON_COMMON_REPO_TAG=$TRITON_BRANCH -DTRITON_CORE_REPO_TAG=$TRITON_BRANCH -DTRITON_BACKEND_REPO_TAG=$TRITON_BRANCH .. && make -j\$(nproc) && make install && cd ../.. && rm -rfv hugectr_inference_backend" >> ${JOB_DOCKERFILE};
-      fi
+    #- if [[ "$BUILD_HPS_BACKEND" == 1 ]]; then
+    #    echo "RUN git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/dl/hugectr/hugectr_inference_backend.git hugectr_inference_backend && cd hugectr_inference_backend && git checkout hugectr_performance_test && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=/usr/local/hugectr -DTRITON_COMMON_REPO_TAG=$TRITON_BRANCH -DTRITON_CORE_REPO_TAG=$TRITON_BRANCH -DTRITON_BACKEND_REPO_TAG=$TRITON_BRANCH .. && make -j\$(nproc) && make install && cd ../.. && rm -rfv hugectr_inference_backend" >> ${JOB_DOCKERFILE};
+    #  fi
     - if [[ "$BUILD_HPS_BACKEND" == 1 ]]; then
         echo "RUN git clone --branch $HUGECTR_BACKEND_VER https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/dl/hugectr/hugectr_inference_backend.git hugectr_inference_backend && cd hugectr_inference_backend/hps_backend && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=/usr/local/hugectr -DTRITON_COMMON_REPO_TAG=$TRITON_BRANCH -DTRITON_CORE_REPO_TAG=$TRITON_BRANCH -DTRITON_BACKEND_REPO_TAG=$TRITON_BRANCH .. && make -j\$(nproc) && make install && cd ../../.. && rm -rfv hugectr_inference_backend" >> ${JOB_DOCKERFILE};
         echo "RUN ln -s /usr/local/hugectr/backends/hps /opt/tritonserver/backends/hps" >> ${JOB_DOCKERFILE};
diff --git a/notebooks/README.md b/notebooks/README.md
index f800fe0ae5..756c23d6a2 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -99,7 +99,7 @@ The notebooks are located within the container and can be found in the `/HugeCTR
 Here's a list of notebooks that you can run:
 - [hugectr_e2e_demo_with_nvtabular.ipynb](hugectr_e2e_demo.ipynb): Notebook to preprocess data using NVTabular, train the model with HugeCTR, and do the offline inference with the HugeCTR HPS.
 - [continuous_training.ipynb](continuous_training.ipynb): Notebook to introduce how to deploy continued training with HugeCTR.
-- [multi_gpu_offline_inference.ipynb](multi_gpu_offline_inference.ipynb): Explain how to do multi-GPU offline inference with HugeCTR Python APIs.
+- ~~multi_gpu_offline_inference.ipynb~~: This notebook has been deprecated. Check out [this HPS TRT notebook](hps_trt/notebooks/demo_for_tf_trained_model.ipynb) as an alternative.
 - [hps_demo.ipynb](hps_demo.ipynb): Demonstrate how to utilize HPS Python APIs together with ONNX Runtime APIs to create an ensemble inference model.
 - [training_and_inference_with_remote_filesystem.ipynb](training_and_inference_with_remote_filesystem.ipynb): Demonstrates how to train a model with data that is stored in a remote file system such as Hadoop HDFS and AWS S3.
 
@@ -118,7 +118,6 @@ The specifications of the system on which each notebook can run successfully are
 | ---------------------------------------------------------------------- | ------------------------------------------------------------ | -------------------------------- | ----- | -------------- |
 | [multi-modal-data](multi-modal-data) | Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz<br />512 GB Memory | Tesla V100-SXM2-32GB<br />32 GB Memory | 1 | Vinh Nguyen |
 | [continuous_training.ipynb](continuous_training.ipynb) | Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz<br />512 GB Memory | Tesla V100-SXM2-32GB<br />32 GB Memory | 1 | Xiaolei Shi |
-| [multi_gpu_offline_inference.ipynb](multi_gpu_offline_inference.ipynb) | Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz<br />512 GB Memory | Tesla V100-SXM2-32GB<br />32 GB Memory | 4 | Kingsley Liu |
 | [hps_demo.ipynb](hps_demo.ipynb) | Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz<br />512 GB Memory | Tesla V100-SXM2-32GB<br />32 GB Memory | 1 | Kingsley Liu |
 | [training_with_remote_filesystem.ipynb](training_with_remote_filesystem.ipynb) | Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz<br />512 GB Memory | Tesla V100-SXM2-32GB<br />32 GB Memory | 1 | Jerry Shi |
-| [hugectr_e2e_demo_with_nvtabular.ipynb](hugectr_e2e_demo_with_nvtabular.ipynb) | Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz<br />512 GB Memory | Tesla V100-SXM2-32GB<br />32 GB Memory | 1 | Jerry Shi |
\ No newline at end of file
+| [hugectr_e2e_demo_with_nvtabular.ipynb](hugectr_e2e_demo_with_nvtabular.ipynb) | Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz<br />512 GB Memory | Tesla V100-SXM2-32GB<br />32 GB Memory | 1 | Jerry Shi |
diff --git a/notebooks/multi_gpu_offline_inference.ipynb b/notebooks/multi_gpu_offline_inference.ipynb
deleted file mode 100755
index d2f72b6bb1..0000000000
--- a/notebooks/multi_gpu_offline_inference.ipynb
+++ /dev/null
@@ -1,633 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cdfec37b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
-    "#\n",
-    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-    "# you may not use this file except in compliance with the License.\n",
-    "# You may obtain a copy of the License at\n",
-    "#\n",
-    "# http://www.apache.org/licenses/LICENSE-2.0\n",
-    "#\n",
-    "# Unless required by applicable law or agreed to in writing, software\n",
-    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-    "# See the License for the specific language governing permissions and\n",
-    "# limitations under the License.\n",
-    "# ==============================================================================\n",
-    "\n",
-    "# Each user is responsible for checking the content of datasets and the\n",
-    "# applicable licenses and determining if suitable for the intended use."
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "id": "f75c838b",
-   "metadata": {},
-   "source": [
-    "\n",
-    "\n",
-    "# Multi-GPU Offline Inference"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ea4ae25b",
-   "metadata": {},
-   "source": [
-    "> **Deprecation Warning**: this Notebook is based on the offline inference API `InferenceModel`, which will be deprecated in a future release. Please check out the [Hierarchical Parameter Server](https://nvidia-merlin.github.io/HugeCTR/main/hierarchical_parameter_server/index.html) for alternatives based on TensorFlow and TensorRT.\n",
-    "\n",
-    "## Overview\n",
-    "\n",
-    "In HugeCTR version 3.4.1, we provide Python APIs to perform multi-GPU offline inference.\n",
-    "This work leverages the [HugeCTR Hierarchical Parameter Server](https://nvidia-merlin.github.io/HugeCTR/master/hugectr_core_features.html#hierarchical-parameter-server) and enables concurrent execution on multiple devices.\n",
-    "The `Norm` or `Parquet` dataset format is currently supported by multi-GPU offline inference.\n",
-    "\n",
-    "This notebook explains how to perform multi-GPU offline inference with the HugeCTR Python APIs.\n",
-    "For more details about the API, see the [HugeCTR Python Interface](https://nvidia-merlin.github.io/HugeCTR/master/api/python_interface.html#inference-api) documentation."
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "id": "902f3ef1",
-   "metadata": {},
-   "source": [
-    "## Setup\n",
-    "\n",
-    "To setup the environment, refer to [HugeCTR Example Notebooks](../notebooks) and follow the instructions there before running the following."
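The deprecated `InferenceModel` that this deleted notebook documents treats `max_batchsize` as a global batch size split evenly across the deployed devices (the notebook restates this in a note further down). A minimal sketch of the kind of sanity check that implies, with hypothetical values mirroring the `InferenceParams` arguments used later in the notebook:

```python
# Hypothetical values; mirrors the max_batchsize / deployed_devices
# arguments passed to InferenceParams in the final cell of this notebook.
max_batchsize = 16384
deployed_devices = [0, 1, 2, 3, 4, 5, 6, 7]

# The global batch must divide evenly across the deployed devices.
assert max_batchsize % len(deployed_devices) == 0, (
    "max_batchsize must be divisible by the number of deployed devices"
)
per_device = max_batchsize // len(deployed_devices)
print(f"{len(deployed_devices)} devices x {per_device} samples per device")
```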
- ] - }, - { - "cell_type": "markdown", - "id": "240b78ac", - "metadata": {}, - "source": [ - "## Data Generation\n", - "\n", - "HugeCTR provides a tool to generate synthetic datasets.\n", - "The [Data Generator](https://nvidia-merlin.github.io/HugeCTR/master/api/python_interface.html#data-generator-api) class is capable of generating datasets in different formats and with different distributions.\n", - "We will generate multi-hot Parquet datasets with a power-law distribution for this notebook:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "db37ef07", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[HCTR][08:59:54.134][INFO][RK0][main]: Generate Parquet dataset\n", - "[HCTR][08:59:54.134][INFO][RK0][main]: train data folder: ./multi_hot_parquet, eval data folder: ./multi_hot_parquet, slot_size_array: 10000, 10000, 10000, nnz array: 2, 1, 3, #files for train: 32, #files for eval: 8, #samples per file: 40960, Use power law distribution: 1, alpha of power law: 1.3\n", - "[HCTR][08:59:54.136][INFO][RK0][main]: ./multi_hot_parquet exist\n", - "[HCTR][08:59:54.140][INFO][RK0][main]: ./multi_hot_parquet/train/gen_0.parquet\n", - "[HCTR][08:59:55.615][INFO][RK0][main]: ./multi_hot_parquet/train/gen_1.parquet\n", - "[HCTR][08:59:55.850][INFO][RK0][main]: ./multi_hot_parquet/train/gen_2.parquet\n", - "[HCTR][08:59:56.078][INFO][RK0][main]: ./multi_hot_parquet/train/gen_3.parquet\n", - "[HCTR][08:59:56.311][INFO][RK0][main]: ./multi_hot_parquet/train/gen_4.parquet\n", - "[HCTR][08:59:56.534][INFO][RK0][main]: ./multi_hot_parquet/train/gen_5.parquet\n", - "[HCTR][08:59:56.770][INFO][RK0][main]: ./multi_hot_parquet/train/gen_6.parquet\n", - "[HCTR][08:59:56.959][INFO][RK0][main]: ./multi_hot_parquet/train/gen_7.parquet\n", - "[HCTR][08:59:57.152][INFO][RK0][main]: ./multi_hot_parquet/train/gen_8.parquet\n", - "[HCTR][08:59:57.309][INFO][RK0][main]: ./multi_hot_parquet/train/gen_9.parquet\n", - "[HCTR][08:59:57.496][INFO][RK0][main]: ./multi_hot_parquet/train/gen_10.parquet\n", - "[HCTR][08:59:57.671][INFO][RK0][main]: ./multi_hot_parquet/train/gen_11.parquet\n", - "[HCTR][08:59:57.879][INFO][RK0][main]: ./multi_hot_parquet/train/gen_12.parquet\n", - "[HCTR][08:59:58.069][INFO][RK0][main]: ./multi_hot_parquet/train/gen_13.parquet\n", - "[HCTR][08:59:58.240][INFO][RK0][main]: ./multi_hot_parquet/train/gen_14.parquet\n", - "[HCTR][08:59:58.423][INFO][RK0][main]: ./multi_hot_parquet/train/gen_15.parquet\n", - "[HCTR][08:59:58.619][INFO][RK0][main]: ./multi_hot_parquet/train/gen_16.parquet\n", - "[HCTR][08:59:58.833][INFO][RK0][main]: ./multi_hot_parquet/train/gen_17.parquet\n", - "[HCTR][08:59:59.017][INFO][RK0][main]: ./multi_hot_parquet/train/gen_18.parquet\n", - "[HCTR][08:59:59.176][INFO][RK0][main]: ./multi_hot_parquet/train/gen_19.parquet\n", - "[HCTR][08:59:59.358][INFO][RK0][main]: ./multi_hot_parquet/train/gen_20.parquet\n", - "[HCTR][08:59:59.527][INFO][RK0][main]: ./multi_hot_parquet/train/gen_21.parquet\n", - "[HCTR][08:59:59.722][INFO][RK0][main]: ./multi_hot_parquet/train/gen_22.parquet\n", - "[HCTR][08:59:59.939][INFO][RK0][main]: ./multi_hot_parquet/train/gen_23.parquet\n", - "[HCTR][09:00:00.107][INFO][RK0][main]: ./multi_hot_parquet/train/gen_24.parquet\n", - "[HCTR][09:00:00.294][INFO][RK0][main]: ./multi_hot_parquet/train/gen_25.parquet\n", - "[HCTR][09:00:00.509][INFO][RK0][main]: ./multi_hot_parquet/train/gen_26.parquet\n", - "[HCTR][09:00:00.695][INFO][RK0][main]: ./multi_hot_parquet/train/gen_27.parquet\n", - 
"[HCTR][09:00:00.955][INFO][RK0][main]: ./multi_hot_parquet/train/gen_28.parquet\n", - "[HCTR][09:00:01.190][INFO][RK0][main]: ./multi_hot_parquet/train/gen_29.parquet\n", - "[HCTR][09:00:01.365][INFO][RK0][main]: ./multi_hot_parquet/train/gen_30.parquet\n", - "[HCTR][09:00:01.509][INFO][RK0][main]: ./multi_hot_parquet/train/gen_31.parquet\n", - "[HCTR][09:00:01.698][INFO][RK0][main]: ./multi_hot_parquet/file_list.txt done!\n", - "[HCTR][09:00:01.708][INFO][RK0][main]: ./multi_hot_parquet/val/gen_0.parquet\n", - "[HCTR][09:00:01.895][INFO][RK0][main]: ./multi_hot_parquet/val/gen_1.parquet\n", - "[HCTR][09:00:02.062][INFO][RK0][main]: ./multi_hot_parquet/val/gen_2.parquet\n", - "[HCTR][09:00:02.255][INFO][RK0][main]: ./multi_hot_parquet/val/gen_3.parquet\n", - "[HCTR][09:00:02.472][INFO][RK0][main]: ./multi_hot_parquet/val/gen_4.parquet\n", - "[HCTR][09:00:02.665][INFO][RK0][main]: ./multi_hot_parquet/val/gen_5.parquet\n", - "[HCTR][09:00:02.888][INFO][RK0][main]: ./multi_hot_parquet/val/gen_6.parquet\n", - "[HCTR][09:00:03.110][INFO][RK0][main]: ./multi_hot_parquet/val/gen_7.parquet\n", - "[HCTR][09:00:03.303][INFO][RK0][main]: ./multi_hot_parquet/file_list_test.txt done!\n" - ] - } - ], - "source": [ - "import hugectr\n", - "from hugectr.tools import DataGeneratorParams, DataGenerator\n", - "\n", - "data_generator_params = DataGeneratorParams(\n", - " format = hugectr.DataReaderType_t.Parquet,\n", - " label_dim = 2,\n", - " dense_dim = 2,\n", - " num_slot = 3,\n", - " i64_input_key = True,\n", - " nnz_array = [2, 1, 3],\n", - " source = \"./multi_hot_parquet/file_list.txt\",\n", - " eval_source = \"./multi_hot_parquet/file_list_test.txt\",\n", - " slot_size_array = [10000, 10000, 10000],\n", - " check_type = hugectr.Check_t.Non,\n", - " dist_type = hugectr.Distribution_t.PowerLaw,\n", - " power_law_type = hugectr.PowerLaw_t.Short,\n", - " num_files = 32,\n", - " eval_num_files = 8)\n", - "data_generator = DataGenerator(data_generator_params)\n", - "data_generator.generate()" - ] - }, - { - "cell_type": "markdown", - "id": "19109028", - "metadata": {}, - "source": [ - "## Train from Scratch\n", - "\n", - "We can train from scratch by performing the following steps with Python APIs:\n", - "\n", - "1. Create the solver, reader and optimizer, then initialize the model.\n", - "2. Construct the model graph by adding input, sparse embedding and dense layers in order.\n", - "3. Compile the model and have an overview of the model graph.\n", - "4. Dump the model graph to a JSON file.\n", - "5. Fit the model, save the model weights and optimizer states implicitly.\n", - "6. Dump one batch of evaluation results to files." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "e2b0d9d6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting multi_hot_train.py\n" - ] - } - ], - "source": [ - "%%writefile multi_hot_train.py\n", - "import hugectr\n", - "from mpi4py import MPI\n", - "solver = hugectr.CreateSolver(model_name = \"multi_hot\",\n", - " max_eval_batches = 1,\n", - " batchsize_eval = 131072,\n", - " batchsize = 16384,\n", - " lr = 0.001,\n", - " vvgpu = [[0]],\n", - " i64_input_key = True,\n", - " repeat_dataset = True,\n", - " use_cuda_graph = True)\n", - "reader = hugectr.DataReaderParams(data_reader_type = hugectr.DataReaderType_t.Parquet,\n", - " source = [\"./multi_hot_parquet/file_list.txt\"],\n", - " eval_source = \"./multi_hot_parquet/file_list_test.txt\",\n", - " check_type = hugectr.Check_t.Non,\n", - " slot_size_array = [10000, 10000, 10000])\n", - "optimizer = hugectr.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.Adam)\n", - "model = hugectr.Model(solver, reader, optimizer)\n", - "model.add(hugectr.Input(label_dim = 2, label_name = \"label\",\n", - " dense_dim = 2, dense_name = \"dense\",\n", - " data_reader_sparse_param_array = \n", - " [hugectr.DataReaderSparseParam(\"data1\", [2, 1], False, 2),\n", - " hugectr.DataReaderSparseParam(\"data2\", 3, False, 1),]))\n", - "model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, \n", - " workspace_size_per_gpu_in_mb = 4,\n", - " embedding_vec_size = 16,\n", - " combiner = \"sum\",\n", - " sparse_embedding_name = \"sparse_embedding1\",\n", - " bottom_name = \"data1\",\n", - " optimizer = optimizer))\n", - "model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, \n", - " workspace_size_per_gpu_in_mb = 2,\n", - " embedding_vec_size = 16,\n", - " combiner = \"sum\",\n", - " sparse_embedding_name = \"sparse_embedding2\",\n", - " bottom_name = \"data2\",\n", - " optimizer = optimizer))\n", - "model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,\n", - " bottom_names = [\"sparse_embedding1\"],\n", - " top_names = [\"reshape1\"],\n", - " leading_dim=32)) \n", - "model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,\n", - " bottom_names = [\"sparse_embedding2\"],\n", - " top_names = [\"reshape2\"],\n", - " leading_dim=16)) \n", - "model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,\n", - " bottom_names = [\"reshape1\", \"reshape2\", \"dense\"], top_names = [\"concat1\"]))\n", - "model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,\n", - " bottom_names = [\"concat1\"],\n", - " top_names = [\"fc1\"],\n", - " num_output=1024))\n", - "model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,\n", - " bottom_names = [\"fc1\"],\n", - " top_names = [\"relu1\"]))\n", - "model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,\n", - " bottom_names = [\"relu1\"],\n", - " top_names = [\"fc2\"],\n", - " num_output=2))\n", - "model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.MultiCrossEntropyLoss,\n", - " bottom_names = [\"fc2\", \"label\"],\n", - " top_names = [\"loss\"],\n", - " target_weight_vec = [0.5, 0.5]))\n", - "model.compile()\n", - "model.summary()\n", - "model.graph_to_json(\"multi_hot.json\")\n", - "model.fit(max_iter = 1100, display = 200, eval_interval = 1000, snapshot = 1000, snapshot_prefix = \"multi_hot\")\n", - "model.export_predictions(\"multi_hot_pred_\" + str(1000), 
\"multi_hot_label_\" + str(1000))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d0f29350", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "HugeCTR Version: 3.7\n", - "====================================================Model Init=====================================================\n", - "[HCTR][09:00:10.032][INFO][RK0][main]: Initialize model: multi_hot\n", - "[HCTR][09:00:10.032][INFO][RK0][main]: Global seed is 69819197\n", - "[HCTR][09:00:10.135][INFO][RK0][main]: Device to NUMA mapping:\n", - " GPU 0 -> node 0\n", - "[HCTR][09:00:11.978][WARNING][RK0][main]: Peer-to-peer access cannot be fully enabled.\n", - "[HCTR][09:00:11.978][INFO][RK0][main]: Start all2all warmup\n", - "[HCTR][09:00:11.978][INFO][RK0][main]: End all2all warmup\n", - "[HCTR][09:00:11.979][INFO][RK0][main]: Using All-reduce algorithm: NCCL\n", - "[HCTR][09:00:11.980][INFO][RK0][main]: Device 0: Tesla V100-SXM2-32GB\n", - "[HCTR][09:00:11.985][INFO][RK0][main]: num of DataReader workers for train: 1\n", - "[HCTR][09:00:11.985][INFO][RK0][main]: num of DataReader workers for eval: 1\n", - "[HCTR][09:00:12.176][INFO][RK0][main]: Vocabulary size: 30000\n", - "[HCTR][09:00:12.177][INFO][RK0][main]: max_vocabulary_size_per_gpu_=21845\n", - "[HCTR][09:00:12.179][INFO][RK0][main]: max_vocabulary_size_per_gpu_=10922\n", - "[HCTR][09:00:12.181][INFO][RK0][main]: Graph analysis to resolve tensor dependency\n", - "===================================================Model Compile===================================================\n", - "[HCTR][09:00:43.965][INFO][RK0][main]: gpu0 start to init embedding\n", - "[HCTR][09:00:43.965][INFO][RK0][main]: gpu0 init embedding done\n", - "[HCTR][09:00:43.965][INFO][RK0][main]: gpu0 start to init embedding\n", - "[HCTR][09:00:43.965][INFO][RK0][main]: gpu0 init embedding done\n", - "[HCTR][09:00:43.969][INFO][RK0][main]: Starting AUC NCCL warm-up\n", - "[HCTR][09:00:43.972][INFO][RK0][main]: Warm-up done\n", - "===================================================Model Summary===================================================\n", - "[HCTR][09:00:43.972][INFO][RK0][main]: label Dense Sparse \n", - "label dense data1,data2 \n", - "(None, 2) (None, 2) \n", - "——————————————————————————————————————————————————————————————————————————————————————————————————————————————————\n", - "Layer Type Input Name Output Name Output Shape \n", - "——————————————————————————————————————————————————————————————————————————————————————————————————————————————————\n", - "DistributedSlotSparseEmbeddingHash data1 sparse_embedding1 (None, 2, 16) \n", - "------------------------------------------------------------------------------------------------------------------\n", - "DistributedSlotSparseEmbeddingHash data2 sparse_embedding2 (None, 1, 16) \n", - "------------------------------------------------------------------------------------------------------------------\n", - "Reshape sparse_embedding1 reshape1 (None, 32) \n", - "------------------------------------------------------------------------------------------------------------------\n", - "Reshape sparse_embedding2 reshape2 (None, 16) \n", - "------------------------------------------------------------------------------------------------------------------\n", - "Concat reshape1 concat1 (None, 50) \n", - " reshape2 \n", - " dense \n", - "------------------------------------------------------------------------------------------------------------------\n", - "InnerProduct concat1 fc1 
(None, 1024) \n", - "------------------------------------------------------------------------------------------------------------------\n", - "ReLU fc1 relu1 (None, 1024) \n", - "------------------------------------------------------------------------------------------------------------------\n", - "InnerProduct relu1 fc2 (None, 2) \n", - "------------------------------------------------------------------------------------------------------------------\n", - "MultiCrossEntropyLoss fc2 loss \n", - " label \n", - "------------------------------------------------------------------------------------------------------------------\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Save the model graph to multi_hot.json successfully\n", - "=====================================================Model Fit=====================================================\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Use non-epoch mode with number of iterations: 1100\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Training batchsize: 16384, evaluation batchsize: 131072\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Evaluation interval: 1000, snapshot interval: 1000\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Dense network trainable: True\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Sparse embedding sparse_embedding1 trainable: True\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Sparse embedding sparse_embedding2 trainable: True\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Use mixed precision: False, scaler: 1.000000, use cuda graph: True\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: lr: 0.001000, warmup_steps: 1, end_lr: 0.000000\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: decay_start: 0, decay_steps: 1, decay_power: 2.000000\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Training source file: ./multi_hot_parquet/file_list.txt\n", - "[HCTR][09:00:43.977][INFO][RK0][main]: Evaluation source file: ./multi_hot_parquet/file_list_test.txt\n", - "[HCTR][09:00:46.346][INFO][RK0][main]: Iter: 200 Time(200 iters): 2.36888s Loss: 0.346413 lr:0.001\n", - "[HCTR][09:00:48.421][INFO][RK0][main]: Iter: 400 Time(200 iters): 2.07362s Loss: 0.345891 lr:0.001\n", - "[HCTR][09:00:50.519][INFO][RK0][main]: Iter: 600 Time(200 iters): 2.09809s Loss: 0.345239 lr:0.001\n", - "[HCTR][09:00:52.586][INFO][RK0][main]: Iter: 800 Time(200 iters): 2.06616s Loss: 0.344346 lr:0.001\n", - "[HCTR][09:00:54.656][INFO][RK0][main]: Iter: 1000 Time(200 iters): 2.0697s Loss: 0.343731 lr:0.001\n", - "[HCTR][09:00:54.686][INFO][RK0][main]: Evaluation, AUC: 0.499013\n", - "[HCTR][09:00:54.686][INFO][RK0][main]: Eval Time for 1 iters: 0.006811s\n", - "[HCTR][09:00:54.692][INFO][RK0][main]: Rank0: Write hash table to file\n", - "[HCTR][09:00:54.830][INFO][RK0][main]: Rank0: Write hash table to file\n", - "[HCTR][09:00:54.848][INFO][RK0][main]: Dumping sparse weights to files, successful\n", - "[HCTR][09:00:54.851][INFO][RK0][main]: Rank0: Write optimzer state to file\n", - "[HCTR][09:00:54.852][INFO][RK0][main]: Done\n", - "[HCTR][09:00:54.852][INFO][RK0][main]: Rank0: Write optimzer state to file\n", - "[HCTR][09:00:54.853][INFO][RK0][main]: Done\n", - "[HCTR][09:00:54.886][INFO][RK0][main]: Rank0: Write optimzer state to file\n", - "[HCTR][09:00:54.887][INFO][RK0][main]: Done\n", - "[HCTR][09:00:54.887][INFO][RK0][main]: Rank0: Write optimzer state to file\n", - "[HCTR][09:00:54.888][INFO][RK0][main]: Done\n", - "[HCTR][09:00:54.904][INFO][RK0][main]: Dumping sparse optimzer states to files, successful\n", - "[HCTR][09:00:54.906][INFO][RK0][main]: Dumping dense 
weights to file, successful\n", - "[HCTR][09:00:54.909][INFO][RK0][main]: Dumping dense optimizer states to file, successful\n", - "[HCTR][09:00:55.915][INFO][RK0][main]: Finish 1100 iterations with batchsize: 16384 in 11.94s.\n" - ] - } - ], - "source": [ - "!python3 multi_hot_train.py" - ] - }, - { - "cell_type": "markdown", - "id": "0bb3b86c", - "metadata": {}, - "source": [ - "### Multi-GPU Offline Inference\n", - "\n", - "We can demonstrate multi-GPU offline inference by performing the following steps with Python APIs:\n", - "\n", - "1. Configure the inference hyperparameters.\n", - "2. Initialize the inference model. The model is a collection of inference sessions deployed on multiple devices.\n", - "3. Make an inference from the evaluation dataset.\n", - "4. Check the correctness of the inference by comparing it with the dumped evaluation results.\n", - "\n", - "**Note**: The `max_batchsize` configured within `InferenceParams` is the global batch size.\n", - "The value for `max_batchsize` should be divisible by the number of deployed devices.\n", - "The numpy array returned by `InferenceModel.predict` is of the shape `(max_batchsize * num_batches, label_dim)`." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "8e25d216", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[HCTR][09:01:06.069][WARNING][RK0][main]: default_value_for_each_table.size() is not equal to the number of embedding tables\n", - "[HCTR][09:01:06.072][INFO][RK0][main]: Global seed is 3072588155\n", - "[HCTR][09:01:06.222][INFO][RK0][main]: Device to NUMA mapping:\n", - " GPU 0 -> node 0\n", - " GPU 1 -> node 0\n", - " GPU 2 -> node 0\n", - " GPU 3 -> node 0\n", - " GPU 4 -> node 1\n", - " GPU 5 -> node 1\n", - " GPU 6 -> node 1\n", - " GPU 7 -> node 1\n", - "[HCTR][09:01:23.761][WARNING][RK0][main]: Peer-to-peer access cannot be fully enabled.\n", - "[HCTR][09:01:23.763][INFO][RK0][main]: Start all2all warmup\n", - "[HCTR][09:01:23.996][INFO][RK0][main]: End all2all warmup\n", - "[HCTR][09:01:24.013][INFO][RK0][main]: default_emb_vec_value is not specified using default: 0\n", - "[HCTR][09:01:24.013][INFO][RK0][main]: default_emb_vec_value is not specified using default: 0\n", - "[HCTR][09:01:24.013][INFO][RK0][main]: Creating HashMap CPU database backend...\n", - "[HCTR][09:01:24.013][INFO][RK0][main]: Volatile DB: initial cache rate = 1\n", - "[HCTR][09:01:24.013][INFO][RK0][main]: Volatile DB: cache missed embeddings = 0\n", - "[HCTR][09:01:24.347][INFO][RK0][main]: Table: hps_et.multi_hot.sparse_embedding1; cached 19849 / 19849 embeddings in volatile database (PreallocatedHashMapBackend); load: 19849 / 18446744073709551615 (0.00%).\n", - "[HCTR][09:01:24.622][INFO][RK0][main]: Table: hps_et.multi_hot.sparse_embedding2; cached 9996 / 9996 embeddings in volatile database (PreallocatedHashMapBackend); load: 9996 / 18446744073709551615 (0.00%).\n", - "[HCTR][09:01:24.622][DEBUG][RK0][main]: Real-time subscribers created!\n", - "[HCTR][09:01:24.622][INFO][RK0][main]: Create embedding cache in device 0.\n", - "[HCTR][09:01:24.628][INFO][RK0][main]: Use GPU embedding cache: True, cache size percentage: 0.500000\n", - "[HCTR][09:01:24.628][INFO][RK0][main]: Configured cache hit rate threshold: 1.000000\n", - "[HCTR][09:01:24.628][INFO][RK0][main]: The size of thread pool: 80\n", - "[HCTR][09:01:24.628][INFO][RK0][main]: The size of worker memory pool: 2\n", - "[HCTR][09:01:24.628][INFO][RK0][main]: The size of refresh memory pool: 1\n", - 
"[HCTR][09:01:24.641][INFO][RK0][main]: Create embedding cache in device 1.\n", - "[HCTR][09:01:24.646][INFO][RK0][main]: Use GPU embedding cache: True, cache size percentage: 0.500000\n", - "[HCTR][09:01:24.646][INFO][RK0][main]: Configured cache hit rate threshold: 1.000000\n", - "[HCTR][09:01:24.646][INFO][RK0][main]: The size of thread pool: 80\n", - "[HCTR][09:01:24.646][INFO][RK0][main]: The size of worker memory pool: 2\n", - "[HCTR][09:01:24.646][INFO][RK0][main]: The size of refresh memory pool: 1\n", - "[HCTR][09:01:24.647][INFO][RK0][main]: Create embedding cache in device 2.\n", - "[HCTR][09:01:24.652][INFO][RK0][main]: Use GPU embedding cache: True, cache size percentage: 0.500000\n", - "[HCTR][09:01:24.652][INFO][RK0][main]: Configured cache hit rate threshold: 1.000000\n", - "[HCTR][09:01:24.652][INFO][RK0][main]: The size of thread pool: 80\n", - "[HCTR][09:01:24.652][INFO][RK0][main]: The size of worker memory pool: 2\n", - "[HCTR][09:01:24.652][INFO][RK0][main]: The size of refresh memory pool: 1\n", - "[HCTR][09:01:24.654][INFO][RK0][main]: Create embedding cache in device 3.\n", - "[HCTR][09:01:24.659][INFO][RK0][main]: Use GPU embedding cache: True, cache size percentage: 0.500000\n", - "[HCTR][09:01:24.659][INFO][RK0][main]: Configured cache hit rate threshold: 1.000000\n", - "[HCTR][09:01:24.659][INFO][RK0][main]: The size of thread pool: 80\n", - "[HCTR][09:01:24.659][INFO][RK0][main]: The size of worker memory pool: 2\n", - "[HCTR][09:01:24.659][INFO][RK0][main]: The size of refresh memory pool: 1\n", - "[HCTR][09:01:24.662][INFO][RK0][main]: Create embedding cache in device 4.\n", - "[HCTR][09:01:24.667][INFO][RK0][main]: Use GPU embedding cache: True, cache size percentage: 0.500000\n", - "[HCTR][09:01:24.667][INFO][RK0][main]: Configured cache hit rate threshold: 1.000000\n", - "[HCTR][09:01:24.667][INFO][RK0][main]: The size of thread pool: 80\n", - "[HCTR][09:01:24.667][INFO][RK0][main]: The size of worker memory pool: 2\n", - "[HCTR][09:01:24.667][INFO][RK0][main]: The size of refresh memory pool: 1\n", - "[HCTR][09:01:24.669][INFO][RK0][main]: Create embedding cache in device 5.\n", - "[HCTR][09:01:24.675][INFO][RK0][main]: Use GPU embedding cache: True, cache size percentage: 0.500000\n", - "[HCTR][09:01:24.675][INFO][RK0][main]: Configured cache hit rate threshold: 1.000000\n", - "[HCTR][09:01:24.675][INFO][RK0][main]: The size of thread pool: 80\n", - "[HCTR][09:01:24.675][INFO][RK0][main]: The size of worker memory pool: 2\n", - "[HCTR][09:01:24.675][INFO][RK0][main]: The size of refresh memory pool: 1\n", - "[HCTR][09:01:24.679][INFO][RK0][main]: Create embedding cache in device 6.\n", - "[HCTR][09:01:24.683][INFO][RK0][main]: Use GPU embedding cache: True, cache size percentage: 0.500000\n", - "[HCTR][09:01:24.683][INFO][RK0][main]: Configured cache hit rate threshold: 1.000000\n", - "[HCTR][09:01:24.683][INFO][RK0][main]: The size of thread pool: 80\n", - "[HCTR][09:01:24.683][INFO][RK0][main]: The size of worker memory pool: 2\n", - "[HCTR][09:01:24.683][INFO][RK0][main]: The size of refresh memory pool: 1\n", - "[HCTR][09:01:24.685][INFO][RK0][main]: Create embedding cache in device 7.\n", - "[HCTR][09:01:24.688][INFO][RK0][main]: Use GPU embedding cache: True, cache size percentage: 0.500000\n", - "[HCTR][09:01:24.688][INFO][RK0][main]: Configured cache hit rate threshold: 1.000000\n", - "[HCTR][09:01:24.688][INFO][RK0][main]: The size of thread pool: 80\n", - "[HCTR][09:01:24.688][INFO][RK0][main]: The size of worker memory pool: 2\n", - 
"[HCTR][09:01:24.688][INFO][RK0][main]: The size of refresh memory pool: 1\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: Create inference session on device: 0\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: Model name: multi_hot\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: Use mixed precision: False\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: Use cuda graph: True\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: Max batchsize: 2048\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: Use I64 input key: True\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: start create embedding for inference\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: sparse_input name data1\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: sparse_input name data2\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: create embedding for inference success\n", - "[HCTR][09:01:24.768][INFO][RK0][main]: Inference stage skip MultiCrossEntropyLoss layer, replaced by Sigmoid layer\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: Create inference session on device: 1\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: Model name: multi_hot\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: Use mixed precision: False\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: Use cuda graph: True\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: Max batchsize: 2048\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: Use I64 input key: True\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: start create embedding for inference\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: sparse_input name data1\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: sparse_input name data2\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: create embedding for inference success\n", - "[HCTR][09:01:25.520][INFO][RK0][main]: Inference stage skip MultiCrossEntropyLoss layer, replaced by Sigmoid layer\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: Create inference session on device: 2\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: Model name: multi_hot\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: Use mixed precision: False\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: Use cuda graph: True\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: Max batchsize: 2048\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: Use I64 input key: True\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: start create embedding for inference\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: sparse_input name data1\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: sparse_input name data2\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: create embedding for inference success\n", - "[HCTR][09:01:26.275][INFO][RK0][main]: Inference stage skip MultiCrossEntropyLoss layer, replaced by Sigmoid layer\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: Create inference session on device: 3\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: Model name: multi_hot\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: Use mixed precision: False\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: Use cuda graph: True\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: Max batchsize: 2048\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: Use I64 input key: True\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: start create embedding for inference\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: sparse_input name data1\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: sparse_input name data2\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: create embedding for inference success\n", - "[HCTR][09:01:27.035][INFO][RK0][main]: Inference stage skip MultiCrossEntropyLoss layer, replaced by Sigmoid layer\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: 
Create inference session on device: 4\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: Model name: multi_hot\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: Use mixed precision: False\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: Use cuda graph: True\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: Max batchsize: 2048\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: Use I64 input key: True\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: start create embedding for inference\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: sparse_input name data1\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: sparse_input name data2\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: create embedding for inference success\n", - "[HCTR][09:01:27.781][INFO][RK0][main]: Inference stage skip MultiCrossEntropyLoss layer, replaced by Sigmoid layer\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: Create inference session on device: 5\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: Model name: multi_hot\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: Use mixed precision: False\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: Use cuda graph: True\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: Max batchsize: 2048\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: Use I64 input key: True\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: start create embedding for inference\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: sparse_input name data1\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: sparse_input name data2\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: create embedding for inference success\n", - "[HCTR][09:01:28.534][INFO][RK0][main]: Inference stage skip MultiCrossEntropyLoss layer, replaced by Sigmoid layer\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: Create inference session on device: 6\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: Model name: multi_hot\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: Use mixed precision: False\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: Use cuda graph: True\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: Max batchsize: 2048\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: Use I64 input key: True\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: start create embedding for inference\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: sparse_input name data1\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: sparse_input name data2\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: create embedding for inference success\n", - "[HCTR][09:01:29.291][INFO][RK0][main]: Inference stage skip MultiCrossEntropyLoss layer, replaced by Sigmoid layer\n", - "[HCTR][09:01:30.037][INFO][RK0][main]: Create inference session on device: 7\n", - "[HCTR][09:01:30.037][INFO][RK0][main]: Model name: multi_hot\n", - "[HCTR][09:01:30.037][INFO][RK0][main]: Use mixed precision: False\n", - "[HCTR][09:01:30.037][INFO][RK0][main]: Use cuda graph: True\n", - "[HCTR][09:01:30.037][INFO][RK0][main]: Max batchsize: 2048\n", - "[HCTR][09:01:30.037][INFO][RK0][main]: Use I64 input key: True\n", - "[HCTR][09:01:30.038][INFO][RK0][main]: start create embedding for inference\n", - "[HCTR][09:01:30.038][INFO][RK0][main]: sparse_input name data1\n", - "[HCTR][09:01:30.038][INFO][RK0][main]: sparse_input name data2\n", - "[HCTR][09:01:30.038][INFO][RK0][main]: create embedding for inference success\n", - "[HCTR][09:01:30.038][INFO][RK0][main]: Inference stage skip MultiCrossEntropyLoss layer, replaced by Sigmoid layer\n", - "[HCTR][09:01:30.807][INFO][RK0][main]: Create inference data reader on 8 GPU(s)\n", - "[HCTR][09:01:30.807][INFO][RK0][main]: num of DataReader workers: 8\n", - 
"[HCTR][09:01:30.915][INFO][RK0][main]: Vocabulary size: 30000\n", - "\n", - "[INFO] Inference time for 8 batches: 0.182527\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "pred: [[0.51329887 0.4888402 ]\n", - " [0.55268604 0.62567735]\n", - " [0.48302165 0.5015869 ]\n", - " ...\n", - " [0.52275413 0.46319592]\n", - " [0.46984023 0.5436093 ]\n", - " [0.48216432 0.48920953]]\n", - "grount_truth: [0.513299 0.48884 0.552686 ... 0.543609 0.482164 0.48921 ]\n", - "mse: 8.482603947165404e-14\n" - ] - } - ], - "source": [ - "import hugectr\n", - "from hugectr.inference import InferenceModel, InferenceParams\n", - "import numpy as np\n", - "from mpi4py import MPI\n", - "\n", - "model_config = \"multi_hot.json\"\n", - "inference_params = InferenceParams(\n", - " model_name = \"multi_hot\",\n", - " max_batchsize = 16384,\n", - " hit_rate_threshold = 1.0,\n", - " dense_model_file = \"multi_hot_dense_1000.model\",\n", - " sparse_model_files = [\"multi_hot0_sparse_1000.model\", \"multi_hot1_sparse_1000.model\"],\n", - " deployed_devices = [0, 1, 2, 3, 4, 5, 6, 7],\n", - " use_gpu_embedding_cache = True,\n", - " cache_size_percentage = 0.5,\n", - " i64_input_key = True\n", - ")\n", - "inference_model = InferenceModel(model_config, inference_params)\n", - "pred = inference_model.predict(\n", - " 8,\n", - " \"./multi_hot_parquet/file_list_test.txt\",\n", - " hugectr.DataReaderType_t.Parquet,\n", - " hugectr.Check_t.Non,\n", - " [10000, 10000, 10000]\n", - ")\n", - "grount_truth = np.loadtxt(\"multi_hot_pred_1000\")\n", - "print(\"pred: \", pred)\n", - "print(\"grount_truth: \", grount_truth)\n", - "diff = pred.flatten()-grount_truth\n", - "mse = np.mean(diff*diff)\n", - "print(\"mse: \", mse)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/test/inference/inference_model/cross_entropy_loss.py b/test/inference/inference_model/cross_entropy_loss.py deleted file mode 100644 index 9f0519f5cd..0000000000 --- a/test/inference/inference_model/cross_entropy_loss.py +++ /dev/null @@ -1,187 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import hugectr -from mpi4py import MPI - -solver = hugectr.CreateSolver( - max_eval_batches=1, - batchsize_eval=1024, - batchsize=1024, - lr=0.01, - end_lr=0.0001, - warmup_steps=8000, - decay_start=48000, - decay_steps=24000, - vvgpu=[[0]], - repeat_dataset=True, - i64_input_key=True, -) -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Parquet, - source=["./cross/data/train/_file_list.txt"], - eval_source="./cross/data/test/_file_list.txt", - check_type=hugectr.Check_t.Sum, - slot_size_array=[10001, 10001, 10001, 10001], -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.Adam, - update_type=hugectr.Update_t.Local, - beta1=0.9, - beta2=0.999, - epsilon=0.0000001, -) -model = hugectr.Model(solver, reader, optimizer) -num_gpus = 1 -workspace_size_per_gpu_in_mb = int(40004 * 16 * 4 * 3 / 1000000) + 10 -model.add( - hugectr.Input( - label_dim=2, - label_name="label", - dense_dim=3, - dense_name="dense", - data_reader_sparse_param_array=[ - hugectr.DataReaderSparseParam( - "data1", - [1, 1, 1, 1], - False, - 4, - ) - ], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.LocalizedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=workspace_size_per_gpu_in_mb, - embedding_vec_size=16, - combiner="mean", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dense"], - top_names=["fc1"], - num_output=16, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, - bottom_names=["fc1"], - top_names=["relu1"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["relu1", "sparse_embedding1"], - top_names=["interaction1"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["interaction1"], - top_names=["fc4"], - num_output=32, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, - bottom_names=["fc4"], - top_names=["relu4"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["relu4"], - top_names=["fc8"], - num_output=2, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.CrossEntropyLoss, - bottom_names=["fc8", "label"], - top_names=["loss"], - ) -) -model.compile() -model.summary() -model.graph_to_json(graph_config_file="/dump_infer/cross_entropy_loss.json") -model.fit( - max_iter=1001, - display=100, - eval_interval=1000, - snapshot=1000, - snapshot_prefix="/dump_infer/cross_entropy_loss", -) -model.export_predictions( - "/dump_infer/cross_entropy_loss_pred_" + str(1000), - "/dump_infer/cross_entropy_loss_label_" + str(1000), -) - - -from hugectr.inference import InferenceModel, InferenceParams -import numpy as np - -batch_size = 1024 -num_batches = 1 -inference_params = InferenceParams( - model_name="cross_entropy_loss", - max_batchsize=batch_size, - hit_rate_threshold=1.0, - dense_model_file="/dump_infer/cross_entropy_loss_dense_1000.model", - sparse_model_files=["/dump_infer/cross_entropy_loss0_sparse_1000.model"], - device_id=0, - use_gpu_embedding_cache=True, - cache_size_percentage=0.5, - use_mixed_precision=True, - i64_input_key=True, -) - -inference_model = InferenceModel("/dump_infer/cross_entropy_loss.json", inference_params) - -preds = inference_model.predict( - num_batches=num_batches, - source="./cross/data/test/_file_list.txt", - 
data_reader_type=hugectr.DataReaderType_t.Parquet, - check_type=hugectr.Check_t.Sum, - slot_size_array=[10001, 10001, 10001, 10001], -) - -ground_truth = np.loadtxt("/dump_infer/cross_entropy_loss_pred_1000") -predictions = preds.flatten() -diff = predictions - ground_truth -mse = np.mean(diff * diff) -if mse > 1e-3: - raise RuntimeError( - "Too large mse between cross_entropy_loss inference and training: {}".format(mse) - ) - sys.exit(1) -else: - print( - "cross_entropy_loss inference results are consistent with those during training, mse: {}".format( - mse - ) - ) diff --git a/test/inference/inference_model/dcn_multi_hot.py b/test/inference/inference_model/dcn_multi_hot.py deleted file mode 100644 index eaac8c19f1..0000000000 --- a/test/inference/inference_model/dcn_multi_hot.py +++ /dev/null @@ -1,203 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -solver = hugectr.CreateSolver( - model_name="dcn", - max_eval_batches=1, - batchsize_eval=16384, - batchsize=16384, - lr=0.001, - vvgpu=[[0]], - repeat_dataset=True, - use_mixed_precision=False, - scaler=1.0, - use_cuda_graph=True, - metrics_spec={hugectr.MetricsType.AUC: 1.0}, -) -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Norm, - source=["./dcn_data/file_list.txt"], - eval_source="./dcn_data/file_list_test.txt", - check_type=hugectr.Check_t.Sum, - num_workers=16, -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.Adam, - update_type=hugectr.Update_t.Global, - beta1=0.9, - beta2=0.999, - epsilon=0.0001, -) -model = hugectr.Model(solver, reader, optimizer) -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 2, False, 26)], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=300, - embedding_vec_size=16, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding1"], - top_names=["reshape1"], - leading_dim=416, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, bottom_names=["reshape1", "dense"], top_names=["concat1"] - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Slice, - bottom_names=["concat1"], - top_names=["slice11", "slice12"], - ranges=[(0, 429), (0, 429)], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MultiCross, - bottom_names=["slice11"], - top_names=["multicross1"], - num_layers=1, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["slice12"], - top_names=["fc1"], - num_output=1024, - ) -) -model.add( - hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc1"], 
top_names=["relu1"]) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu1"], - top_names=["dropout1"], - dropout_rate=0.5, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout1"], - top_names=["fc2"], - num_output=1024, - ) -) -model.add( - hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc2"], top_names=["relu2"]) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu2"], - top_names=["dropout2"], - dropout_rate=0.5, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, - bottom_names=["dropout2", "multicross1"], - top_names=["concat2"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat2"], - top_names=["fc3"], - num_output=1, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["fc3", "label"], - top_names=["loss"], - ) -) -model.compile() -model.summary() -model.graph_to_json(graph_config_file="/dump_infer/dcn.json") -model.fit( - max_iter=2300, display=200, eval_interval=2000, snapshot=2000, snapshot_prefix="/dump_infer/dcn" -) -model.export_predictions("/dump_infer/dcn_pred_" + str(2000), "/dump_infer/dcn_label_" + str(2000)) - - -from hugectr.inference import InferenceModel, InferenceParams -import numpy as np - -batch_size = 16384 -num_batches = 1 -data_source = "./dcn_data/file_list_test.txt" -inference_params = InferenceParams( - model_name="dcn", - max_batchsize=batch_size, - hit_rate_threshold=1.0, - dense_model_file="/dump_infer/dcn_dense_2000.model", - sparse_model_files=["/dump_infer/dcn0_sparse_2000.model"], - device_id=0, - use_gpu_embedding_cache=False, - cache_size_percentage=1.0, - i64_input_key=False, - use_mixed_precision=False, - use_cuda_graph=True, -) -inference_model = InferenceModel("/dump_infer/dcn.json", inference_params) -predictions = inference_model.predict( - num_batches=num_batches, - source=data_source, - data_reader_type=hugectr.DataReaderType_t.Norm, - check_type=hugectr.Check_t.Sum, -) -grount_truth = np.loadtxt("/dump_infer/dcn_pred_2000") -diff = predictions.flatten() - grount_truth -mse = np.mean(diff * diff) -if mse > 1e-3: - raise RuntimeError("Too large mse between DCN multi hot inference and training: {}".format(mse)) - sys.exit(1) -else: - print( - "DCN multi hot inference results are consistent with those during training, mse: {}".format( - mse - ) - ) diff --git a/test/inference/inference_model/dcn_one_hot.py b/test/inference/inference_model/dcn_one_hot.py deleted file mode 100644 index 8f89899e12..0000000000 --- a/test/inference/inference_model/dcn_one_hot.py +++ /dev/null @@ -1,203 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import hugectr -from mpi4py import MPI - -solver = hugectr.CreateSolver( - model_name="dcn", - max_eval_batches=1, - batchsize_eval=16384, - batchsize=16384, - lr=0.001, - vvgpu=[[0]], - repeat_dataset=True, - use_mixed_precision=False, - scaler=1.0, - use_cuda_graph=True, - metrics_spec={hugectr.MetricsType.AUC: 1.0}, -) -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Norm, - source=["./dcn_data/file_list.txt"], - eval_source="./dcn_data/file_list_test.txt", - check_type=hugectr.Check_t.Sum, - num_workers=16, -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.Adam, - update_type=hugectr.Update_t.Global, - beta1=0.9, - beta2=0.999, - epsilon=0.0001, -) -model = hugectr.Model(solver, reader, optimizer) -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=300, - embedding_vec_size=16, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding1"], - top_names=["reshape1"], - leading_dim=416, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, bottom_names=["reshape1", "dense"], top_names=["concat1"] - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Slice, - bottom_names=["concat1"], - top_names=["slice11", "slice12"], - ranges=[(0, 429), (0, 429)], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MultiCross, - bottom_names=["slice11"], - top_names=["multicross1"], - num_layers=1, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["slice12"], - top_names=["fc1"], - num_output=1024, - ) -) -model.add( - hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc1"], top_names=["relu1"]) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu1"], - top_names=["dropout1"], - dropout_rate=0.5, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout1"], - top_names=["fc2"], - num_output=1024, - ) -) -model.add( - hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc2"], top_names=["relu2"]) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu2"], - top_names=["dropout2"], - dropout_rate=0.5, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, - bottom_names=["dropout2", "multicross1"], - top_names=["concat2"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat2"], - top_names=["fc3"], - num_output=1, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["fc3", "label"], - top_names=["loss"], - ) -) -model.compile() -model.summary() -model.graph_to_json(graph_config_file="/dump_infer/dcn.json") -model.fit( - max_iter=2300, display=200, eval_interval=2000, snapshot=2000, snapshot_prefix="/dump_infer/dcn" -) -model.export_predictions("/dump_infer/dcn_pred_" + str(2000), "/dump_infer/dcn_label_" + str(2000)) - - -from hugectr.inference import InferenceModel, InferenceParams -import 
numpy as np - -batch_size = 16384 -num_batches = 1 -data_source = "./dcn_data/file_list_test.txt" -inference_params = InferenceParams( - model_name="dcn", - max_batchsize=batch_size, - hit_rate_threshold=1.0, - dense_model_file="/dump_infer/dcn_dense_2000.model", - sparse_model_files=["/dump_infer/dcn0_sparse_2000.model"], - device_id=0, - use_gpu_embedding_cache=False, - cache_size_percentage=1.0, - i64_input_key=False, - use_mixed_precision=False, - use_cuda_graph=True, -) -inference_model = InferenceModel("/dump_infer/dcn.json", inference_params) -predictions = inference_model.predict( - num_batches=num_batches, - source=data_source, - data_reader_type=hugectr.DataReaderType_t.Norm, - check_type=hugectr.Check_t.Sum, -) -grount_truth = np.loadtxt("/dump_infer/dcn_pred_2000") -diff = predictions.flatten() - grount_truth -mse = np.mean(diff * diff) -if mse > 1e-3: - raise RuntimeError("Too large mse between DCN one hot inference and training: {}".format(mse)) - sys.exit(1) -else: - print( - "DCN one hot inference results are consistent with those during training, mse: {}".format( - mse - ) - ) diff --git a/test/inference/inference_model/dlrm_mlp_one_hot.py b/test/inference/inference_model/dlrm_mlp_one_hot.py deleted file mode 100644 index a120bf1af4..0000000000 --- a/test/inference/inference_model/dlrm_mlp_one_hot.py +++ /dev/null @@ -1,158 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import hugectr -from mpi4py import MPI - -solver = hugectr.CreateSolver( - model_name="dcn", - max_eval_batches=1, - batchsize_eval=16384, - batchsize=16384, - lr=0.001, - vvgpu=[[0]], - repeat_dataset=True, - use_mixed_precision=False, - scaler=1.0, - use_cuda_graph=True, - metrics_spec={hugectr.MetricsType.AUC: 1.0}, -) -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Norm, - source=["./dcn_data/file_list.txt"], - eval_source="./dcn_data/file_list_test.txt", - check_type=hugectr.Check_t.Sum, - num_workers=16, -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.Adam, - update_type=hugectr.Update_t.Global, - beta1=0.9, - beta2=0.999, - epsilon=0.0001, -) -model = hugectr.Model(solver, reader, optimizer) -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=2400, - embedding_vec_size=128, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["dense"], - top_names=["mlp1"], - num_outputs=[512, 256, 128], - act_type=hugectr.Activation_t.Relu, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["mlp1", "sparse_embedding1"], - top_names=["interaction1"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["interaction1"], - top_names=["mlp2"], - num_outputs=[1024, 1024, 512, 256, 1], - activations=[ - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Non, - ], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["mlp2", "label"], - top_names=["loss"], - ) -) -model.compile() -model.summary() -model.graph_to_json(graph_config_file="/dump_infer/dlrm.json") -model.fit( - max_iter=2300, - display=200, - eval_interval=2000, - snapshot=2000, - snapshot_prefix="/dump_infer/dlrm", -) -model.export_predictions( - "/dump_infer/dlrm_pred_" + str(2000), "/dump_infer/dlrm_label_" + str(2000) -) - - -from hugectr.inference import InferenceModel, InferenceParams -import numpy as np - -batch_size = 16384 -num_batches = 1 -data_source = "./dcn_data/file_list_test.txt" -inference_params = InferenceParams( - model_name="dlrm", - max_batchsize=batch_size, - hit_rate_threshold=1.0, - dense_model_file="/dump_infer/dlrm_dense_2000.model", - sparse_model_files=["/dump_infer/dlrm0_sparse_2000.model"], - device_id=0, - use_gpu_embedding_cache=False, - cache_size_percentage=1.0, - i64_input_key=False, - use_mixed_precision=False, - use_cuda_graph=True, -) -inference_model = InferenceModel("/dump_infer/dlrm.json", inference_params) -predictions = inference_model.predict( - num_batches=num_batches, - source=data_source, - data_reader_type=hugectr.DataReaderType_t.Norm, - check_type=hugectr.Check_t.Sum, -) -grount_truth = np.loadtxt("/dump_infer/dlrm_pred_2000") -diff = predictions.flatten() - grount_truth -mse = np.mean(diff * diff) -if mse > 1e-3: - raise RuntimeError("Too large mse between DLRM one hot inference and training: {}".format(mse)) - sys.exit(1) -else: - print( - "DLRM one hot inference results 
are consistent with those during training, mse: {}".format( - mse - ) - ) diff --git a/test/inference/inference_model/mmoe_inference.py b/test/inference/inference_model/mmoe_inference.py deleted file mode 100644 index 7d5cc6b0c3..0000000000 --- a/test/inference/inference_model/mmoe_inference.py +++ /dev/null @@ -1,46 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -from hugectr.inference import InferenceParams, CreateInferenceSession -from hugectr.inference import InferenceModel, InferenceParams -import numpy as np - -batch_size = 16384 -num_batches = 1 -data_source = "./mmoe_data/file_names_val.txt" -inference_params = InferenceParams( - model_name="mmoe", - max_batchsize=batch_size, - hit_rate_threshold=1.0, - dense_model_file="./onnx_converter/hugectr_models/mmoe_dense_2000.model", - sparse_model_files=["./onnx_converter/hugectr_models/mmoe0_sparse_2000.model"], - device_id=0, - use_gpu_embedding_cache=False, - cache_size_percentage=1.0, - i64_input_key=False, - use_mixed_precision=False, - use_cuda_graph=True, -) -inference_model = InferenceModel("./onnx_converter/graph_files/mmoe.json", inference_params) -inf_predictions = inference_model.predict( - num_batches=num_batches, - source=data_source, - data_reader_type=hugectr.DataReaderType_t.Parquet, - check_type=hugectr.Check_t.Sum, -) diff --git a/test/inference/inference_model/multi_cross_entropy_loss.py b/test/inference/inference_model/multi_cross_entropy_loss.py deleted file mode 100644 index 1e2b3088ff..0000000000 --- a/test/inference/inference_model/multi_cross_entropy_loss.py +++ /dev/null @@ -1,191 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import hugectr -from mpi4py import MPI - -solver = hugectr.CreateSolver( - max_eval_batches=1, - batchsize_eval=1024, - batchsize=1024, - lr=0.01, - end_lr=0.0001, - warmup_steps=8000, - decay_start=48000, - decay_steps=24000, - vvgpu=[[0]], - repeat_dataset=True, - i64_input_key=True, -) -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Parquet, - source=["./multi_cross/data/train/_file_list.txt"], - eval_source="./multi_cross/data/test/_file_list.txt", - check_type=hugectr.Check_t.Sum, - slot_size_array=[10001, 10001, 10001, 10001], -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.Adam, - update_type=hugectr.Update_t.Local, - beta1=0.9, - beta2=0.999, - epsilon=0.0000001, -) -model = hugectr.Model(solver, reader, optimizer) -num_gpus = 1 -workspace_size_per_gpu_in_mb = int(40004 * 16 * 4 * 3 / 1000000) + 10 -model.add( - hugectr.Input( - label_dim=3, - label_name="label", - dense_dim=3, - dense_name="dense", - data_reader_sparse_param_array=[ - hugectr.DataReaderSparseParam( - "data1", - [1, 1, 1, 1], - False, - 4, - ) - ], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.LocalizedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=workspace_size_per_gpu_in_mb, - embedding_vec_size=16, - combiner="mean", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dense"], - top_names=["fc1"], - num_output=16, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, - bottom_names=["fc1"], - top_names=["relu1"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["relu1", "sparse_embedding1"], - top_names=["interaction1"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["interaction1"], - top_names=["fc4"], - num_output=32, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.ReLU, - bottom_names=["fc4"], - top_names=["relu4"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["relu4"], - top_names=["fc8"], - num_output=3, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MultiCrossEntropyLoss, - bottom_names=["fc8", "label"], - top_names=["loss"], - target_weight_vec=[0.2, 0.4, 0.4], - ) -) -model.compile() -model.summary() -model.graph_to_json(graph_config_file="/dump_infer/multi_cross_entropy_loss.json") - -model.fit( - max_iter=1001, - display=100, - eval_interval=1000, - snapshot=1000, - snapshot_prefix="/dump_infer/multi_cross_entropy_loss", -) - -model.export_predictions( - "/dump_infer/multi_cross_entropy_loss_pred_" + str(1000), - "/dump_infer/multi_cross_entropy_loss_label_" + str(1000), -) - - -from hugectr.inference import InferenceModel, InferenceParams -from mpi4py import MPI -import hugectr -import pandas as pd -import numpy as np - -inference_params = InferenceParams( - model_name="multi_cross_entropy_loss", - max_batchsize=1024, - hit_rate_threshold=1.0, - dense_model_file="/dump_infer/multi_cross_entropy_loss_dense_1000.model", - sparse_model_files=["/dump_infer/multi_cross_entropy_loss0_sparse_1000.model"], - device_id=0, - use_gpu_embedding_cache=True, - cache_size_percentage=0.5, - use_mixed_precision=False, - i64_input_key=True, -) - -inference_model = InferenceModel("/dump_infer/multi_cross_entropy_loss.json", inference_params) - 
-preds = inference_model.predict(
-    num_batches=1,
-    source="./multi_cross/data/test/_file_list.txt",
-    data_reader_type=hugectr.DataReaderType_t.Parquet,
-    check_type=hugectr.Check_t.Sum,
-    slot_size_array=[10001, 10001, 10001, 10001],
-)
-
-ground_truth = np.loadtxt("/dump_infer/multi_cross_entropy_loss_pred_1000")
-predictions = preds.flatten()
-diff = predictions - ground_truth
-mse = np.mean(diff * diff)
-if mse > 1e-3:
-    raise RuntimeError(
-        "Too large mse between multi_cross_entropy_loss inference and training: {}".format(mse)
-    )
-else:
-    print(
-        "multi_cross_entropy_loss inference results are consistent with those during training, mse: {}".format(
-            mse
-        )
-    )
diff --git a/test/inference/inference_model/synthetic_multi_hot.py b/test/inference/inference_model/synthetic_multi_hot.py
deleted file mode 100644
index 3a568e97d3..0000000000
--- a/test/inference/inference_model/synthetic_multi_hot.py
+++ /dev/null
@@ -1,195 +0,0 @@
-"""
- Copyright (c) 2023, NVIDIA CORPORATION.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import hugectr
-from mpi4py import MPI
-
-solver = hugectr.CreateSolver(
-    model_name="multi_hot",
-    max_eval_batches=1,
-    batchsize_eval=16384,
-    batchsize=16384,
-    lr=0.001,
-    vvgpu=[[0, 1, 2, 3]],
-    i64_input_key=True,
-    repeat_dataset=True,
-    use_cuda_graph=True,
-)
-reader = hugectr.DataReaderParams(
-    data_reader_type=hugectr.DataReaderType_t.Parquet,
-    source=["./multi_hot_parquet/file_list.txt"],
-    eval_source="./multi_hot_parquet/file_list_test.txt",
-    check_type=hugectr.Check_t.Non,
-    slot_size_array=[10000, 10000, 10000],
-)
-optimizer = hugectr.CreateOptimizer(optimizer_type=hugectr.Optimizer_t.Adam)
-model = hugectr.Model(solver, reader, optimizer)
-model.add(
-    hugectr.Input(
-        label_dim=2,
-        label_name="label",
-        dense_dim=2,
-        dense_name="dense",
-        data_reader_sparse_param_array=[
-            hugectr.DataReaderSparseParam("data1", [2, 1], False, 2),
-            hugectr.DataReaderSparseParam("data2", 3, False, 1),
-        ],
-    )
-)
-model.add(
-    hugectr.SparseEmbedding(
-        embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
-        workspace_size_per_gpu_in_mb=100,
-        embedding_vec_size=16,
-        combiner="sum",
-        sparse_embedding_name="sparse_embedding1",
-        bottom_name="data1",
-        optimizer=optimizer,
-    )
-)
-model.add(
-    hugectr.SparseEmbedding(
-        embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
-        workspace_size_per_gpu_in_mb=100,
-        embedding_vec_size=16,
-        combiner="sum",
-        sparse_embedding_name="sparse_embedding2",
-        bottom_name="data2",
-        optimizer=optimizer,
-    )
-)
-model.add(
-    hugectr.DenseLayer(
-        layer_type=hugectr.Layer_t.Reshape,
-        bottom_names=["sparse_embedding1"],
-        top_names=["reshape1"],
-        leading_dim=32,
-    )
-)
-model.add(
-    hugectr.DenseLayer(
-        layer_type=hugectr.Layer_t.Reshape,
-        bottom_names=["sparse_embedding2"],
-        top_names=["reshape2"],
-        leading_dim=16,
-    )
-)
-model.add(
-    hugectr.DenseLayer(
-        layer_type=hugectr.Layer_t.Concat,
-        bottom_names=["reshape1", "reshape2", "dense"],
-        top_names=["concat1"],
-    )
-)
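The two DataReaderSparseParam entries above are what make this script multi-hot: the positional arguments are the sparse input name, the nnz per slot (one integer applied to every slot, or a per-slot list), whether every sample carries a fixed number of keys, and the slot count. Repeated here as a standalone sketch for reference:

import hugectr

# "data1" has two slots; slot 0 carries up to 2 keys and slot 1 up to 1 key per sample.
multi_hot_param = hugectr.DataReaderSparseParam("data1", [2, 1], False, 2)
# "data2" has a single slot carrying up to 3 keys per sample.
single_slot_param = hugectr.DataReaderSparseParam("data2", 3, False, 1)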
-model.add(
-    hugectr.DenseLayer(
-        layer_type=hugectr.Layer_t.InnerProduct,
-        bottom_names=["concat1"],
-        top_names=["fc1"],
-        num_output=1024,
-    )
-)
-model.add(
-    hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc1"], top_names=["relu1"])
-)
-model.add(
-    hugectr.DenseLayer(
-        layer_type=hugectr.Layer_t.InnerProduct,
-        bottom_names=["relu1"],
-        top_names=["fc2"],
-        num_output=2,
-    )
-)
-model.add(
-    hugectr.DenseLayer(
-        layer_type=hugectr.Layer_t.MultiCrossEntropyLoss,
-        bottom_names=["fc2", "label"],
-        top_names=["loss"],
-        target_weight_vec=[0.5, 0.5],
-    )
-)
-model.compile()
-model.summary()
-model.graph_to_json("/dump_infer/multi_hot.json")
-model.fit(
-    max_iter=1100,
-    display=200,
-    eval_interval=1000,
-    snapshot=1000,
-    snapshot_prefix="/dump_infer/multi_hot",
-)
-model.export_predictions(
-    "/dump_infer/multi_hot_pred_" + str(1000), "/dump_infer/multi_hot_label_" + str(1000)
-)
-
-sparse_embedding1 = model.check_out_tensor("sparse_embedding1", hugectr.Tensor_t.Evaluate)
-sparse_embedding2 = model.check_out_tensor("sparse_embedding2", hugectr.Tensor_t.Evaluate)
-
-import hugectr
-from hugectr.inference import InferenceModel, InferenceParams
-import numpy as np
-from mpi4py import MPI
-
-model_config = "/dump_infer/multi_hot.json"
-inference_params = InferenceParams(
-    model_name="multi_hot",
-    max_batchsize=16384,
-    hit_rate_threshold=1.0,
-    dense_model_file="/dump_infer/multi_hot_dense_1000.model",
-    sparse_model_files=[
-        "/dump_infer/multi_hot0_sparse_1000.model",
-        "/dump_infer/multi_hot1_sparse_1000.model",
-    ],
-    deployed_devices=[0, 1, 2, 3, 4, 5, 6, 7],
-    use_gpu_embedding_cache=True,
-    cache_size_percentage=0.5,
-    i64_input_key=True,
-)
-inference_model = InferenceModel(model_config, inference_params)
-pred = inference_model.predict(
-    1,
-    "./multi_hot_parquet/file_list_test.txt",
-    hugectr.DataReaderType_t.Parquet,
-    hugectr.Check_t.Non,
-    [10000, 10000, 10000],
-)
-ground_truth = np.loadtxt("/dump_infer/multi_hot_pred_1000")
-print("pred: ", pred)
-print("ground_truth: ", ground_truth)
-diff = pred.flatten() - ground_truth
-mse = np.mean(diff * diff)
-print("mse: ", mse)
-
-inference_sparse_embedding1 = inference_model.check_out_tensor("sparse_embedding1")
-inference_sparse_embedding2 = inference_model.check_out_tensor("sparse_embedding2")
-diff1 = sparse_embedding1.flatten() - inference_sparse_embedding1.flatten()
-diff2 = sparse_embedding2.flatten() - inference_sparse_embedding2.flatten()
-mse1 = np.mean(diff1 * diff1)
-mse2 = np.mean(diff2 * diff2)
-
-if mse > 1e-3 or mse1 > 1e-3 or mse2 > 1e-3:
-    raise RuntimeError(
-        "Too large mse between synthetic multi hot inference and training: {}, {}, {}".format(
-            mse, mse1, mse2
-        )
-    )
-else:
-    print(
-        "Synthetic multi hot inference results are consistent with those during training, mse: {}, {}, {}".format(
-            mse, mse1, mse2
-        )
-    )
diff --git a/test/inference/inference_model/wdl_multi_hot.py b/test/inference/inference_model/wdl_multi_hot.py
deleted file mode 100644
index 327ee69853..0000000000
--- a/test/inference/inference_model/wdl_multi_hot.py
+++ /dev/null
@@ -1,202 +0,0 @@
-"""
- Copyright (c) 2023, NVIDIA CORPORATION.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -solver = hugectr.CreateSolver( - model_name="wdl", - max_eval_batches=1, - batchsize_eval=16384, - batchsize=16384, - lr=0.001, - vvgpu=[[0]], - repeat_dataset=True, -) -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Norm, - source=["./wdl_data/file_list.txt"], - eval_source="./wdl_data/file_list_test.txt", - check_type=hugectr.Check_t.Sum, -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.Adam, - update_type=hugectr.Update_t.Global, - beta1=0.9, - beta2=0.999, - epsilon=0.0000001, -) -model = hugectr.Model(solver, reader, optimizer) -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[ - hugectr.DataReaderSparseParam("wide_data", 30, False, 1), - hugectr.DataReaderSparseParam("deep_data", 2, False, 26), - ], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=69, - embedding_vec_size=1, - combiner="sum", - sparse_embedding_name="sparse_embedding2", - bottom_name="wide_data", - optimizer=optimizer, - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=1074, - embedding_vec_size=16, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="deep_data", - optimizer=optimizer, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding1"], - top_names=["reshape1"], - leading_dim=416, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding2"], - top_names=["reshape2"], - leading_dim=1, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, bottom_names=["reshape1", "dense"], top_names=["concat1"] - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat1"], - top_names=["fc1"], - num_output=1024, - ) -) -model.add( - hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc1"], top_names=["relu1"]) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu1"], - top_names=["dropout1"], - dropout_rate=0.5, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout1"], - top_names=["fc2"], - num_output=1024, - ) -) -model.add( - hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc2"], top_names=["relu2"]) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu2"], - top_names=["dropout2"], - dropout_rate=0.5, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout2"], - top_names=["fc3"], - num_output=1, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Add, bottom_names=["fc3", "reshape2"], top_names=["add1"] - ) -) -model.add( - hugectr.DenseLayer( - 
layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
-        bottom_names=["add1", "label"],
-        top_names=["loss"],
-    )
-)
-model.compile()
-model.summary()
-model.graph_to_json(graph_config_file="/dump_infer/wdl.json")
-model.fit(
-    max_iter=2300, display=200, eval_interval=2000, snapshot=2000, snapshot_prefix="/dump_infer/wdl"
-)
-model.export_predictions("/dump_infer/wdl_pred_" + str(2000), "/dump_infer/wdl_label_" + str(2000))
-
-
-from hugectr.inference import InferenceModel, InferenceParams
-import numpy as np
-
-batch_size = 16384
-num_batches = 1
-data_source = "./wdl_data/file_list_test.txt"
-inference_params = InferenceParams(
-    model_name="wdl",
-    max_batchsize=batch_size,
-    hit_rate_threshold=1.0,
-    dense_model_file="/dump_infer/wdl_dense_2000.model",
-    sparse_model_files=["/dump_infer/wdl0_sparse_2000.model", "/dump_infer/wdl1_sparse_2000.model"],
-    device_id=0,
-    use_gpu_embedding_cache=False,
-    cache_size_percentage=1.0,
-    i64_input_key=False,
-    use_mixed_precision=True,
-    use_cuda_graph=True,
-)
-inference_model = InferenceModel("/dump_infer/wdl.json", inference_params)
-predictions = inference_model.predict(
-    num_batches=num_batches,
-    source=data_source,
-    data_reader_type=hugectr.DataReaderType_t.Norm,
-    check_type=hugectr.Check_t.Sum,
-)
-ground_truth = np.loadtxt("/dump_infer/wdl_pred_2000")
-diff = predictions.flatten() - ground_truth
-mse = np.mean(diff * diff)
-if mse > 1e-3:
-    raise RuntimeError("Too large mse between WDL multi hot inference and training: {}".format(mse))
-else:
-    print(
-        "WDL multi hot inference results are consistent with those during training, mse: {}".format(
-            mse
-        )
-    )
diff --git a/test/inference/inference_model/wdl_one_hot.py b/test/inference/inference_model/wdl_one_hot.py
deleted file mode 100644
index 59de95b0e3..0000000000
--- a/test/inference/inference_model/wdl_one_hot.py
+++ /dev/null
@@ -1,202 +0,0 @@
-"""
- Copyright (c) 2023, NVIDIA CORPORATION.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-""" - -import hugectr -from mpi4py import MPI - -solver = hugectr.CreateSolver( - model_name="wdl", - max_eval_batches=1, - batchsize_eval=16384, - batchsize=16384, - lr=0.001, - vvgpu=[[0]], - repeat_dataset=True, -) -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.Norm, - source=["./wdl_data/file_list.txt"], - eval_source="./wdl_data/file_list_test.txt", - check_type=hugectr.Check_t.Sum, -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.Adam, - update_type=hugectr.Update_t.Global, - beta1=0.9, - beta2=0.999, - epsilon=0.0000001, -) -model = hugectr.Model(solver, reader, optimizer) -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[ - hugectr.DataReaderSparseParam("wide_data", 2, True, 1), - hugectr.DataReaderSparseParam("deep_data", 1, True, 26), - ], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=69, - embedding_vec_size=1, - combiner="sum", - sparse_embedding_name="sparse_embedding2", - bottom_name="wide_data", - optimizer=optimizer, - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, - workspace_size_per_gpu_in_mb=1074, - embedding_vec_size=16, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="deep_data", - optimizer=optimizer, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding1"], - top_names=["reshape1"], - leading_dim=416, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Reshape, - bottom_names=["sparse_embedding2"], - top_names=["reshape2"], - leading_dim=1, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Concat, bottom_names=["reshape1", "dense"], top_names=["concat1"] - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["concat1"], - top_names=["fc1"], - num_output=1024, - ) -) -model.add( - hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc1"], top_names=["relu1"]) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu1"], - top_names=["dropout1"], - dropout_rate=0.5, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout1"], - top_names=["fc2"], - num_output=1024, - ) -) -model.add( - hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU, bottom_names=["fc2"], top_names=["relu2"]) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Dropout, - bottom_names=["relu2"], - top_names=["dropout2"], - dropout_rate=0.5, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.InnerProduct, - bottom_names=["dropout2"], - top_names=["fc3"], - num_output=1, - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Add, bottom_names=["fc3", "reshape2"], top_names=["add1"] - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["add1", "label"], - top_names=["loss"], - ) -) -model.compile() -model.summary() -model.graph_to_json(graph_config_file="/dump_infer/wdl.json") -model.fit( - max_iter=2300, display=200, eval_interval=2000, snapshot=2000, snapshot_prefix="/dump_infer/wdl" -) -model.export_predictions("/dump_infer/wdl_pred_" + str(2000), "/dump_infer/wdl_label_" + str(2000)) - - -from 
hugectr.inference import InferenceModel, InferenceParams
-import numpy as np
-
-batch_size = 16384
-num_batches = 1
-data_source = "./wdl_data/file_list_test.txt"
-inference_params = InferenceParams(
-    model_name="wdl",
-    max_batchsize=batch_size,
-    hit_rate_threshold=1.0,
-    dense_model_file="/dump_infer/wdl_dense_2000.model",
-    sparse_model_files=["/dump_infer/wdl0_sparse_2000.model", "/dump_infer/wdl1_sparse_2000.model"],
-    device_id=0,
-    use_gpu_embedding_cache=False,
-    cache_size_percentage=1.0,
-    i64_input_key=False,
-    use_mixed_precision=True,
-    use_cuda_graph=True,
-)
-inference_model = InferenceModel("/dump_infer/wdl.json", inference_params)
-predictions = inference_model.predict(
-    num_batches=num_batches,
-    source=data_source,
-    data_reader_type=hugectr.DataReaderType_t.Norm,
-    check_type=hugectr.Check_t.Sum,
-)
-ground_truth = np.loadtxt("/dump_infer/wdl_pred_2000")
-diff = predictions.flatten() - ground_truth
-mse = np.mean(diff * diff)
-if mse > 1e-3:
-    raise RuntimeError("Too large mse between WDL one hot inference and training: {}".format(mse))
-else:
-    print(
-        "WDL one hot inference results are consistent with those during training, mse: {}".format(
-            mse
-        )
-    )
diff --git a/test/inference/inference_session/dcn_inference.py b/test/inference/inference_session/dcn_inference.py
deleted file mode 100644
index 4c70bde5d0..0000000000
--- a/test/inference/inference_session/dcn_inference.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""
- Copyright (c) 2023, NVIDIA CORPORATION.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-""" - -import sys -from hugectr.inference import InferenceParams, CreateInferenceSession - - -def dcn_inference(config_file, model_name, data_path, use_gpu_embedding_cache): - # read data from file - data_file = open(data_path) - labels = [int(item) for item in data_file.readline().split(" ")] - dense_features = [float(item) for item in data_file.readline().split(" ")] - embedding_columns = [int(item) for item in data_file.readline().split(" ")] - row_ptrs = [int(item) for item in data_file.readline().split(" ")] - # create parameter server, embedding cache and inference session - inference_params = InferenceParams( - model_name=model_name, - max_batchsize=4096, - hit_rate_threshold=0.6, - dense_model_file="/hugectr/test/utest/_dense_10000.model", - sparse_model_files=["/hugectr/test/utest/0_sparse_10000.model"], - device_id=0, - use_gpu_embedding_cache=use_gpu_embedding_cache, - cache_size_percentage=0.9, - i64_input_key=False, - ) - inference_session = CreateInferenceSession(config_file, inference_params) - # make prediction and calculate accuracy - output = inference_session.predict(dense_features, embedding_columns, row_ptrs) - accuracy = calculate_accuracy(labels, output) - if use_gpu_embedding_cache: - print( - "[HUGECTR][INFO] Use gpu embedding cache, prediction number samples: {}, accuracy: {}".format( - len(labels), accuracy - ) - ) - else: - print( - "[HUGECTR][INFO] Use cpu parameter server, prediction number samples: {}, accuracy: {}".format( - len(labels), accuracy - ) - ) - - -def calculate_accuracy(labels, output): - num_samples = len(labels) - flags = [ - 1 if ((labels[i] == 0 and output[i] <= 0.5) or (labels[i] == 1 and output[i] > 0.5)) else 0 - for i in range(num_samples) - ] - correct_samples = sum(flags) - return float(correct_samples) / float(num_samples) - - -if __name__ == "__main__": - config_file = sys.argv[1] - model_name = sys.argv[2] - data_path = sys.argv[3] - dcn_inference(config_file, model_name, data_path, True) - dcn_inference(config_file, model_name, data_path, False) diff --git a/test/inference/inference_session/movielens_nodense_test.py b/test/inference/inference_session/movielens_nodense_test.py deleted file mode 100644 index 0d5a98fed2..0000000000 --- a/test/inference/inference_session/movielens_nodense_test.py +++ /dev/null @@ -1,110 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -from hugectr.inference import InferenceParams, CreateInferenceSession -import pandas as pd -import numpy as np -import sys - - -# from mpi4py import MPI -def movie_inference( - model_name, network_file, dense_file, embedding_file_list, data_file, enable_cache -): - CATEGORICAL_COLUMNS = ["userId", "movieId"] - LABEL_COLUMNS = ["rating"] - emb_size = [162542, 56586] - shift = np.insert(np.cumsum(emb_size), 0, 0)[:-1] - result = [ - 0.8336379528045654, - 0.24868586659431458, - 0.4039016664028168, - 0.9553083777427673, - 0.6617599725723267, - 0.5613522529602051, - 0.16344544291496277, - 0.537512481212616, - 0.5185080766677856, - 0.2947561740875244, - ] - - test_df = pd.read_parquet(data_file) - config_file = network_file - row_ptrs = list(range(0, 21)) - dense_features = [] - test_df[CATEGORICAL_COLUMNS].astype(np.int64) - embedding_columns = list((test_df.head(10)[CATEGORICAL_COLUMNS] + shift).values.flatten()) - - # create parameter server, embedding cache and inference session - inference_params = InferenceParams( - model_name=model_name, - max_batchsize=64, - hit_rate_threshold=1.0, - dense_model_file=dense_file, - sparse_model_files=embedding_file_list, - device_id=0, - use_gpu_embedding_cache=enable_cache, - cache_size_percentage=0.9, - i64_input_key=True, - use_mixed_precision=False, - ) - inference_session = CreateInferenceSession(config_file, inference_params) - output1 = inference_session.predict(dense_features, embedding_columns, row_ptrs) - miss1 = np.mean((np.array(output1) - np.array(result)) ** 2) - inference_session.refresh_embedding_cache() - output2 = inference_session.predict(dense_features, embedding_columns, row_ptrs) - miss2 = np.mean((np.array(output2) - np.array(result)) ** 2) - print("Movielens model(no dense input) inference result should be {}".format(result)) - miss = max(miss1, miss2) - if enable_cache: - if miss > 0.0001: - raise RuntimeError( - "Movielens model(no dense input) inference using GPU cache, prediction error is greater than threshold: {}, error is {}".format( - 0.0001, miss - ) - ) - sys.exit(1) - else: - print( - "[HUGECTR][INFO] Movielens model(no dense input) inference using GPU cache, prediction error is less than threshold:{}, error is {}".format( - 0.0001, miss - ) - ) - else: - if miss > 0.0001: - raise RuntimeError( - "[HUGECTR][INFO] Movielens model(no dense input) inference without GPU cache, prediction error is greater than threshold:{}, error is {}".format( - 0.0001, miss - ) - ) - sys.exit(1) - else: - print( - "[HUGECTR][INFO] Movielens model(no dense input) inference without GPU cache, prediction error is less than threshold: {}, error is {}".format( - 0.0001, miss - ) - ) - - -if __name__ == "__main__": - model_name = sys.argv[1] - network_file = sys.argv[2] - dense_file = sys.argv[3] - embedding_file_list = str(sys.argv[4]).split(",") - print(embedding_file_list) - data_file = sys.argv[5] - movie_inference(model_name, network_file, dense_file, embedding_file_list, data_file, True) - movie_inference(model_name, network_file, dense_file, embedding_file_list, data_file, False) diff --git a/test/inference/inference_session/wdl_multitable_test.py b/test/inference/inference_session/wdl_multitable_test.py deleted file mode 100755 index 6b7a18ba7a..0000000000 --- a/test/inference/inference_session/wdl_multitable_test.py +++ /dev/null @@ -1,157 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import hugectr
-from hugectr.inference import InferenceParams, CreateInferenceSession
-import pandas as pd
-import numpy as np
-import sys
-
-
-# from mpi4py import MPI
-def wdl_inference(
-    model_name, network_file, dense_file, embedding_file_list, data_file, enable_cache
-):
-    CATEGORICAL_COLUMNS = ["C" + str(x) for x in range(1, 27)] + ["C1_C2", "C3_C4"]
-    CONTINUOUS_COLUMNS = ["I" + str(x) for x in range(1, 14)]
-    LABEL_COLUMNS = ["label"]
-    emb_size = [
-        202546,
-        18795,
-        14099,
-        6889,
-        18577,
-        4,
-        6349,
-        1247,
-        48,
-        186730,
-        71084,
-        66832,
-        11,
-        2158,
-        7415,
-        61,
-        4,
-        923,
-        15,
-        202617,
-        143251,
-        198823,
-        61025,
-        9057,
-        73,
-        34,
-        225812,
-        354963,
-    ]
-    shift = np.insert(np.cumsum(emb_size), 0, 0)[:-1]
-    result = [
-        0.05634006857872009,
-        0.04185676947236061,
-        0.007268941029906273,
-        0.10255379974842072,
-        0.14059557020664215,
-        0.011040309444069862,
-        0.005499477963894606,
-        0.24404558539390564,
-        0.012491216883063316,
-        0.005486942362040281,
-    ]
-
-    test_df = pd.read_csv(data_file)
-    config_file = network_file
-    row_ptrs = list(range(0, 21)) + list(range(0, 261))
-    dense_features = list(test_df[CONTINUOUS_COLUMNS].values.flatten())
-    test_df[CATEGORICAL_COLUMNS] = test_df[CATEGORICAL_COLUMNS].astype(np.int64)
-    embedding_columns = list((test_df[CATEGORICAL_COLUMNS] + shift).values.flatten())
-
-    hash_map_database = hugectr.inference.VolatileDatabaseParams()
-    rocksdb_database = hugectr.inference.PersistentDatabaseParams(
-        path="/hugectr/test/utest/wdl_test_files/rocksdb"
-    )
-
-    # create parameter server, embedding cache and inference session
-    inference_params = InferenceParams(
-        model_name=model_name,
-        max_batchsize=64,
-        hit_rate_threshold=1.0,
-        dense_model_file=dense_file,
-        sparse_model_files=embedding_file_list,
-        device_id=0,
-        use_gpu_embedding_cache=enable_cache,
-        cache_size_percentage=0.9,
-        i64_input_key=True,
-        use_mixed_precision=False,
-        number_of_worker_buffers_in_pool=4,
-        number_of_refresh_buffers_in_pool=1,
-        deployed_devices=[0],
-        default_value_for_each_table=[0.0, 0.0],
-        volatile_db=hash_map_database,
-        persistent_db=rocksdb_database,
-        cache_refresh_percentage_per_iteration=0.1,
-    )
-    inference_session = CreateInferenceSession(config_file, inference_params)
-    # predict for the first time
-    output1 = inference_session.predict(dense_features, embedding_columns, row_ptrs)
-    miss1 = np.mean((np.array(output1) - np.array(result)) ** 2)
-    # refresh embedding cache; a no-op here since there is no update for the parameter server
-    inference_session.refresh_embedding_cache()
-    # predict for the second time
-    output2 = inference_session.predict(dense_features, embedding_columns, row_ptrs)
-    miss2 = np.mean((np.array(output2) - np.array(result)) ** 2)
-    print("WDL multi-embedding table inference result should be {}".format(result))
-    miss = max(miss1, miss2)
-    if enable_cache:
-        if miss > 0.0001:
-            raise RuntimeError(
-                "WDL multi-embedding table inference using GPU cache, prediction error is greater than threshold: {}, error is {}".format(
-                    0.0001, miss
-                )
-            )
-        else:
-            print(
-                "[HUGECTR][INFO] WDL multi-embedding table inference using GPU 
cache, prediction error is less than threshold:{}, error is {}".format( - 0.0001, miss - ) - ) - else: - if miss > 0.0001: - raise RuntimeError( - "[HUGECTR][INFO] WDL multi-embedding table inference without GPU cache, prediction error is greater than threshold:{}, error is {}".format( - 0.0001, miss - ) - ) - sys.exit(1) - else: - print( - "[HUGECTR][INFO] WDL multi-embedding table inference without GPU cache, prediction error is less than threshold: {}, error is {}".format( - 0.0001, miss - ) - ) - - -if __name__ == "__main__": - model_name = sys.argv[1] - network_file = sys.argv[2] - dense_file = sys.argv[3] - embedding_file_list = str(sys.argv[4]).split(",") - print(embedding_file_list) - data_file = sys.argv[5] - # wdl_inference(model_name, network_file, dense_file, embedding_file_list, data_file, True, hugectr.Database_t.RocksDB) - wdl_inference(model_name, network_file, dense_file, embedding_file_list, data_file, True) - wdl_inference(model_name, network_file, dense_file, embedding_file_list, data_file, False) diff --git a/test/notebook_test/notebook_hugectr.py b/test/notebook_test/notebook_hugectr.py index 0bbe3b288c..d0a5c59eaa 100755 --- a/test/notebook_test/notebook_hugectr.py +++ b/test/notebook_test/notebook_hugectr.py @@ -33,17 +33,6 @@ TEST_PATH = dirname(dirname(realpath(__file__))) -@pytest.mark.skipif(hugectr is None, reason="hugectr not installed") -def test_multi_gpu_offline_inference(): - notebook = os.path.join(dirname(TEST_PATH), "notebooks/multi_gpu_offline_inference.ipynb") - with testbook( - notebook, - execute=False, - timeout=3600, - ) as nb: - nb.execute_cell(list(range(0, len(nb.cells)))) - - def test_prototype_indices(): notebook = os.path.join(dirname(TEST_PATH), "notebooks/prototype_indices.ipynb") with testbook( diff --git a/test/onnx_converter_test/hugectr2onnx_dcn_test.py b/test/onnx_converter_test/hugectr2onnx_dcn_test.py index c6abde6614..3e3791ab7a 100644 --- a/test/onnx_converter_test/hugectr2onnx_dcn_test.py +++ b/test/onnx_converter_test/hugectr2onnx_dcn_test.py @@ -15,7 +15,6 @@ """ import hugectr -from hugectr.inference import InferenceParams, CreateInferenceSession import hugectr2onnx import onnxruntime as ort from utils import read_samples_for_dcn, compare_array_approx @@ -32,6 +31,7 @@ def hugectr2onnx_dcn_test( sparse_models, onnx_model_path, model_name, + ground_truth, ntp_file="", ): hugectr2onnx.converter.convert( @@ -47,22 +47,7 @@ def hugectr2onnx_dcn_test( batch_size * num_batches, ) - inference_params = InferenceParams( - model_name=model_name, - max_batchsize=batch_size, - hit_rate_threshold=1, - dense_model_file=dense_model, - sparse_model_files=sparse_models, - device_id=0, - use_gpu_embedding_cache=True, - cache_size_percentage=0.6, - i64_input_key=False, - non_trainable_params_file=ntp_file, - ) - inference_session = CreateInferenceSession(graph_config, inference_params) - predictions = inference_session.predict( - num_batches, data_source, hugectr.DataReaderType_t.Norm, hugectr.Check_t.Sum - ) + predictions = np.load(ground_truth).reshape(batch_size * num_batches) compare_array_approx(res, predictions, model_name, 1e-3, 1e-2) @@ -77,6 +62,7 @@ def hugectr2onnx_dcn_test( ["/onnx_converter/hugectr_models/dcn0_sparse_2000.model"], "/onnx_converter/onnx_models/dcn.onnx", "dcn", + "/onnx_converter/hugectr_models/dcn_preds.npy", ) hugectr2onnx_dcn_test( 64, @@ -88,6 +74,7 @@ def hugectr2onnx_dcn_test( ["/onnx_converter/hugectr_models/deepfm0_sparse_2000.model"], "/onnx_converter/onnx_models/deepfm.onnx", "deepfm", + 
"/onnx_converter/hugectr_models/deepfm_preds.npy", ) hugectr2onnx_dcn_test( 64, @@ -99,6 +86,7 @@ def hugectr2onnx_dcn_test( ["/onnx_converter/hugectr_models/dlrm0_sparse_2000.model"], "/onnx_converter/onnx_models/dlrm.onnx", "dlrm", + "/onnx_converter/hugectr_models/dlrm_preds.npy", "/onnx_converter/hugectr_models/dlrm_dense_2000.model.ntp.json", ) hugectr2onnx_dcn_test( @@ -111,4 +99,5 @@ def hugectr2onnx_dcn_test( ["/onnx_converter/hugectr_models/dlrm_mlp0_sparse_2000.model"], "/onnx_converter/onnx_models/dlrm_mlp.onnx", "dlrm_mlp", + "/onnx_converter/hugectr_models/dlrm_mlp_preds.npy", ) diff --git a/test/onnx_converter_test/hugectr2onnx_din_test.py b/test/onnx_converter_test/hugectr2onnx_din_test.py index 9a9a0b1e60..a239f4cf78 100644 --- a/test/onnx_converter_test/hugectr2onnx_din_test.py +++ b/test/onnx_converter_test/hugectr2onnx_din_test.py @@ -15,7 +15,6 @@ """ import hugectr -from hugectr.inference import InferenceParams, CreateInferenceSession import hugectr2onnx import onnxruntime as ort from utils import read_samples_for_din, compare_array_approx @@ -32,6 +31,7 @@ def hugectr2onnx_din_test( sparse_models, onnx_model_path, model_name, + ground_truth, ): hugectr2onnx.converter.convert(onnx_model_path, graph_config, dense_model, True, sparse_models) dense, user, good, cate = read_samples_for_din(data_file, batch_size * num_batches, slot_num=23) @@ -49,50 +49,7 @@ def hugectr2onnx_din_test( batch_size * num_batches, ) - inference_params = InferenceParams( - model_name=model_name, - max_batchsize=batch_size, - hit_rate_threshold=1, - dense_model_file=dense_model, - sparse_model_files=sparse_models, - device_id=0, - use_gpu_embedding_cache=True, - cache_size_percentage=0.6, - i64_input_key=True, - ) - inference_session = CreateInferenceSession(graph_config, inference_params) - slot_size_array = [ - 192403, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 63001, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 801, - ] - predictions = inference_session.predict( - num_batches, - data_source, - hugectr.DataReaderType_t.Parquet, - hugectr.Check_t.Non, - slot_size_array, - ) + predictions = np.load(ground_truth).reshape(batch_size * num_batches) compare_array_approx(res, predictions, model_name, 1e-2, 1e-1) @@ -112,6 +69,7 @@ def hugectr2onnx_din_test( ], "/onnx_converter/onnx_models/din.onnx", "din", + "/onnx_converter/hugectr_models/din_preds.npy", ) hugectr2onnx_din_test( 64, @@ -127,4 +85,5 @@ def hugectr2onnx_din_test( ], "/onnx_converter/onnx_models/din_try.onnx", "din_try", + "/onnx_converter/hugectr_models/din_try_preds.npy", ) diff --git a/test/onnx_converter_test/hugectr2onnx_mmoe_test.py b/test/onnx_converter_test/hugectr2onnx_mmoe_test.py index edaf2b20d6..b3e6638d56 100644 --- a/test/onnx_converter_test/hugectr2onnx_mmoe_test.py +++ b/test/onnx_converter_test/hugectr2onnx_mmoe_test.py @@ -17,16 +17,12 @@ import hugectr from mpi4py import MPI -from hugectr.inference import InferenceParams, CreateInferenceSession import hugectr2onnx import onnxruntime as ort from utils import read_samples_for_mmoe, compare_array_approx import numpy as np -from hugectr.inference import InferenceModel, InferenceParams -import numpy as np - - +ground_truth = "/onnx_converter/hugectr_models/mmoe_parquet_preds.npy" graph_config = "/onnx_converter/graph_files/mmoe.json" dense_model = "/onnx_converter/hugectr_models/mmoe_dense_2000.model" sparse_models = ["/onnx_converter/hugectr_models/mmoe0_sparse_2000.model"] @@ -85,26 +81,8 @@ onnx_preds = np.concatenate((preds0, preds1), axis=1) 
print("onnx_preds.shape: ", onnx_preds.shape) -inference_params = InferenceParams( - model_name="mmoe", - max_batchsize=batch_size, - hit_rate_threshold=1.0, - dense_model_file="/onnx_converter/hugectr_models/mmoe_dense_2000.model", - sparse_model_files=["/onnx_converter/hugectr_models/mmoe0_sparse_2000.model"], - device_id=0, - use_gpu_embedding_cache=False, - cache_size_percentage=1.0, - i64_input_key=False, - use_mixed_precision=False, - use_cuda_graph=True, -) -inference_session = CreateInferenceSession( - "/onnx_converter/graph_files/mmoe.json", inference_params -) +predictions = np.load(ground_truth).reshape(batch_size * num_batches, 2) -predictions = inference_session.predict( - num_batches, data_source, hugectr.DataReaderType_t.Parquet, hugectr.Check_t.Non, slot_size_array -) print("predictions.shape: ", predictions.shape) compare_array_approx(onnx_preds, predictions, "mmoe", 1e-2, 1e-1) diff --git a/test/onnx_converter_test/hugectr2onnx_ncf_test.py b/test/onnx_converter_test/hugectr2onnx_ncf_test.py index c3259c9f0a..0db2d8db3b 100644 --- a/test/onnx_converter_test/hugectr2onnx_ncf_test.py +++ b/test/onnx_converter_test/hugectr2onnx_ncf_test.py @@ -15,7 +15,6 @@ """ import hugectr -from hugectr.inference import InferenceParams, CreateInferenceSession import hugectr2onnx import onnxruntime as ort from utils import read_samples_for_ncf, compare_array_approx @@ -32,6 +31,7 @@ def hugectr2onnx_ncf_test( sparse_models, onnx_model_path, model_name, + ground_truth, ): hugectr2onnx.converter.convert(onnx_model_path, graph_config, dense_model, True, sparse_models) label, dense, keys = read_samples_for_ncf(data_file, batch_size * num_batches, slot_num=2) @@ -44,22 +44,7 @@ def hugectr2onnx_ncf_test( batch_size * num_batches, ) - inference_params = InferenceParams( - model_name=model_name, - max_batchsize=batch_size, - hit_rate_threshold=1, - dense_model_file=dense_model, - sparse_model_files=sparse_models, - device_id=0, - use_gpu_embedding_cache=True, - cache_size_percentage=0.6, - i64_input_key=False, - ) - inference_session = CreateInferenceSession(graph_config, inference_params) - predictions = inference_session.predict( - num_batches, data_source, hugectr.DataReaderType_t.Norm, hugectr.Check_t.Non - ) - + predictions = np.load(ground_truth).reshape(batch_size * num_batches) compare_array_approx(res, predictions, model_name, 1e-3, 1e-2) @@ -74,6 +59,7 @@ def hugectr2onnx_ncf_test( ["/onnx_converter/hugectr_models/ncf0_sparse_2000.model"], "/onnx_converter/onnx_models/ncf.onnx", "ncf", + "/onnx_converter/hugectr_models/ncf_preds.npy", ) hugectr2onnx_ncf_test( 64, @@ -85,6 +71,7 @@ def hugectr2onnx_ncf_test( ["/onnx_converter/hugectr_models/gmf0_sparse_2000.model"], "/onnx_converter/onnx_models/gmf.onnx", "gmf", + "/onnx_converter/hugectr_models/gmf_preds.npy", ) hugectr2onnx_ncf_test( 64, @@ -96,4 +83,5 @@ def hugectr2onnx_ncf_test( ["/onnx_converter/hugectr_models/neumf0_sparse_2000.model"], "/onnx_converter/onnx_models/neumf.onnx", "neumf", + "/onnx_converter/hugectr_models/neumf_preds.npy", ) diff --git a/test/onnx_converter_test/hugectr2onnx_wdl_test.py b/test/onnx_converter_test/hugectr2onnx_wdl_test.py index 58cd56298b..383a31b4cf 100644 --- a/test/onnx_converter_test/hugectr2onnx_wdl_test.py +++ b/test/onnx_converter_test/hugectr2onnx_wdl_test.py @@ -15,7 +15,6 @@ """ import hugectr -from hugectr.inference import InferenceParams, CreateInferenceSession import hugectr2onnx import onnxruntime as ort from utils import read_samples_for_wdl, compare_array_approx @@ -32,6 +31,7 @@ def 
hugectr2onnx_wdl_test( sparse_models, onnx_model_path, model_name, + ground_truth, ): hugectr2onnx.converter.convert(onnx_model_path, graph_config, dense_model, True, sparse_models) label, dense, wide_data, deep_data = read_samples_for_wdl( @@ -50,22 +50,7 @@ def hugectr2onnx_wdl_test( batch_size * num_batches, ) - inference_params = InferenceParams( - model_name=model_name, - max_batchsize=batch_size, - hit_rate_threshold=1, - dense_model_file=dense_model, - sparse_model_files=sparse_models, - device_id=0, - use_gpu_embedding_cache=True, - cache_size_percentage=0.6, - i64_input_key=False, - ) - inference_session = CreateInferenceSession(graph_config, inference_params) - predictions = inference_session.predict( - num_batches, data_source, hugectr.DataReaderType_t.Norm, hugectr.Check_t.Sum - ) - + predictions = np.load(ground_truth).reshape(batch_size * num_batches) compare_array_approx(res, predictions, model_name, 1e-3, 1e-2) @@ -83,4 +68,5 @@ def hugectr2onnx_wdl_test( ], "/onnx_converter/onnx_models/wdl.onnx", "wdl", + "/onnx_converter/hugectr_models/wdl_preds.npy", ) diff --git a/test/onnx_converter_test/train_scripts/dcn.py b/test/onnx_converter_test/train_scripts/dcn.py index e774f96f7b..4e5c63b780 100644 --- a/test/onnx_converter_test/train_scripts/dcn.py +++ b/test/onnx_converter_test/train_scripts/dcn.py @@ -18,8 +18,8 @@ from mpi4py import MPI solver = hugectr.CreateSolver( - max_eval_batches=300, - batchsize_eval=16384, + max_eval_batches=1, + batchsize_eval=6400, batchsize=16384, lr=0.001, vvgpu=[[0]], @@ -154,7 +154,12 @@ model.fit( max_iter=2300, display=200, - eval_interval=1000, + eval_interval=2000, snapshot=2000, snapshot_prefix="/onnx_converter/hugectr_models/dcn", ) + +import numpy as np + +preds = model.check_out_tensor("fc3", hugectr.Tensor_t.Evaluate) +np.save("/onnx_converter/hugectr_models/dcn_preds.npy", preds) diff --git a/test/onnx_converter_test/train_scripts/deepfm.py b/test/onnx_converter_test/train_scripts/deepfm.py index 10420c0ef5..783b4a57a8 100644 --- a/test/onnx_converter_test/train_scripts/deepfm.py +++ b/test/onnx_converter_test/train_scripts/deepfm.py @@ -18,8 +18,8 @@ from mpi4py import MPI solver = hugectr.CreateSolver( - max_eval_batches=300, - batchsize_eval=16384, + max_eval_batches=1, + batchsize_eval=6400, batchsize=16384, lr=0.001, vvgpu=[[0]], @@ -246,7 +246,12 @@ model.fit( max_iter=2300, display=200, - eval_interval=1000, + eval_interval=2000, snapshot=2000, snapshot_prefix="/onnx_converter/hugectr_models/deepfm", ) + +import numpy as np + +preds = model.check_out_tensor("add", hugectr.Tensor_t.Evaluate) +np.save("/onnx_converter/hugectr_models/deepfm_preds.npy", preds) diff --git a/test/onnx_converter_test/train_scripts/din_parquet.py b/test/onnx_converter_test/train_scripts/din_parquet.py index 6dc6ea2779..002af208e8 100644 --- a/test/onnx_converter_test/train_scripts/din_parquet.py +++ b/test/onnx_converter_test/train_scripts/din_parquet.py @@ -19,7 +19,7 @@ solver = hugectr.CreateSolver( max_eval_batches=1, - batchsize_eval=4096, + batchsize_eval=6400, batchsize=64, lr=0.001, vvgpu=[[0]], @@ -340,7 +340,12 @@ model.fit( max_iter=8100, display=200, - eval_interval=1000, + eval_interval=8000, snapshot=8000, snapshot_prefix="/onnx_converter/hugectr_models/din", ) + +import numpy as np + +preds = model.check_out_tensor("fc3", hugectr.Tensor_t.Evaluate) +np.save("/onnx_converter/hugectr_models/din_preds.npy", preds) diff --git a/test/onnx_converter_test/train_scripts/din_try.py b/test/onnx_converter_test/train_scripts/din_try.py index 
504e48ad3d..5a23538e65 100644 --- a/test/onnx_converter_test/train_scripts/din_try.py +++ b/test/onnx_converter_test/train_scripts/din_try.py @@ -19,7 +19,7 @@ solver = hugectr.CreateSolver( max_eval_batches=1, - batchsize_eval=4096, + batchsize_eval=6400, batchsize=64, lr=0.00001, vvgpu=[[0]], @@ -344,7 +344,12 @@ model.fit( max_iter=88000, display=1000, - eval_interval=1000, + eval_interval=80000, snapshot=80000, snapshot_prefix="/onnx_converter/hugectr_models/din_try", ) + +import numpy as np + +preds = model.check_out_tensor("fc3", hugectr.Tensor_t.Evaluate) +np.save("/onnx_converter/hugectr_models/din_try_preds.npy", preds) diff --git a/test/onnx_converter_test/train_scripts/dlrm.py b/test/onnx_converter_test/train_scripts/dlrm.py index 79f9782a1a..cd55f70c1f 100644 --- a/test/onnx_converter_test/train_scripts/dlrm.py +++ b/test/onnx_converter_test/train_scripts/dlrm.py @@ -18,8 +18,8 @@ from mpi4py import MPI solver = hugectr.CreateSolver( - max_eval_batches=300, - batchsize_eval=16384, + max_eval_batches=1, + batchsize_eval=6400, batchsize=16384, lr=0.001, vvgpu=[[0]], @@ -197,7 +197,12 @@ model.fit( max_iter=2300, display=200, - eval_interval=1000, + eval_interval=2000, snapshot=2000, snapshot_prefix="/onnx_converter/hugectr_models/dlrm", ) + +import numpy as np + +preds = model.check_out_tensor("fc8", hugectr.Tensor_t.Evaluate) +np.save("/onnx_converter/hugectr_models/dlrm_preds.npy", preds) diff --git a/test/onnx_converter_test/train_scripts/dlrm_mlp.py b/test/onnx_converter_test/train_scripts/dlrm_mlp.py index 71ed770ce6..03018ca2a8 100644 --- a/test/onnx_converter_test/train_scripts/dlrm_mlp.py +++ b/test/onnx_converter_test/train_scripts/dlrm_mlp.py @@ -18,8 +18,8 @@ from mpi4py import MPI solver = hugectr.CreateSolver( - max_eval_batches=300, - batchsize_eval=16384, + max_eval_batches=1, + batchsize_eval=6400, batchsize=16384, lr=0.001, vvgpu=[[0]], @@ -104,7 +104,12 @@ model.fit( max_iter=2300, display=200, - eval_interval=1000, + eval_interval=2000, snapshot=2000, snapshot_prefix="/onnx_converter/hugectr_models/dlrm_mlp", ) + +import numpy as np + +preds = model.check_out_tensor("mlp2", hugectr.Tensor_t.Evaluate) +np.save("/onnx_converter/hugectr_models/dlrm_mlp_preds.npy", preds) diff --git a/test/onnx_converter_test/train_scripts/gmf.py b/test/onnx_converter_test/train_scripts/gmf.py index afa333af1e..dcaed6dc09 100644 --- a/test/onnx_converter_test/train_scripts/gmf.py +++ b/test/onnx_converter_test/train_scripts/gmf.py @@ -18,8 +18,8 @@ from mpi4py import MPI solver = hugectr.CreateSolver( - max_eval_batches=1000, - batchsize_eval=2770, + max_eval_batches=1, + batchsize_eval=6400, batchsize=17548, lr=0.0045, vvgpu=[[0]], @@ -109,7 +109,12 @@ model.fit( max_iter=2100, display=200, - eval_interval=1000, + eval_interval=2000, snapshot=2000, snapshot_prefix="/onnx_converter/hugectr_models//gmf", ) + +import numpy as np + +preds = model.check_out_tensor("gmf_out", hugectr.Tensor_t.Evaluate) +np.save("/onnx_converter/hugectr_models/gmf_preds.npy", preds) diff --git a/test/onnx_converter_test/train_scripts/mmoe_parquet.py b/test/onnx_converter_test/train_scripts/mmoe_parquet.py index d40d0d6982..058998d581 100644 --- a/test/onnx_converter_test/train_scripts/mmoe_parquet.py +++ b/test/onnx_converter_test/train_scripts/mmoe_parquet.py @@ -21,8 +21,8 @@ NUM_TASKS = 2 solver = hugectr.CreateSolver( - max_eval_batches=100, - batchsize_eval=762, + max_eval_batches=1, + batchsize_eval=16384, batchsize=641, lr=0.001, vvgpu=[[0]], @@ -550,7 +550,14 @@ model.fit( max_iter=2300, 
     display=200,
-    eval_interval=1000,
+    eval_interval=2000,
     snapshot=2000,
     snapshot_prefix="/onnx_converter/hugectr_models/mmoe",
 )
+
+import numpy as np
+
+preds1 = model.check_out_tensor("A_fc2", hugectr.Tensor_t.Evaluate)
+preds2 = model.check_out_tensor("B_fc2", hugectr.Tensor_t.Evaluate)
+preds = np.concatenate([preds1, preds2], axis=1)
+np.save("/onnx_converter/hugectr_models/mmoe_parquet_preds.npy", preds)
diff --git a/test/onnx_converter_test/train_scripts/ncf.py b/test/onnx_converter_test/train_scripts/ncf.py
index 5739ce56a2..e19acf608e 100644
--- a/test/onnx_converter_test/train_scripts/ncf.py
+++ b/test/onnx_converter_test/train_scripts/ncf.py
@@ -18,8 +18,8 @@
 from mpi4py import MPI
 
 solver = hugectr.CreateSolver(
-    max_eval_batches=1000,
-    batchsize_eval=2770,  # 1208 for 1M dataset
+    max_eval_batches=1,
+    batchsize_eval=6400,  # 1208 for 1M dataset
     batchsize=17548,  # 32205 for 1M dataset
     lr=0.0045,
     vvgpu=[[0]],
@@ -173,7 +173,12 @@
 model.fit(
     max_iter=2100,
     display=200,
-    eval_interval=1000,
+    eval_interval=2000,
     snapshot=2000,
     snapshot_prefix="/onnx_converter/hugectr_models/ncf",
 )
+
+import numpy as np
+
+preds = model.check_out_tensor("mlp_out", hugectr.Tensor_t.Evaluate)
+np.save("/onnx_converter/hugectr_models/ncf_preds.npy", preds)
diff --git a/test/onnx_converter_test/train_scripts/neumf.py b/test/onnx_converter_test/train_scripts/neumf.py
index 87a15d2350..a4a0496b1b 100644
--- a/test/onnx_converter_test/train_scripts/neumf.py
+++ b/test/onnx_converter_test/train_scripts/neumf.py
@@ -18,8 +18,8 @@
 from mpi4py import MPI
 
 solver = hugectr.CreateSolver(
-    max_eval_batches=1000,
-    batchsize_eval=2770,
+    max_eval_batches=1,
+    batchsize_eval=6400,
     batchsize=17548,
     lr=0.0045,
     vvgpu=[[0]],
@@ -211,7 +211,12 @@
 model.fit(
     max_iter=2100,
     display=200,
-    eval_interval=1000,
+    eval_interval=2000,
     snapshot=2000,
     snapshot_prefix="/onnx_converter/hugectr_models/neumf",
 )
+
+import numpy as np
+
+preds = model.check_out_tensor("neumf_out", hugectr.Tensor_t.Evaluate)
+np.save("/onnx_converter/hugectr_models/neumf_preds.npy", preds)
diff --git a/test/onnx_converter_test/train_scripts/wdl.py b/test/onnx_converter_test/train_scripts/wdl.py
index f3f85180dc..52dbd9e4a9 100644
--- a/test/onnx_converter_test/train_scripts/wdl.py
+++ b/test/onnx_converter_test/train_scripts/wdl.py
@@ -18,8 +18,8 @@
 from mpi4py import MPI
 
 solver = hugectr.CreateSolver(
-    max_eval_batches=300,
-    batchsize_eval=16384,
+    max_eval_batches=1,
+    batchsize_eval=6400,
     batchsize=16384,
     lr=0.001,
     vvgpu=[[0]],
@@ -158,7 +158,12 @@
 model.fit(
     max_iter=2300,
     display=200,
-    eval_interval=1000,
+    eval_interval=2000,
     snapshot=2000,
     snapshot_prefix="/onnx_converter/hugectr_models/wdl",
 )
+
+import numpy as np
+
+preds = model.check_out_tensor("add1", hugectr.Tensor_t.Evaluate)
+np.save("/onnx_converter/hugectr_models/wdl_preds.npy", preds)
diff --git a/test/pybind_test/wdl_fp16_8gpu.py b/test/pybind_test/wdl_fp16_8gpu.py
index 9c561eb131..817be61af7 100644
--- a/test/pybind_test/wdl_fp16_8gpu.py
+++ b/test/pybind_test/wdl_fp16_8gpu.py
@@ -45,7 +45,7 @@
 def wdl_test(json_file):
     model.compile()
     model.summary()
     model.fit(
-        max_iter=10000, display=200, eval_interval=1000, snapshot=10000, snapshot_prefix="wdl"
+        max_iter=10000, display=200, eval_interval=1000, snapshot=100000, snapshot_prefix="wdl"
     )
     return
diff --git a/test/utest/inference/CMakeLists.txt b/test/utest/inference/CMakeLists.txt
deleted file mode 100755
index 0535a45710..0000000000
--- a/test/utest/inference/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-cmake_minimum_required(VERSION 3.20)
-
-file(GLOB inference_test_src
-    embedding_feature_combiner_test.cpp
-    preallocated_buffer2_test.cpp
-    session_inference_test.cpp
-    cpu_inference_test.cpp
-    cpu_multicross_layer_test.cpp
-)
-
-add_executable(inference_test ${inference_test_src})
-target_compile_features(inference_test PUBLIC cxx_std_17)
-target_link_libraries(inference_test PUBLIC huge_ctr_inference cpu_inference_shared gtest gtest_main stdc++fs)
-set_target_properties(inference_test PROPERTIES CUDA_ARCHITECTURES OFF)
\ No newline at end of file
diff --git a/test/utest/inference/cpu_inference_test.cpp b/test/utest/inference/cpu_inference_test.cpp
deleted file mode 100644
index bcee6eebd1..0000000000
--- a/test/utest/inference/cpu_inference_test.cpp
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-using namespace HugeCTR;
-
-namespace {
-
-const int RANGE[] = {0,       1460,    2018,    337396,  549106,  549411,  549431,
-                     561567,  562200,  562203,  613501,  618803,  951403,  954582,
-                     954609,  966800,  1268011, 1268021, 1272862, 1274948, 1274952,
-                     1599225, 1599242, 1599257, 1678991, 1679087, 1737709};
-
-std::vector& split(const std::string& s, char delim, std::vector& elems) {
-  std::istringstream is(s);
-  std::string item;
-  while (std::getline(is, item, delim)) {
-    elems.push_back(item);
-  }
-  return elems;
-}
-
-struct InferenceInfo {
-  int dense_dim;
-  std::vector slot_num;
-  std::vector max_feature_num_per_sample;
-  std::vector embedding_vec_size;
-  std::vector combiner_type;
-  InferenceInfo(const nlohmann::json& config);
-};
-
-InferenceInfo::InferenceInfo(const nlohmann::json& config) {
-  auto j_layers_array = get_json(config, "layers");
-  const nlohmann::json& j_data = j_layers_array[0];
-  auto j_dense = get_json(j_data, "dense");
-  dense_dim = get_value_from_json(j_dense, "dense_dim");
-  auto j_sparse_inputs = get_json(j_data, "sparse");
-
-  for (size_t i = 0; i < j_sparse_inputs.size(); i++) {
-    const nlohmann::json& j_sparse = j_sparse_inputs[0];
-    slot_num.push_back(get_value_from_json(j_sparse, "slot_num"));
-
-    size_t max_feature_num_per_sample_ =
-        static_cast(get_max_feature_num_per_sample_from_nnz_per_slot(j_sparse));
-
-    max_feature_num_per_sample.push_back(max_feature_num_per_sample_);
-  }
-
-  // get embedding params: embedding_vec_size, combiner_type
-  for (size_t i = 1; i < j_layers_array.size(); i++) {
-    // if not embedding then break
-    const nlohmann::json& j = j_layers_array[i];
-    auto embedding_name = get_value_from_json(j, "type");
-    if (embedding_name.compare("DistributedSlotSparseEmbeddingHash") != 0 &&
-        embedding_name.compare("LocalizedSlotSparseEmbeddingHash") != 0 &&
-        embedding_name.compare("LocalizedSlotSparseEmbeddingOneHot") != 0) {
-      break;
-    }
-    auto j_embed_params = get_json(j, "sparse_embedding_hparam");
-    auto vec_size = get_value_from_json(j_embed_params, "embedding_vec_size");
-    auto combiner = get_value_from_json(j_embed_params, "combiner");
-    embedding_vec_size.push_back(vec_size);
-    if (combiner == "mean") {
-      combiner_type.push_back(HugeCTR::EmbeddingFeatureCombiner_t::Mean);
-    } else {
-      combiner_type.push_back(HugeCTR::EmbeddingFeatureCombiner_t::Sum);
-    }
-  }
-}
-
-template
-void session_inference_criteo_test(const std::string& config_file, const std::string& model,
-                                   const std::string& criteo_data_path, int batchsize) {
-  InferenceInfo inference_info(read_json_file(config_file));
-  int batch_size = batchsize;
-  int dense_dim = inference_info.dense_dim;
-  int slot_num = inference_info.slot_num[0];
-  int max_feature_num_per_sample = inference_info.max_feature_num_per_sample[0];
-  int num_samples = 0;
-  std::vector labels;
-  std::vector dense_features;
-  std::vector keys;
-  std::vector row_ptrs;
-  HostAllocator host_allocator;
-  HugeCTR::Timer timer_inference;
-
-  // open criteo data file
-  std::ifstream criteo_data_file(criteo_data_path, std::ifstream::binary);
-  if (!criteo_data_file.is_open()) {
-    HCTR_LOG_S(ERROR, WORLD) << "Cannot open " << criteo_data_path << std::endl;
-  }
-
-  // 4 lines: labels, dense_features, keys, row_ptrs
-  for (int i = 0; i < 4; i++) {
-    std::string line;
-    std::getline(criteo_data_file, line);
-    std::vector vec_string;
-    split(line, ' ', vec_string);
-    switch (i) {
-      case 0: {
-        num_samples = static_cast(vec_string.size());
-        for (int j = 0; j < num_samples; j++) {
-          int label = std::stoi(vec_string[j]);
-          labels.push_back(label);
-        }
-        break;
-      }
-      case 1: {
-        int dense_features_dim = static_cast(vec_string.size());
-        if (dense_features_dim != num_samples * dense_dim) {
-          HCTR_LOG_S(ERROR, WORLD)
-              << "dense_features_dim does not equal to num_samples*dense_dim" << std::endl;
-        }
-        for (int j = 0; j < dense_features_dim; j++) {
-          float dense_feature = std::stod(vec_string[j]);
-          dense_features.push_back(dense_feature);
-        }
-        break;
-      }
-      case 2: {
-        int keys_dim = static_cast(vec_string.size());
-        if (keys_dim != num_samples * slot_num) {
-          HCTR_LOG_S(ERROR, WORLD)
-              << "keys_dim does not equal to num_samples*slot_num" << std::endl;
-        }
-        for (int j = 0; j < keys_dim; j++) {
-          TypeHashKey key = static_cast(std::stoll(vec_string[j]));
-          keys.push_back(key);
-        }
-        break;
-      }
-      case 3: {
-        int row_ptrs_dim = static_cast(vec_string.size());
-        if (row_ptrs_dim != num_samples * slot_num + 1) {
-          HCTR_LOG_S(ERROR, WORLD)
-              << "row_ptrs_dim does not equal to num_samples*slot_num + 1" << std::endl;
-        }
-        for (int j = 0; j < row_ptrs_dim; j++) {
-          int row_ptr = std::stoi(vec_string[j]);
-          row_ptrs.push_back(row_ptr);
-        }
-        break;
-      }
-      default: {
-        assert(!"Error: Should never get here!");
-      }
-    }
-  }
-
-  if (batch_size == 0) {
-    HCTR_OWN_THROW(Error_t::WrongInput, "batch size should not be zero!");
-  }
-  num_samples = num_samples < batch_size ? num_samples : batch_size;
-
-  // h_row_ptrs
-  std::vector row_ptrs_dims = {static_cast(batch_size * slot_num + 1)};  // 1D
-  size_t row_ptrs_size = 1;
-  for (auto dim : row_ptrs_dims) {
-    row_ptrs_size *= dim;
-  }
-  size_t row_ptrs_size_samples = num_samples * slot_num + 1;
-  size_t row_ptrs_size_in_bytes = row_ptrs_size * sizeof(int);
-  size_t row_ptrs_size_in_bytes_samples = row_ptrs_size_samples * sizeof(int);
-  int* h_row_ptrs = reinterpret_cast(host_allocator.allocate(row_ptrs_size_in_bytes));
-  for (size_t i = 0; i < row_ptrs_size; i++) {
-    h_row_ptrs[i] = 0;
-  }
-
-  // h_dense_features
-  size_t dense_size = batch_size * dense_dim;
-  size_t dense_size_samples = num_samples * dense_dim;
-  size_t dense_size_in_bytes = dense_size * sizeof(float);
-  size_t dense_size_in_bytes_samples = dense_size_samples * sizeof(float);
-  float* h_dense_features = reinterpret_cast(host_allocator.allocate(dense_size_in_bytes));
-
-  // h_embeddingcolumns
-  size_t embeddingcolumns_size = batch_size * max_feature_num_per_sample;
-  size_t embeddingcolumns_size_samples = num_samples * max_feature_num_per_sample;
-  size_t embeddingcolumns_size_in_bytes = embeddingcolumns_size * sizeof(TypeHashKey);
-  size_t embeddingcolumns_size_in_bytes_samples =
-      embeddingcolumns_size_samples * sizeof(TypeHashKey);
-  void* h_embeddingcolumns = host_allocator.allocate(embeddingcolumns_size_in_bytes);
-  // TypeHashKey* h_keys = reinterpret_cast(h_embeddingcolumns);
-
-  // h_output
-  std::unique_ptr h_out(new float[batch_size]);
-
-  // memory copy
-  memcpy(h_embeddingcolumns, keys.data(), embeddingcolumns_size_in_bytes_samples);
-  memcpy(h_row_ptrs, row_ptrs.data(), row_ptrs_size_in_bytes_samples);
-  memcpy(h_dense_features, dense_features.data(), dense_size_in_bytes_samples);
-
-  // inference session
-  std::string dense_model{"/hugectr/test/utest/_dense_10000.model"};
-  std::vector sparse_models{"/hugectr/test/utest/0_sparse_10000.model"};
-  InferenceParams infer_param(model, batchsize, 0.5, dense_model, sparse_models, 0, true, 0.8,
-                              false);
-  std::vector inference_params{infer_param};
-  std::vector model_config_path{config_file};
-  parameter_server_config ps_config{model_config_path, inference_params};
-  std::shared_ptr parameter_server =
-      HierParameterServerBase::create(ps_config);
-  InferenceSessionCPU sess(model_config_path[0], inference_params[0],
-                           parameter_server);
-  timer_inference.start();
-  sess.predict(h_dense_features, h_embeddingcolumns, h_row_ptrs, h_out.get(), num_samples);
-  timer_inference.stop();
-
-  {
-    auto log = HCTR_LOG_S(INFO, WORLD);
-    log << "==========================labels===================" << std::endl;
-    for (int i = 0; i < num_samples; i++) {
-      log << labels[i] << " ";
-    }
-    log << std::endl;
-  }
-  {
-    auto log = HCTR_LOG_S(INFO, WORLD);
-    log << "==========================prediction result===================" << std::endl;
-    for (int i = 0; i < num_samples; i++) {
-      log << h_out[i] << " ";
-    }
-    log << std::endl;
-  }
-  HCTR_LOG_S(INFO, ROOT) << "Batch size: " << batch_size << ", Number samples: " << num_samples
-                         << ", Time: " << timer_inference.elapsedSeconds() << "s" << std::endl;
-  host_allocator.deallocate(h_embeddingcolumns);
-  host_allocator.deallocate(h_dense_features);
-  host_allocator.deallocate(h_row_ptrs);
-}
-
-template
-void session_inference_generated_test(const std::string& config_file, const std::string& model,
-                                      int num_samples, int batchsize) {
-  InferenceInfo inference_info(read_json_file(config_file));
-  int batch_size = batchsize;
-  int dense_dim = inference_info.dense_dim;
-  int slot_num = inference_info.slot_num[0];
-  int max_feature_num_per_sample = inference_info.max_feature_num_per_sample[0];
-  int max_nnz = max_feature_num_per_sample / slot_num;
-  num_samples = num_samples < batch_size ? num_samples : batch_size;
-  HostAllocator host_allocator;
-  HugeCTR::Timer timer_inference;
-
-  // h_row_ptrs
-  std::vector row_ptrs_dims = {static_cast(batch_size * slot_num + 1)};  // 1D
-  size_t row_ptrs_size = 1;
-  for (auto dim : row_ptrs_dims) {
-    row_ptrs_size *= dim;
-  }
-  std::unique_ptr h_row_ptrs(new int[row_ptrs_size]);
-  std::shared_ptr> ldata_sim;
-  ldata_sim.reset(new IntUniformDataSimulator(1, max_nnz));
-  h_row_ptrs[0] = 0;
-  for (size_t i = 1; i < row_ptrs_size; i++) {
-    h_row_ptrs[i] = (h_row_ptrs[i - 1] + ldata_sim->get_num());
-  }
-
-  // h_dense_features
-  const size_t dense_size = batch_size * dense_dim;
-  std::unique_ptr h_dense(new float[dense_size]);
-  FloatUniformDataSimulator fdata_sim(0, 1);
-  for (size_t i = 0; i < dense_size; i++) {
-    h_dense[i] = fdata_sim.get_num();
-  }
-
-  // h_embeddingcolumns
-  size_t embeddingcolumns_size = batch_size * max_feature_num_per_sample;
-  size_t embeddingcolumns_size_in_bytes = embeddingcolumns_size * sizeof(TypeHashKey);
-  void* h_embeddingcolumns = host_allocator.allocate(embeddingcolumns_size_in_bytes);
-  TypeHashKey* h_keys = reinterpret_cast(h_embeddingcolumns);
-  for (int i = 0; i < num_samples; i++) {
-    for (int j = 0; j < slot_num; j++) {
-      ldata_sim.reset(new IntUniformDataSimulator(RANGE[j], RANGE[j + 1] - 1));
-      h_keys[i * slot_num + j] = static_cast(ldata_sim->get_num());
-    }
-  }
-
-  std::unique_ptr h_out(new float[batch_size]);
-
-  // inference session
-  std::string dense_model{"/hugectr/test/utest/_dense_10000.model"};
-  std::vector sparse_models{"/hugectr/test/utest/0_sparse_10000.model"};
-  InferenceParams infer_param(model, batchsize, 0.5, dense_model, sparse_models, 0, true, 0.8,
-                              false);
-  std::vector inference_params{infer_param};
-  std::vector model_config_path{config_file};
-  parameter_server_config ps_config{model_config_path, inference_params};
-  std::shared_ptr parameter_server =
-      HierParameterServerBase::create(ps_config);
-  InferenceSessionCPU sess(model_config_path[0], inference_params[0],
-                           parameter_server);
-  timer_inference.start();
-  sess.predict(h_dense.get(), h_embeddingcolumns, h_row_ptrs.get(), h_out.get(), num_samples);
-  timer_inference.stop();
-
-  {
-    auto log = HCTR_LOG_S(INFO, WORLD);
-    log << "==========================prediction result===================" << std::endl;
-    for (int i = 0; i < num_samples; i++) {
-      log << h_out[i] << " ";
-    }
-    log << std::endl;
-  }
-  HCTR_LOG_S(INFO, ROOT) << "Batch size: " << batch_size << ", Number samples: " << num_samples
-                         << ", Time: " << timer_inference.elapsedSeconds() << "s" << std::endl;
-  host_allocator.deallocate(h_embeddingcolumns);
-}
-
-}  // namespace
-
-TEST(session_inference_cpu, criteo_dcn) {
-  session_inference_criteo_test("/workdir/test/utest/simple_inference_config.json",
-                                "DCN", "/hugectr/test/utest/dcn_csr.txt", 32);
-}
-TEST(session_inference_cpu, generated_dcn_32) {
-  session_inference_generated_test("/workdir/test/utest/simple_inference_config.json",
-                                   "DCN", 32, 32);
-}
\ No newline at end of file
diff --git a/test/utest/inference/cpu_multicross_layer_test.cpp b/test/utest/inference/cpu_multicross_layer_test.cpp
deleted file mode 100644
index aff3546c2f..0000000000
--- a/test/utest/inference/cpu_multicross_layer_test.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-using namespace HugeCTR;
-
-class MultiCrossLayerCPUTest {
- private:
-  const float eps = 1;
-  const size_t batchsize_;
-  const size_t w_;
-  const int layers_;
-  std::shared_ptr> blob_buf_;
-  std::shared_ptr> weight_buf_;
-  std::shared_ptr> wgrad_buf_;
-
-  Tensor2 weight_;
-  Tensor2 wgrad_;
-
-  Tensor2 input_;
-  Tensor2 output_;
-
-  std::vector h_input_;
-  std::vector h_input_grad_;
-  std::vector h_output_grad_;
-  std::vector> h_kernels_;
-  std::vector> h_biases_;
-
-  std::vector> h_outputs_;
-  std::vector> h_hiddens_;
-
-  std::vector> h_kernel_grads_;
-  std::vector> h_bias_grads_;
-
-  std::shared_ptr layer_;
-  test::GaussianDataSimulator data_sim_;
-
-  void reset_forward_() {
-    data_sim_.fill(h_input_.data(), batchsize_ * w_);
-    for (auto& a : h_kernels_) {
-      data_sim_.fill(a.data(), w_);
-    }
-    for (auto& a : h_biases_) {
-      data_sim_.fill(a.data(), w_);
-    }
-    memcpy(input_.get_ptr(), h_input_.data(), input_.get_size_in_bytes());
-
-    float* p = weight_.get_ptr();
-    for (int i = 0; i < layers_; i++) {
-      memcpy(p, h_kernels_[i].data(), w_ * sizeof(float));
-      p += w_;
-      memcpy(p, h_biases_[i].data(), w_ * sizeof(float));
-      p += w_;
-    }
-    return;
-  }
-
-  void matrix_vec_mul(float* out, const float* in_m, const float* in_v, size_t h, size_t w) {
-    for (size_t j = 0; j < h; j++) {
-      out[j] = 0.0f;
-      for (size_t i = 0; i < w; i++) {
-        size_t k = j * w + i;
-        out[j] += in_m[k] * in_v[i];
-      }
-    }
-  }
-
-  void row_scaling(float* out, const float* in_m, const float* in_v, size_t h, size_t w) {
-    for (size_t j = 0; j < h; j++) {
-      for (size_t i = 0; i < w; i++) {
-        size_t k = j * w + i;
-        out[k] = in_m[k] * in_v[j];
-      }
-    }
-  }
-
-  void matrix_add(float* out, const float* in_m_1, const float* in_m_2, size_t h, size_t w) {
-    for (size_t j = 0; j < h; j++) {
-      for (size_t i = 0; i < w; i++) {
-        size_t k = j * w + i;
-        out[k] = in_m_1[k] + in_m_2[k];
-      }
-    }
-  }
-
-  void matrix_vec_add(float* out, const float* in_m, const float* in_v, size_t h, size_t w) {
-    for (size_t j = 0; j < h; j++) {
-      for (size_t i = 0; i < w; i++) {
-        size_t k = j * w + i;
-        out[k] = in_m[k] + in_v[i];
-      }
-    }
-  }
-
-  void cpu_fprop_() {
-    for (int i = 0; i < layers_; i++) {
-      matrix_vec_mul(h_hiddens_[i].data(), i == 0 ? h_input_.data() : h_outputs_[i - 1].data(),
-                     h_kernels_[i].data(), batchsize_, w_);
-      row_scaling(h_outputs_[i].data(), h_input_.data(), h_hiddens_[i].data(), batchsize_, w_);
-      matrix_add(h_outputs_[i].data(), h_outputs_[i].data(),
-                 i == 0 ? h_input_.data() : h_outputs_[i - 1].data(), batchsize_, w_);
-      matrix_vec_add(h_outputs_[i].data(), h_outputs_[i].data(), h_biases_[i].data(), batchsize_,
-                     w_);
-    }
-  }
-
-  void layer_fprop_() {
-    layer_->fprop(false);
-    return;
-  }
-
-  void compare_forward_() {
-    std::vector d2h_output;
-    d2h_output.resize(batchsize_ * w_);
-
-    memcpy(d2h_output.data(), output_.get_ptr(), output_.get_size_in_bytes());
-
-    // todo compare
-    for (size_t i = 0; i < h_outputs_.back().size(); i++) {
-      if (abs(d2h_output[i] - h_outputs_.back()[i]) > 0.05f) {
-        HCTR_OWN_THROW(Error_t::WrongInput, "cpu multicross layer wrong result");
-      }
-    }
-  }
-
- public:
-  MultiCrossLayerCPUTest(size_t batchsize, size_t w, int layers)
-      : batchsize_(batchsize),
-        w_(w),
-        layers_(layers),
-        blob_buf_(GeneralBuffer2::create()),
-        data_sim_(0.0f, 1.0f) {
-    weight_buf_ = blob_buf_->create_block();
-    wgrad_buf_ = blob_buf_->create_block();
-
-    blob_buf_->reserve({batchsize, w}, &input_);
-    blob_buf_->reserve({batchsize, w}, &output_);
-
-    h_input_.resize(batchsize * w);
-    h_output_grad_.resize(batchsize * w);
-    h_input_grad_.resize(batchsize * w);
-
-    for (int i = 0; i < layers_; i++) {
-      h_kernels_.push_back(std::vector(1 * w));
-      h_biases_.push_back(std::vector(1 * w));
-      h_outputs_.push_back(std::vector(batchsize * w));
-      h_hiddens_.push_back(std::vector(batchsize * 1));
-      h_kernel_grads_.push_back(std::vector(1 * w));
-      h_bias_grads_.push_back(std::vector(1 * w));
-    }
-
-    // layer
-    layer_.reset(
-        new MultiCrossLayerCPU(weight_buf_, wgrad_buf_, blob_buf_, input_, output_, layers));
-
-    blob_buf_->allocate();
-    layer_->initialize();
-
-    weight_ = weight_buf_->as_tensor();
-    wgrad_ = wgrad_buf_->as_tensor();
-
-    return;
-  }
-
-  void test() {
-    reset_forward_();
-    cpu_fprop_();
-    layer_fprop_();
-    compare_forward_();
-  }
-};
-
-TEST(multi_cross_layer_cpu, fp32_1x4x1) {
-  MultiCrossLayerCPUTest test(1, 4, 1);
-  test.test();
-}
-
-TEST(multi_cross_layer_cpu, fp32_1x1024x2) {
-  MultiCrossLayerCPUTest test(1, 1024, 2);
-  test.test();
-}
-
-TEST(multi_cross_layer_cpu, fp32_1x1024x3) {
-  MultiCrossLayerCPUTest test(1, 1024, 3);
-  test.test();
-}
-
-TEST(multi_cross_layer_cpu, fp32_32x1024x3) {
-  MultiCrossLayerCPUTest test(32, 1024, 3);
-  test.test();
-}
-
-// TEST(multi_cross_layer_cpu, fp32_4096x1024x2) {
-//   MultiCrossLayerCPUTest test(4096, 1024, 2);
-//   test.test();
-// }
-
-// TEST(multi_cross_layer_cpu, fp32_4096x1024x3) {
-//   MultiCrossLayerCPUTest test(4096, 1024, 3);
-//   test.test();
-// }
-
-// TEST(multi_cross_layer_cpu, fp32_40963x356x3) {
-//   MultiCrossLayerCPUTest test(40963, 356, 3);
-//   test.test();
-// }
diff --git a/test/utest/inference/embedding_feature_combiner_test.cpp b/test/utest/inference/embedding_feature_combiner_test.cpp
deleted file mode 100644
index 2cbd7d36a9..0000000000
--- a/test/utest/inference/embedding_feature_combiner_test.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-#include
-#include
-#include
-#include
-
-using namespace HugeCTR;
-
-namespace {
-
-const float eps = 1e-2f;
-
-template
-void embedding_feature_combine_cpu(const float* input, TypeEmbedding* output, const int* row_ptrs,
-                                   int batch_size, int slot_num, int embedding_vec_size,
-                                   EmbeddingFeatureCombiner_t combiner_type) {
-  for (int i = 0; i < batch_size; i++) {
-    for (int j = 0; j < slot_num; j++) {
-      int feature_row_index = i * slot_num + j;
-      int row_offset = row_ptrs[feature_row_index];  // row offset within input
-      int feature_num =
-          row_ptrs[feature_row_index + 1] - row_offset;  // num of feature vectors in one slot
-
-      for (int k = 0; k < embedding_vec_size; k++) {
-        float tmp = 0.0f;
-        for (int l = 0; l < feature_num; l++) {
-          tmp += input[(row_offset + l) * embedding_vec_size + k];
-        }  // end for l
-        if (combiner_type == EmbeddingFeatureCombiner_t::Mean) tmp /= feature_num;
-        output[feature_row_index * embedding_vec_size + k] = tmp;
-      }  // end for k
-    }    // end for j
-  }      // end for i
-}
-
-template <>
-void embedding_feature_combine_cpu(const float* input, __half* output, const int* row_ptrs,
-                                   int batch_size, int slot_num, int embedding_vec_size,
-                                   EmbeddingFeatureCombiner_t combiner_type) {
-  for (int i = 0; i < batch_size; i++) {
-    for (int j = 0; j < slot_num; j++) {
-      int feature_row_index = i * slot_num + j;
-      int row_offset = row_ptrs[feature_row_index];  // row offset within input
-      int feature_num =
-          row_ptrs[feature_row_index + 1] - row_offset;  // num of feature vectors in one slot
-
-      for (int k = 0; k < embedding_vec_size; k++) {
-        float tmp = 0.0f;
-        for (int l = 0; l < feature_num; l++) {
-          tmp += __half2float(input[(row_offset + l) * embedding_vec_size + k]);
-        }  // end for l
-        if (combiner_type == EmbeddingFeatureCombiner_t::Mean && feature_num > 1) {
-          tmp /= feature_num;
-        }
-        output[feature_row_index * embedding_vec_size + k] = __float2half(tmp);
-      }  // end for k
-    }    // end for j
-  }      // end for i
-}
-
-template
-void embedding_feature_combine_test(int batch_size, int slot_num, int embedding_vec_size,
-                                    int max_nnz, EmbeddingFeatureCombiner_t combiner_type) {
-  std::shared_ptr> buff = GeneralBuffer2::create();
-
-  core23::Device device_gpu(core23::DeviceType::GPU, 0);
-  core23::TensorParams tensor_params = core23::TensorParams().device(device_gpu);
-  std::shared_ptr row_ptrs_tensor_new = std::make_shared(
-      tensor_params.shape({static_cast(batch_size * slot_num + 1)})
-          .data_type(core23::ScalarType::Int32));
-
-  size_t row_ptrs_size = row_ptrs_tensor_new->shape().size();
-  std::unique_ptr h_row_ptrs(new int[row_ptrs_size]);
-  std::shared_ptr> ldata_sim;
-  ldata_sim.reset(new IntUniformDataSimulator(0, max_nnz));
-  h_row_ptrs[0] = 0;
-  for (size_t i = 1; i < row_ptrs_size; i++) {
-    h_row_ptrs[i] = (h_row_ptrs[i - 1] + ldata_sim->get_num());
-  }
-
-  size_t feature_num = h_row_ptrs[row_ptrs_size - 1];
-  std::shared_ptr in_tensor_new = std::make_shared(
-      tensor_params
-          .shape({static_cast(feature_num), static_cast(embedding_vec_size)})
-          .data_type(core23::ScalarType::Float));
-
-  Tensor2 out_tensor;
-  test::GaussianDataSimulator simulator(0.0f, 1.0f);
-  EmbeddingFeatureCombiner embedding_feature_combiner(
-      in_tensor_new, row_ptrs_tensor_new, out_tensor, batch_size, slot_num, combiner_type, buff,
-      test::get_default_gpu());
-  buff->allocate();
-  size_t in_size = in_tensor_new->shape().size();
-  auto out_dims = out_tensor.get_dimensions();
-  size_t out_size = 1;
-  for (auto dim : out_dims) {
-    out_size *= dim;
-  }
-  int* d_row_ptrs = row_ptrs_tensor_new->data();
-  float* d_in = in_tensor_new->data();
-  TypeEmbedding* d_out = out_tensor.get_ptr();
-  std::unique_ptr h_in(new float[in_size]);
-  std::unique_ptr h_out(new TypeEmbedding[out_size]);
-  std::unique_ptr h_cpu_out(new TypeEmbedding[out_size]);
-
-  // fprop
-  simulator.fill(h_in.get(), in_size);
-  HCTR_LIB_THROW(cudaMemcpy(d_in, h_in.get(), in_size * sizeof(float), cudaMemcpyHostToDevice));
-  HCTR_LIB_THROW(cudaMemcpy(d_row_ptrs, h_row_ptrs.get(), row_ptrs_size * sizeof(int),
-                            cudaMemcpyHostToDevice));
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-  embedding_feature_combiner.fprop(false);
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-  HCTR_LIB_THROW(
-      cudaMemcpy(h_out.get(), d_out, out_size * sizeof(TypeEmbedding), cudaMemcpyDeviceToHost));
-
-  embedding_feature_combine_cpu(h_in.get(), h_cpu_out.get(), h_row_ptrs.get(), batch_size, slot_num,
-                                embedding_vec_size, combiner_type);
-  ASSERT_TRUE(
-      test::compare_array_approx(h_out.get(), h_cpu_out.get(), out_size, eps));
-}
-
-}  // namespace
-
-TEST(embedding_feature_combiner, fp32_10x1x64_10_Sum) {
-  embedding_feature_combine_test(10, 1, 64, 10, EmbeddingFeatureCombiner_t::Sum);
-}
-TEST(embedding_feature_combiner, fp32_10x10x64_1_Sum) {
-  embedding_feature_combine_test(10, 10, 64, 1, EmbeddingFeatureCombiner_t::Sum);
-}
-TEST(embedding_feature_combiner, fp32_4096x26x64_1_Sum) {
-  embedding_feature_combine_test(4096, 26, 64, 1, EmbeddingFeatureCombiner_t::Sum);
-}
-TEST(embedding_feature_combiner, fp32_4096x26x64_3_Sum) {
-  embedding_feature_combine_test(4096, 26, 64, 3, EmbeddingFeatureCombiner_t::Sum);
-}
-TEST(embedding_feature_combiner, fp32_10x1x64_10_Mean) {
-  embedding_feature_combine_test(10, 1, 64, 10, EmbeddingFeatureCombiner_t::Mean);
-}
-TEST(embedding_feature_combiner, fp32_10x10x64_1_Mean) {
-  embedding_feature_combine_test(10, 10, 64, 1, EmbeddingFeatureCombiner_t::Mean);
-}
-TEST(embedding_feature_combiner, fp32_4096x26x64_1_Mean) {
-  embedding_feature_combine_test(4096, 26, 64, 1, EmbeddingFeatureCombiner_t::Mean);
-}
-TEST(embedding_feature_combiner, fp32_4096x26x64_3_Mean) {
-  embedding_feature_combine_test(4096, 26, 64, 3, EmbeddingFeatureCombiner_t::Mean);
-}
-TEST(embedding_feature_combiner, fp16_10x1x64_10_Sum) {
-  embedding_feature_combine_test<__half>(10, 1, 64, 10, EmbeddingFeatureCombiner_t::Sum);
-}
-TEST(embedding_feature_combiner, fp16_10x10x64_1_Sum) {
-  embedding_feature_combine_test<__half>(10, 10, 64, 1, EmbeddingFeatureCombiner_t::Sum);
-}
-TEST(embedding_feature_combiner, fp16_4096x26x64_1_Sum) {
-  embedding_feature_combine_test<__half>(4096, 26, 64, 1, EmbeddingFeatureCombiner_t::Sum);
-}
-TEST(embedding_feature_combiner, fp16_4096x26x64_3_Sum) {
-  embedding_feature_combine_test<__half>(4096, 26, 64, 3, EmbeddingFeatureCombiner_t::Sum);
-}
-TEST(embedding_feature_combiner, fp16_10x1x64_10_Mean) {
-  embedding_feature_combine_test<__half>(10, 1, 64, 10, EmbeddingFeatureCombiner_t::Mean);
-}
-TEST(embedding_feature_combiner, fp16_10x10x64_1_Mean) {
-  embedding_feature_combine_test<__half>(10, 10, 64, 1, EmbeddingFeatureCombiner_t::Mean);
-}
-TEST(embedding_feature_combiner, fp16_4096x26x64_1_Mean) {
-  embedding_feature_combine_test<__half>(4096, 26, 64, 1, EmbeddingFeatureCombiner_t::Mean);
-}
-TEST(embedding_feature_combiner, fp16_4096x26x64_3_Mean) {
-  embedding_feature_combine_test<__half>(4096, 26, 64, 3, EmbeddingFeatureCombiner_t::Mean);
-}
diff --git a/test/utest/inference/preallocated_buffer2_test.cpp b/test/utest/inference/preallocated_buffer2_test.cpp
deleted file mode 100644
index 249980d135..0000000000
--- a/test/utest/inference/preallocated_buffer2_test.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-using namespace HugeCTR;
-
-namespace {
-
-const float eps = 1e-6f;
-
-void preallocated_buffer2_test(int batch_size, int slot_num, int embedding_vec_size, int max_nnz) {
-  CudaAllocator allocator;
-
-  // row_ptrs: h_row_ptrs, d_row_ptrs, row_ptrs_tensor
-  std::vector row_ptrs_dims = {static_cast(batch_size * slot_num + 1)};  // 1D
-  size_t row_ptrs_size = 1;
-  for (auto dim : row_ptrs_dims) {
-    row_ptrs_size *= dim;
-  }
-  std::unique_ptr h_row_ptrs(new int[row_ptrs_size]);
-  std::shared_ptr> ldata_sim;
-  ldata_sim.reset(new IntUniformDataSimulator(0, max_nnz));
-  h_row_ptrs[0] = 0;
-  for (size_t i = 1; i < row_ptrs_size; i++) {
-    h_row_ptrs[i] = h_row_ptrs[i - 1] + ldata_sim->get_num();
-  }
-
-  size_t row_ptrs_size_in_bytes = row_ptrs_size * TensorScalarSizeFunc::get_element_size();
-  void* d_row_ptrs = allocator.allocate(row_ptrs_size_in_bytes);
-  HCTR_LIB_THROW(
-      cudaMemcpy(d_row_ptrs, h_row_ptrs.get(), row_ptrs_size_in_bytes, cudaMemcpyHostToDevice));
-  std::shared_ptr> row_ptrs_tensor =
-      std::make_shared>(row_ptrs_dims, nullptr);
-
-  HCTR_LOG(INFO, ROOT, "Bind the tensor to preallocated buffer for the first time\n");
-  std::shared_ptr row_ptrs_buff =
-      PreallocatedBuffer2::create(d_row_ptrs, row_ptrs_dims);
-  bind_tensor_to_buffer(row_ptrs_dims, row_ptrs_buff, row_ptrs_tensor);
-
-  // embedding_features: h_embedding_features, d_embedding_features, embedding_features_tensor
-  size_t feature_num = h_row_ptrs[row_ptrs_size - 1];
-  std::vector embedding_features_dims = {static_cast(feature_num),
-                                         static_cast(embedding_vec_size)};
-  size_t embedding_features_size = 1;
-  for (auto dim : embedding_features_dims) {
-    embedding_features_size *= dim;
-  }
-  size_t embedding_features_size_in_bytes =
-      embedding_features_size * TensorScalarSizeFunc::get_element_size();
-  std::unique_ptr h_embedding_features(new float[embedding_features_size]);
-  test::GaussianDataSimulator simulator(0.0f, 1.0f);
-  simulator.fill(h_embedding_features.get(), embedding_features_size);
-  void* d_embedding_features = allocator.allocate(embedding_features_size_in_bytes);
-  HCTR_LIB_THROW(cudaMemcpy(d_embedding_features, h_embedding_features.get(),
-                            embedding_features_size_in_bytes, cudaMemcpyHostToDevice));
-
-  std::shared_ptr> embedding_features_tensor =
-      std::make_shared>(embedding_features_dims, nullptr);
-  std::shared_ptr embeddding_features_buff =
-      PreallocatedBuffer2::create(d_embedding_features, embedding_features_dims);
-  bind_tensor_to_buffer(embedding_features_dims, embeddding_features_buff,
-                        embedding_features_tensor);
-
-  // copy Tensor2 back to cpu and compare with original buffer
-  std::unique_ptr h_row_ptrs_back(new int[row_ptrs_size]);
-  std::unique_ptr h_embedding_features_back(new float[embedding_features_size]);
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-  HCTR_LIB_THROW(cudaMemcpy(h_row_ptrs_back.get(), row_ptrs_tensor->get_ptr(),
-                            row_ptrs_size_in_bytes, cudaMemcpyDeviceToHost));
-  HCTR_LIB_THROW(cudaMemcpy(h_embedding_features_back.get(), embedding_features_tensor->get_ptr(),
-                            embedding_features_size_in_bytes, cudaMemcpyDeviceToHost));
-  ASSERT_TRUE(
-      test::compare_array_approx(h_row_ptrs.get(), h_row_ptrs_back.get(), row_ptrs_size, eps));
-  ASSERT_TRUE(test::compare_array_approx(
-      h_embedding_features.get(), h_embedding_features_back.get(), embedding_features_size, eps));
-
-  HCTR_LOG(INFO, ROOT, "Bind the tensor to preallocated buffer for the second time\n");
-  void* d_row_ptrs2 = allocator.allocate(row_ptrs_size_in_bytes);
-  HCTR_LIB_THROW(
-      cudaMemcpy(d_row_ptrs2, h_row_ptrs.get(), row_ptrs_size_in_bytes, cudaMemcpyHostToDevice));
-
-  std::shared_ptr row_ptrs_buff2 =
-      PreallocatedBuffer2::create(d_row_ptrs2, row_ptrs_dims);
-  bind_tensor_to_buffer(row_ptrs_dims, row_ptrs_buff2, row_ptrs_tensor);
-
-  // embedding_features: h_embedding_features, d_embedding_features, embedding_features_tensor
-  void* d_embedding_features2 = allocator.allocate(embedding_features_size_in_bytes);
-  HCTR_LIB_THROW(cudaMemcpy(d_embedding_features2, h_embedding_features.get(),
-                            embedding_features_size_in_bytes, cudaMemcpyHostToDevice));
-
-  std::shared_ptr embeddding_features_buff2 =
-      PreallocatedBuffer2::create(d_embedding_features2, embedding_features_dims);
-  bind_tensor_to_buffer(embedding_features_dims, embeddding_features_buff2,
-                        embedding_features_tensor);
-
-  // copy Tensor2 back to cpu and compare with original buffer
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-  HCTR_LIB_THROW(cudaMemcpy(h_row_ptrs_back.get(), row_ptrs_tensor->get_ptr(),
-                            row_ptrs_size_in_bytes, cudaMemcpyDeviceToHost));
-  HCTR_LIB_THROW(cudaMemcpy(h_embedding_features_back.get(), embedding_features_tensor->get_ptr(),
-                            embedding_features_size_in_bytes, cudaMemcpyDeviceToHost));
-  ASSERT_TRUE(
-      test::compare_array_approx(h_row_ptrs.get(), h_row_ptrs_back.get(), row_ptrs_size, eps));
-  ASSERT_TRUE(test::compare_array_approx(
-      h_embedding_features.get(), h_embedding_features_back.get(), embedding_features_size, eps));
-  // deallocate: d_row_ptrs2, d_embedding_features2
-  allocator.deallocate(d_row_ptrs);
-  allocator.deallocate(d_embedding_features);
-  allocator.deallocate(d_row_ptrs2);
-  allocator.deallocate(d_embedding_features2);
-}
-
-}  // namespace
-
-TEST(preallocated_buffer2, fp32_10x1x64_10) { preallocated_buffer2_test(10, 1, 64, 10); }
-TEST(preallocated_buffer2, fp32_10x10x64_1) { preallocated_buffer2_test(10, 10, 64, 1); }
-TEST(preallocated_buffer2, fp32_4096x26x64_1) { preallocated_buffer2_test(4096, 26, 64, 1); }
-TEST(preallocated_buffer2, fp32_4096x26x64_3) { preallocated_buffer2_test(4096, 26, 64, 3); }
\ No newline at end of file
diff --git a/test/utest/inference/session_inference_test.cpp b/test/utest/inference/session_inference_test.cpp
deleted file mode 100644
index 717d546b2e..0000000000
--- a/test/utest/inference/session_inference_test.cpp
+++ /dev/null
@@ -1,383 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-using namespace HugeCTR;
-
-namespace {
-
-const int RANGE[] = {0,       1460,    2018,    337396,  549106,  549411,  549431,
-                     561567,  562200,  562203,  613501,  618803,  951403,  954582,
-                     954609,  966800,  1268011, 1268021, 1272862, 1274948, 1274952,
-                     1599225, 1599242, 1599257, 1678991, 1679087, 1737709};
-
-std::vector& split(const std::string& s, char delim, std::vector& elems) {
-  std::istringstream is(s);
-  std::string item;
-  while (std::getline(is, item, delim)) {
-    elems.push_back(item);
-  }
-  return elems;
-}
-
-struct InferenceInfo {
-  int dense_dim;
-  std::vector slot_num;
-  std::vector max_feature_num_per_sample;
-  std::vector embedding_vec_size;
-  std::vector combiner_type;
-  InferenceInfo(const nlohmann::json& config);
-};
-
-InferenceInfo::InferenceInfo(const nlohmann::json& config) {
-  auto j_layers_array = get_json(config, "layers");
-  const nlohmann::json& j_data = j_layers_array[0];
-  auto j_dense = get_json(j_data, "dense");
-  dense_dim = get_value_from_json(j_dense, "dense_dim");
-  auto j_sparse_inputs = get_json(j_data, "sparse");
-
-  for (size_t i = 0; i < j_sparse_inputs.size(); i++) {
-    const nlohmann::json& j_sparse = j_sparse_inputs[0];
-    slot_num.push_back(get_value_from_json(j_sparse, "slot_num"));
-
-    size_t max_feature_num_per_sample_ =
-        static_cast(get_max_feature_num_per_sample_from_nnz_per_slot(j_sparse));
-
-    max_feature_num_per_sample.push_back(max_feature_num_per_sample_);
-  }
-  // get embedding params: embedding_vec_size, combiner_type
-  {
-    for (size_t i = 1; i < j_layers_array.size(); i++) {
-      // if not embedding then break
-      const nlohmann::json& j = j_layers_array[i];
-      auto embedding_name = get_value_from_json(j, "type");
-      if (embedding_name.compare("DistributedSlotSparseEmbeddingHash") != 0 &&
-          embedding_name.compare("LocalizedSlotSparseEmbeddingHash") != 0 &&
-          embedding_name.compare("LocalizedSlotSparseEmbeddingOneHot") != 0) {
-        break;
-      }
-      auto j_embed_params = get_json(j, "sparse_embedding_hparam");
-      auto vec_size = get_value_from_json(j_embed_params, "embedding_vec_size");
-      auto combiner_str = get_value_from_json(j_embed_params, "combiner");
-      embedding_vec_size.push_back(vec_size);
-      if (combiner_str == "mean") {
-        combiner_type.push_back(HugeCTR::EmbeddingFeatureCombiner_t::Mean);
-      } else {
-        combiner_type.push_back(HugeCTR::EmbeddingFeatureCombiner_t::Sum);
-      }
-    }  // for ()
-  }    // get embedding params
-}
-
-template
-void session_inference_criteo_test(const std::string& config_file, const std::string& model,
-                                   const std::string& criteo_data_path, int batchsize) {
-  InferenceInfo inference_info(read_json_file(config_file));
-  int batch_size = batchsize;
-  int dense_dim = inference_info.dense_dim;
-  int slot_num = inference_info.slot_num[0];
-  int max_feature_num_per_sample = inference_info.max_feature_num_per_sample[0];
-  int num_samples = 0;
-  std::vector labels;
-  std::vector dense_features;
-  std::vector keys;
-  std::vector row_ptrs;
-  CudaAllocator allocator;
-  CudaHostAllocator host_allocator;
-  HugeCTR::Timer timer_inference;
-
-  // open criteo data file
-  std::ifstream criteo_data_file(criteo_data_path, std::ifstream::binary);
-  if (!criteo_data_file.is_open()) {
-    HCTR_LOG_S(ERROR, WORLD) << "Cannot open " << criteo_data_path << std::endl;
-  }
-
-  // 4 lines: labels, dense_features, keys, row_ptrs
-  for (int i = 0; i < 4; i++) {
-    std::string line;
-    std::getline(criteo_data_file, line);
-    std::vector vec_string;
-    split(line, ' ', vec_string);
-    switch (i) {
-      case 0: {
-        num_samples = static_cast(vec_string.size());
-        for (int j = 0; j < num_samples; j++) {
-          int label = std::stoi(vec_string[j]);
-          labels.push_back(label);
-        }
-        break;
-      }
-      case 1: {
-        int dense_features_dim = static_cast(vec_string.size());
-        if (dense_features_dim != num_samples * dense_dim) {
-          HCTR_LOG_S(ERROR, WORLD)
-              << "dense_features_dim does not equal to num_samples*dense_dim" << std::endl;
-        }
-        for (int j = 0; j < dense_features_dim; j++) {
-          float dense_feature = std::stod(vec_string[j]);
-          dense_features.push_back(dense_feature);
-        }
-        break;
-      }
-      case 2: {
-        int keys_dim = static_cast(vec_string.size());
-        if (keys_dim != num_samples * slot_num) {
-          HCTR_LOG_S(ERROR, WORLD)
-              << "keys_dim does not equal to num_samples*slot_num" << std::endl;
-        }
-        for (int j = 0; j < keys_dim; j++) {
-          TypeHashKey key = static_cast(std::stoll(vec_string[j]));
-          keys.push_back(key);
-        }
-        break;
-      }
-      case 3: {
-        int row_ptrs_dim = static_cast(vec_string.size());
-        if (row_ptrs_dim != num_samples * slot_num + 1) {
-          HCTR_LOG_S(ERROR, WORLD)
-              << "row_ptrs_dim does not equal to num_samples*slot_num + 1" << std::endl;
-        }
-        for (int j = 0; j < row_ptrs_dim; j++) {
-          int row_ptr = std::stoi(vec_string[j]);
-          row_ptrs.push_back(row_ptr);
-        }
-        break;
-      }
-      default: {
-        assert(!"Error: Should never get here!");
-      }
-    }
-  }
-
-  if (batch_size == 0) {
-    HCTR_OWN_THROW(Error_t::WrongInput, "batch size should not be zero!");
-  }
-  num_samples = num_samples < batch_size ? num_samples : batch_size;
-
-  // d_row_ptrs
-  std::vector row_ptrs_dims = {static_cast(batch_size * slot_num + 1)};  // 1D
-  size_t row_ptrs_size = 1;
-  for (auto dim : row_ptrs_dims) {
-    row_ptrs_size *= dim;
-  }
-  size_t row_ptrs_size_samples = num_samples * slot_num + 1;
-  size_t row_ptrs_size_in_bytes = row_ptrs_size * sizeof(int);
-  size_t row_ptrs_size_in_bytes_samples = row_ptrs_size_samples * sizeof(int);
-  int* d_row_ptrs = reinterpret_cast(allocator.allocate(row_ptrs_size_in_bytes));
-
-  // d_dense_features
-  size_t dense_size = batch_size * dense_dim;
-  size_t dense_size_samples = num_samples * dense_dim;
-  size_t dense_size_in_bytes = dense_size * sizeof(float);
-  size_t dense_size_in_bytes_samples = dense_size_samples * sizeof(float);
-  float* d_dense_features = reinterpret_cast(allocator.allocate(dense_size_in_bytes));
-
-  // h_embeddingcolumns
-  size_t embeddingcolumns_size = batch_size * max_feature_num_per_sample;
-  size_t embeddingcolumns_size_samples = num_samples * max_feature_num_per_sample;
-  size_t embeddingcolumns_size_in_bytes = embeddingcolumns_size * sizeof(TypeHashKey);
-  size_t embeddingcolumns_size_in_bytes_samples =
-      embeddingcolumns_size_samples * sizeof(TypeHashKey);
-  void* h_embeddingcolumns = host_allocator.allocate(embeddingcolumns_size_in_bytes);
-  // TypeHashKey* h_keys = reinterpret_cast(h_embeddingcolumns);
-
-  // d_output
-  float* d_output = reinterpret_cast(allocator.allocate(batch_size * sizeof(float)));
-  std::unique_ptr h_out(new float[num_samples]);
-
-  // memory copy
-  HCTR_LIB_THROW(cudaMemcpy(d_row_ptrs, row_ptrs.data(), row_ptrs_size_in_bytes_samples,
-                            cudaMemcpyHostToDevice));
-  HCTR_LIB_THROW(cudaMemcpy(d_dense_features, dense_features.data(), dense_size_in_bytes_samples,
-                            cudaMemcpyHostToDevice));
-  memcpy(h_embeddingcolumns, keys.data(), embeddingcolumns_size_in_bytes_samples);
-
-  // inference session
-  std::string dense_model{"/hugectr/test/utest/_dense_10000.model"};
-  std::vector sparse_models{"/hugectr/test/utest/0_sparse_10000.model"};
-  InferenceParams infer_param(model, batchsize, 0.5, dense_model, sparse_models, 0, true, 0.8,
-                              false);
-  std::vector inference_params{infer_param};
-  std::vector model_config_path{config_file};
-  parameter_server_config ps_config{model_config_path, inference_params};
-  std::shared_ptr parameter_server =
-      HierParameterServerBase::create(ps_config);
-  auto embedding_cache = parameter_server->get_embedding_cache(inference_params[0].model_name,
-                                                               inference_params[0].device_id);
-  InferenceSession sess(model_config_path[0], inference_params[0], embedding_cache);
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-  timer_inference.start();
-  sess.predict(d_dense_features, h_embeddingcolumns, d_row_ptrs, d_output, num_samples);
-  timer_inference.stop();
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-  HCTR_LIB_THROW(
-      cudaMemcpy(h_out.get(), d_output, num_samples * sizeof(float), cudaMemcpyDeviceToHost));
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-
-  {
-    auto log = HCTR_LOG_S(INFO, WORLD);
-    log << "==========================labels===================" << std::endl;
-    for (int i = 0; i < num_samples; i++) {
-      log << labels[i] << " ";
-    }
-    log << std::endl;
-  }
-  {
-    auto log = HCTR_LOG_S(INFO, WORLD);
-    log << "==========================prediction result===================" << std::endl;
-    for (int i = 0; i < num_samples; i++) {
-      log << h_out[i] << " ";
-    }
-    log << std::endl;
-  }
-
-  HCTR_LOG_S(INFO, ROOT) << "Batch size: " << batch_size << ", Number samples: " << num_samples
-                         << ", Time: " << timer_inference.elapsedSeconds() << "s" << std::endl;
-
-  host_allocator.deallocate(h_embeddingcolumns);
-  allocator.deallocate(d_row_ptrs);
-  allocator.deallocate(d_dense_features);
-  allocator.deallocate(d_output);
-}
-
-template
-void session_inference_generated_test(const std::string& config_file, const std::string& model,
-                                      int num_samples, int batchsize) {
-  InferenceInfo inference_info(read_json_file(config_file));
-  int batch_size = batchsize;
-  int dense_dim = inference_info.dense_dim;
-  int slot_num = inference_info.slot_num[0];
-  int max_feature_num_per_sample = inference_info.max_feature_num_per_sample[0];
-  int max_nnz = max_feature_num_per_sample / slot_num;
-  num_samples = num_samples < batch_size ? num_samples : batch_size;
-  CudaAllocator allocator;
-  CudaHostAllocator host_allocator;
-  HugeCTR::Timer timer_inference;
-
-  // d_row_ptrs
-  std::vector row_ptrs_dims = {static_cast(batch_size * slot_num + 1)};  // 1D
-  size_t row_ptrs_size = 1;
-  for (auto dim : row_ptrs_dims) {
-    row_ptrs_size *= dim;
-  }
-  const size_t row_ptrs_size_samples = num_samples * slot_num + 1;
-  std::unique_ptr h_row_ptrs(new int[row_ptrs_size_samples]);
-  std::shared_ptr> ldata_sim;
-  ldata_sim.reset(new IntUniformDataSimulator(1, max_nnz));
-  h_row_ptrs[0] = 0;
-  for (size_t i = 1; i < row_ptrs_size_samples; i++) {
-    h_row_ptrs[i] = (h_row_ptrs[i - 1] + ldata_sim->get_num());
-  }
-  const size_t row_ptrs_size_in_bytes = row_ptrs_size * sizeof(int);
-  const size_t row_ptrs_size_in_bytes_samples = row_ptrs_size_samples * sizeof(int);
-  int* const d_row_ptrs = reinterpret_cast(allocator.allocate(row_ptrs_size_in_bytes));
-
-  // d_dense_features
-  const size_t dense_size = batch_size * dense_dim;
-  const size_t dense_size_samples = num_samples * dense_dim;
-  std::unique_ptr h_dense(new float[dense_size_samples]);
-  FloatUniformDataSimulator fdata_sim(0, 1);
-  for (size_t i = 0; i < dense_size_samples; i++) {
-    h_dense[i] = fdata_sim.get_num();
-  }
-  const size_t dense_size_in_bytes = dense_size * sizeof(float);
-  const size_t dense_size_in_bytes_samples = dense_size_samples * sizeof(float);
-  float* const d_dense_features = reinterpret_cast(allocator.allocate(dense_size_in_bytes));
-
-  // h_embeddingcolumns
-  const size_t embeddingcolumns_size = batch_size * max_feature_num_per_sample;
-  const size_t embeddingcolumns_size_in_bytes = embeddingcolumns_size * sizeof(TypeHashKey);
-  void* const h_embeddingcolumns = host_allocator.allocate(embeddingcolumns_size_in_bytes);
-  TypeHashKey* const h_keys = reinterpret_cast(h_embeddingcolumns);
-  for (int i = 0; i < num_samples; i++) {
-    for (int j = 0; j < slot_num; j++) {
-      ldata_sim.reset(new IntUniformDataSimulator(RANGE[j], RANGE[j + 1] - 1));
-      h_keys[i * slot_num + j] = static_cast(ldata_sim->get_num());
-    }
-  }
-
-  // memory copy
-  HCTR_LIB_THROW(cudaMemcpy(d_row_ptrs, h_row_ptrs.get(), row_ptrs_size_in_bytes_samples,
-                            cudaMemcpyHostToDevice));
-  HCTR_LIB_THROW(cudaMemcpy(d_dense_features, h_dense.get(), dense_size_in_bytes_samples,
-                            cudaMemcpyHostToDevice));
-
-  // d_output
-  float* const d_output = reinterpret_cast(allocator.allocate(batch_size * sizeof(float)));
-  std::unique_ptr h_out(new float[num_samples]);
-
-  // inference session
-  std::string dense_model{"/hugectr/test/utest/_dense_10000.model"};
-  std::vector sparse_models{"/hugectr/test/utest/0_sparse_10000.model"};
-  InferenceParams infer_param(model, batchsize, 0.5, dense_model, sparse_models, 0, true, 0.8,
-                              false);
-  std::vector inference_params{infer_param};
-  std::vector model_config_path{config_file};
-
-  parameter_server_config ps_config{model_config_path, inference_params};
-  std::shared_ptr parameter_server =
-      HierParameterServerBase::create(ps_config);
-  auto embedding_cache = parameter_server->get_embedding_cache(inference_params[0].model_name,
-                                                               inference_params[0].device_id);
-  InferenceSession sess(model_config_path[0], inference_params[0], embedding_cache);
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-  timer_inference.start();
-  sess.predict(d_dense_features, h_embeddingcolumns, d_row_ptrs, d_output, num_samples);
-  timer_inference.stop();
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-  HCTR_LIB_THROW(
-      cudaMemcpy(h_out.get(), d_output, num_samples * sizeof(float), cudaMemcpyDeviceToHost));
-  HCTR_LIB_THROW(cudaDeviceSynchronize());
-
-  {
-    auto log = HCTR_LOG_S(INFO, WORLD);
-    log << "==========================prediction result===================" << std::endl;
-    for (int i = 0; i < num_samples; i++) {
-      log << h_out[i] << " ";
-    }
-    log << std::endl;
-  }
-  HCTR_LOG_S(INFO, ROOT) << "Batch size: " << batch_size << ", Number samples: " << num_samples
-                         << ", Time: " << timer_inference.elapsedSeconds() << "s" << std::endl;
-  host_allocator.deallocate(h_embeddingcolumns);
-  allocator.deallocate(d_row_ptrs);
-  allocator.deallocate(d_dense_features);
-  allocator.deallocate(d_output);
-}
-
-}  // namespace
-
-TEST(session_inference, criteo_dcn) {
-  session_inference_criteo_test("/workdir/test/utest/simple_inference_config.json",
-                                "DCN", "/hugectr/test/utest/dcn_csr.txt", 32);
-}
-TEST(session_inference, generated_dcn_32) {
-  session_inference_generated_test("/workdir/test/utest/simple_inference_config.json",
-                                   "DCN", 32, 32);
-}
\ No newline at end of file
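
A note on the pattern this patch appends to every ONNX-converter training script above: each script
now writes the model's raw evaluation output to disk so that it can later be compared against the
output of the converted ONNX model. A minimal sketch of that pattern, assuming a hugectr.Model
named `model` that has already completed model.fit() with at least one evaluation pass, and an
output layer named "fc3" (the layer name and .npy path vary per script and are illustrative here):

    import hugectr
    import numpy as np

    # Fetch the contents of the named output tensor from the evaluation stage
    # (hugectr.Tensor_t.Evaluate) rather than the training stage. This only
    # yields useful data after at least one evaluation has run during fit().
    preds = model.check_out_tensor("fc3", hugectr.Tensor_t.Evaluate)

    # Persist the predictions as a .npy file; the converter test can reload
    # this array and compare it element-wise with the ONNX Runtime output.
    np.save("/onnx_converter/hugectr_models/din_try_preds.npy", preds)

This also appears to be the reason for the solver changes in the same scripts: with
max_eval_batches=1 the dumped array corresponds to a single evaluation batch of batchsize_eval
samples, and eval_interval is moved to coincide with the final snapshot iteration so that the
captured tensor reflects the fully trained weights.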