diff --git a/CHANGELOG.md b/CHANGELOG.md index d996f82f85..6c09cafa5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -131,6 +131,7 @@ - PR #2257: Update QN and LogisticRegression to use CumlArray - PR #2259: Add CumlArray support to Naive Bayes - PR #2252: Add benchmark for the Gram matrix prims +- PR #2263: Faster serialization for Treelite objects with RF - PR #2264: Reduce build time for cuML by using make_blobs from libcuml++ interface - PR #2269: Add docs targets to build.sh and fix python cuml.common docs - PR #2271: Clarify doc for `_unique` default implementation in OneHotEncoder diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 9e0da4a90a..597d057669 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -97,7 +97,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_CACHED export LD_LIBRARY_PATH_CACHED="" logger "Install Treelite for GPU testing..." -python -m pip install -v treelite==0.91 +python -m pip install -v treelite==0.92 treelite_runtime==0.92 cd $WORKSPACE diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8fe6ea708d..15326d3481 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -284,7 +284,7 @@ set(CUML_INCLUDE_DIRECTORIES ${CUB_DIR}/src/cub ${SPDLOG_DIR}/src/spdlog/include ${TREELITE_DIR}/include - ${TREELITE_DIR}/include/fmt + ${TREELITE_DIR}/src/treelite/include ${RAFT_DIR}/cpp/include) set(CUML_LINK_LIBRARIES diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake index e26ead0bfd..9d62170262 100644 --- a/cpp/cmake/Dependencies.cmake +++ b/cpp/cmake/Dependencies.cmake @@ -126,28 +126,26 @@ set(TREELITE_DIR ${CMAKE_CURRENT_BINARY_DIR}/treelite CACHE STRING "Path to treelite install directory") ExternalProject_Add(treelite GIT_REPOSITORY https://github.com/dmlc/treelite.git - GIT_TAG 6fd01e4f1890950bbcf9b124da24e886751bffe6 + GIT_TAG 0.92 PREFIX ${TREELITE_DIR} - CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF - -DCMAKE_INSTALL_PREFIX= - -DENABLE_PROTOBUF=ON + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} - BUILD_BYPRODUCTS ${TREELITE_DIR}/lib/libtreelite.a - ${TREELITE_DIR}/lib/libdmlc.a - ${TREELITE_DIR}/lib/libtreelite_runtime.so - UPDATE_COMMAND "" - PATCH_COMMAND patch -p1 -N < ${CMAKE_CURRENT_SOURCE_DIR}/cmake/treelite_protobuf.patch || true) + -DENABLE_PROTOBUF=ON + BUILD_BYPRODUCTS ${TREELITE_DIR}/src/treelite-build/libtreelite_static.a + ${TREELITE_DIR}/src/treelite-build/_deps/dmlccore-build/libdmlc.a + ${TREELITE_DIR}/src/treelite-build/libtreelite_runtime.so + UPDATE_COMMAND "") add_library(dmlclib STATIC IMPORTED) add_library(treelitelib STATIC IMPORTED) add_library(treelite_runtimelib SHARED IMPORTED) set_property(TARGET dmlclib PROPERTY - IMPORTED_LOCATION ${TREELITE_DIR}/lib/libdmlc.a) + IMPORTED_LOCATION ${TREELITE_DIR}/src/treelite-build/_deps/dmlccore-build/libdmlc.a) set_property(TARGET treelitelib PROPERTY - IMPORTED_LOCATION ${TREELITE_DIR}/lib/libtreelite.a) + IMPORTED_LOCATION ${TREELITE_DIR}/src/treelite-build/libtreelite_static.a) set_property(TARGET treelite_runtimelib PROPERTY - IMPORTED_LOCATION ${TREELITE_DIR}/lib/libtreelite_runtime.so) + IMPORTED_LOCATION ${TREELITE_DIR}/src/treelite-build/libtreelite_runtime.so) add_dependencies(dmlclib treelite) add_dependencies(treelitelib treelite) diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp index 84c951dd0c..545c5decad 100644 --- a/cpp/include/cuml/ensemble/randomforest.hpp +++ b/cpp/include/cuml/ensemble/randomforest.hpp @@ -132,10 +132,7 @@ void print_rf_detailed(const RandomForestMetaData* forest); template void build_treelite_forest(ModelHandle* model, const RandomForestMetaData* forest, - int num_features, int task_category, - std::vector& data); - -std::vector save_model(ModelHandle model); + int num_features, int task_category); ModelHandle concatenate_trees(std::vector treelite_handles); diff --git a/cpp/src/decisiontree/decisiontree_impl.cuh b/cpp/src/decisiontree/decisiontree_impl.cuh index 8e4d552c4d..ae339700a5 100644 --- a/cpp/src/decisiontree/decisiontree_impl.cuh +++ b/cpp/src/decisiontree/decisiontree_impl.cuh @@ -96,7 +96,6 @@ void build_treelite_tree(TreeBuilderHandle tree_builder, int num_output_group) { int node_id = 0; TREELITE_CHECK(TreeliteTreeBuilderCreateNode(tree_builder, node_id)); - TREELITE_CHECK(TreeliteTreeBuilderSetRootNode(tree_builder, node_id)); std::queue> cur_level_queue; std::queue> next_level_queue; @@ -138,7 +137,7 @@ void build_treelite_tree(TreeBuilderHandle tree_builder, TREELITE_CHECK(TreeliteTreeBuilderSetLeafNode( tree_builder, q_node.unique_node_id, q_node.node.prediction)); } else { - std::vector leaf_vector(num_output_group); + std::vector leaf_vector(num_output_group); for (int j = 0; j < num_output_group; j++) { if (q_node.node.prediction == j) { leaf_vector[j] = 1; @@ -157,6 +156,7 @@ void build_treelite_tree(TreeBuilderHandle tree_builder, // The cur_level_queue is empty here, as all the elements are already poped out. cur_level_queue.swap(next_level_queue); } + TREELITE_CHECK(TreeliteTreeBuilderSetRootNode(tree_builder, 0)); } /** diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 7f07a98b47..5c0b7df2af 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -367,34 +367,17 @@ void check_params(const forest_params_t* params, bool dense) { } } -// tl_node_at is a checked version of tree[i] -inline const tl::Tree::Node& tl_node_at(const tl::Tree& tree, size_t i) { - ASSERT(i < tree.num_nodes, "node index out of range"); - return tree[i]; -} - int tree_root(const tl::Tree& tree) { - // find the root - int root = -1; - for (int i = 0; i < tree.num_nodes; ++i) { - if (tl_node_at(tree, i).is_root()) { - ASSERT(root == -1, "multi-root trees not supported"); - root = i; - } - } - ASSERT(root != -1, "a tree must have a root"); - return root; + return 0; // Treelite format assumes that the root is 0 } -int max_depth_helper(const tl::Tree& tree, const tl::Tree::Node& node, - int limit) { - if (node.is_leaf()) return 0; +int max_depth_helper(const tl::Tree& tree, int node_id, int limit) { + if (tree.IsLeaf(node_id)) return 0; ASSERT(limit > 0, "recursion depth limit reached, might be a cycle in the tree"); return 1 + - std::max( - max_depth_helper(tree, tl_node_at(tree, node.cleft()), limit - 1), - max_depth_helper(tree, tl_node_at(tree, node.cright()), limit - 1)); + std::max(max_depth_helper(tree, tree.LeftChild(node_id), limit - 1), + max_depth_helper(tree, tree.RightChild(node_id), limit - 1)); } inline int max_depth(const tl::Tree& tree) { @@ -402,18 +385,18 @@ inline int max_depth(const tl::Tree& tree) { // e.g. cycles in the forest const int DEPTH_LIMIT = 500; int root_index = tree_root(tree); - typedef std::pair pair_t; + typedef std::pair pair_t; std::stack stack; - stack.push(pair_t(&tl_node_at(tree, root_index), 0)); + stack.push(pair_t(root_index, 0)); int max_depth = 0; while (!stack.empty()) { const pair_t& pair = stack.top(); - const tl::Tree::Node* node = pair.first; + int node_id = pair.first; int depth = pair.second; stack.pop(); - while (!node->is_leaf()) { - stack.push(pair_t(&tl_node_at(tree, node->cleft()), depth + 1)); - node = &tl_node_at(tree, node->cright()); + while (!tree.IsLeaf(node_id)) { + stack.push(pair_t(tree.LeftChild(node_id), depth + 1)); + node_id = tree.RightChild(node_id); depth++; ASSERT(depth < DEPTH_LIMIT, "depth limit reached, might be a cycle in the tree"); @@ -431,12 +414,12 @@ int max_depth(const tl::Model& model) { } inline void adjust_threshold(float* pthreshold, int* tl_left, int* tl_right, - bool* default_left, const tl::Tree::Node& node) { + bool* default_left, tl::Operator comparison_op) { // in treelite (take left node if val [op] threshold), // the meaning of the condition is reversed compared to FIL; // thus, "<" in treelite corresonds to comparison ">=" used by FIL // https://github.com/dmlc/treelite/blob/master/include/treelite/tree.h#L243 - switch (node.comparison_op()) { + switch (comparison_op) { case tl::Operator::kLT: break; case tl::Operator::kLE: @@ -480,9 +463,9 @@ int find_class_label_from_one_hot(tl::tl_float* vector, int len) { } template -void tl2fil_leaf_payload(fil_node_t* fil_node, const tl::Tree::Node& tl_node, - const forest_params_t& forest_params) { - auto vec = tl_node.leaf_vector(); +void tl2fil_leaf_payload(fil_node_t* fil_node, const tl::Tree& tl_tree, + int tl_node_id, const forest_params_t& forest_params) { + auto vec = tl_tree.LeafVector(tl_node_id); switch (forest_params.leaf_payload_type) { case leaf_value_t::INT_CLASS_LABEL: ASSERT(vec.size() == forest_params.num_classes, @@ -490,8 +473,8 @@ void tl2fil_leaf_payload(fil_node_t* fil_node, const tl::Tree::Node& tl_node, fil_node->val.idx = find_class_label_from_one_hot(&vec[0], vec.size()); break; case leaf_value_t::FLOAT_SCALAR: - fil_node->val.f = tl_node.leaf_value(); - ASSERT(tl_node.leaf_vector().size() == 0, + fil_node->val.f = tl_tree.LeafValue(tl_node_id); + ASSERT(!tl_tree.HasLeafVector(tl_node_id), "some but not all treelite leaves have leaf_vector()"); break; default: @@ -500,61 +483,61 @@ void tl2fil_leaf_payload(fil_node_t* fil_node, const tl::Tree::Node& tl_node, } void node2fil_dense(std::vector* pnodes, int root, int cur, - const tl::Tree& tree, const tl::Tree::Node& node, + const tl::Tree& tree, int node_id, const forest_params_t& forest_params) { - if (node.is_leaf()) { + if (tree.IsLeaf(node_id)) { dense_node_init(&(*pnodes)[root + cur], val_t{.f = NAN}, NAN, 0, false, true); - tl2fil_leaf_payload(&(*pnodes)[root + cur], node, forest_params); + tl2fil_leaf_payload(&(*pnodes)[root + cur], tree, node_id, forest_params); return; } // inner node - ASSERT(node.split_type() == tl::SplitFeatureType::kNumerical, + ASSERT(tree.SplitType(node_id) == tl::SplitFeatureType::kNumerical, "only numerical split nodes are supported"); - int tl_left = node.cleft(), tl_right = node.cright(); - bool default_left = node.default_left(); - float threshold = node.threshold(); - adjust_threshold(&threshold, &tl_left, &tl_right, &default_left, node); + int tl_left = tree.LeftChild(node_id), tl_right = tree.RightChild(node_id); + bool default_left = tree.DefaultLeft(node_id); + float threshold = tree.Threshold(node_id); + adjust_threshold(&threshold, &tl_left, &tl_right, &default_left, + tree.ComparisonOp(node_id)); dense_node_init(&(*pnodes)[root + cur], val_t{.f = 0}, threshold, - node.split_index(), default_left, false); + tree.SplitIndex(node_id), default_left, false); int left = 2 * cur + 1; - node2fil_dense(pnodes, root, left, tree, tl_node_at(tree, tl_left), - forest_params); - node2fil_dense(pnodes, root, left + 1, tree, tl_node_at(tree, tl_right), - forest_params); + node2fil_dense(pnodes, root, left, tree, tl_left, forest_params); + node2fil_dense(pnodes, root, left + 1, tree, tl_right, forest_params); } void tree2fil_dense(std::vector* pnodes, int root, const tl::Tree& tree, const forest_params_t& forest_params) { - node2fil_dense(pnodes, root, 0, tree, tl_node_at(tree, tree_root(tree)), - forest_params); + node2fil_dense(pnodes, root, 0, tree, tree_root(tree), forest_params); } int tree2fil_sparse(std::vector* pnodes, const tl::Tree& tree, const forest_params_t& forest_params) { - typedef std::pair pair_t; + typedef std::pair pair_t; std::stack stack; int root = pnodes->size(); pnodes->push_back(sparse_node_t()); - stack.push(pair_t(&tl_node_at(tree, tree_root(tree)), 0)); + stack.push(pair_t(tree_root(tree), 0)); while (!stack.empty()) { const pair_t& top = stack.top(); - const tl::Tree::Node* node = top.first; + int node_id = top.first; int cur = top.second; stack.pop(); - while (!node->is_leaf()) { + while (!tree.IsLeaf(node_id)) { // inner node - ASSERT(node->split_type() == tl::SplitFeatureType::kNumerical, + ASSERT(tree.SplitType(node_id) == tl::SplitFeatureType::kNumerical, "only numerical split nodes are supported"); // tl_left and tl_right are indices of the children in the treelite tree // (stored as an array of nodes) - int tl_left = node->cleft(), tl_right = node->cright(); - bool default_left = node->default_left(); - float threshold = node->threshold(); - adjust_threshold(&threshold, &tl_left, &tl_right, &default_left, *node); + int tl_left = tree.LeftChild(node_id), + tl_right = tree.RightChild(node_id); + bool default_left = tree.DefaultLeft(node_id); + float threshold = tree.Threshold(node_id); + adjust_threshold(&threshold, &tl_left, &tl_right, &default_left, + tree.ComparisonOp(node_id)); // reserve space for child nodes // left is the offset of the left child node relative to the tree root @@ -563,19 +546,20 @@ int tree2fil_sparse(std::vector* pnodes, const tl::Tree& tree, pnodes->push_back(sparse_node_t()); pnodes->push_back(sparse_node_t()); sparse_node_init_inline(&(*pnodes)[root + cur], val_t{.f = 0}, threshold, - node->split_index(), default_left, false, left); + tree.SplitIndex(node_id), default_left, false, + left); // push child nodes into the stack - stack.push(pair_t(&tl_node_at(tree, tl_right), left + 1)); - //stack.push(pair_t(&tl_node_at(tree, tl_left), left)); - node = &tl_node_at(tree, tl_left); + stack.push(pair_t(tl_right, left + 1)); + //stack.push(pair_t(tl_left, left)); + node_id = tl_left; cur = left; } // leaf node sparse_node_init(&(*pnodes)[root + cur], val_t{.f = NAN}, NAN, 0, false, true, 0); - tl2fil_leaf_payload(&(*pnodes)[root + cur], *node, forest_params); + tl2fil_leaf_payload(&(*pnodes)[root + cur], tree, node_id, forest_params); } return root; @@ -584,11 +568,10 @@ int tree2fil_sparse(std::vector* pnodes, const tl::Tree& tree, size_t tl_leaf_vector_size(const tl::Model& model) { const tl::Tree& tree = model.trees[0]; int node_key; - for (node_key = tree_root(tree); !tl_node_at(tree, node_key).is_leaf(); - node_key = tl_node_at(tree, node_key).cright()) + for (node_key = tree_root(tree); !tree.IsLeaf(node_key); + node_key = tree.RightChild(node_key)) ; - const tl::Tree::Node& node = tl_node_at(tree, node_key); - if (node.has_leaf_vector()) return node.leaf_vector().size(); + if (tree.HasLeafVector(node_key)) return tree.LeafVector(node_key).size(); return 0; } @@ -627,11 +610,11 @@ void tl2fil_common(forest_params_t* params, const tl::Model& model, if (model.random_forest_flag) { params->output = output_t(params->output | output_t::AVG); } - if (param.pred_transform == "sigmoid") { + if (std::string(param.pred_transform) == "sigmoid") { params->output = output_t(params->output | output_t::SIGMOID); - } else if (param.pred_transform != "identity") { + } else if (std::string(param.pred_transform) != "identity") { ASSERT(false, "%s: unsupported treelite prediction transform", - param.pred_transform.c_str()); + param.pred_transform); } params->num_trees = model.trees.size(); } diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 1a17bbe0f8..2cd67bae31 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -284,71 +284,40 @@ void print_rf_detailed(const RandomForestMetaData* forest) { template void build_treelite_forest(ModelHandle* model, const RandomForestMetaData* forest, - int num_features, int task_category, - std::vector& data) { - bool check_val = (data).empty(); - if (not check_val) { - // create a temp file - const char* filename = std::tmpnam(nullptr); - // write the model bytes into the temp file - { - std::ofstream file(filename, std::ios::binary); - file.write((char*)&data[0], data.size()); - } - // read the file as a protobuf model - TREELITE_CHECK(TreeliteLoadProtobufModel(filename, model)); + int num_features, int task_category) { + // Non-zero value here for random forest models. + // The value should be set to 0 if the model is gradient boosted trees. + int random_forest_flag = 1; + ModelBuilderHandle model_builder; + // num_output_group is 1 for binary classification and regression + // num_output_group is #class for multiclass classification which is the same as task_category + int num_output_group = task_category > 2 ? task_category : 1; + TREELITE_CHECK(TreeliteCreateModelBuilder( + num_features, num_output_group, random_forest_flag, &model_builder)); + + if (task_category > 2) { + // Multi-class classification + TREELITE_CHECK(TreeliteModelBuilderSetModelParam( + model_builder, "pred_transform", "max_index")); } - else { - // Non-zero value here for random forest models. - // The value should be set to 0 if the model is gradient boosted trees. - int random_forest_flag = 1; - ModelBuilderHandle model_builder; - // num_output_group is 1 for binary classification and regression - // num_output_group is #class for multiclass classification which is the same as task_category - int num_output_group = task_category > 2 ? task_category : 1; - TREELITE_CHECK(TreeliteCreateModelBuilder( - num_features, num_output_group, random_forest_flag, &model_builder)); - - if (task_category > 2) { - // Multi-class classification - TREELITE_CHECK(TreeliteModelBuilderSetModelParam( - model_builder, "pred_transform", "max_index")); - } + for (int i = 0; i < forest->rf_params.n_trees; i++) { + DecisionTree::TreeMetaDataNode* tree_ptr = &forest->trees[i]; + TreeBuilderHandle tree_builder; - for (int i = 0; i < forest->rf_params.n_trees; i++) { - DecisionTree::TreeMetaDataNode* tree_ptr = &forest->trees[i]; - TreeBuilderHandle tree_builder; + TREELITE_CHECK(TreeliteCreateTreeBuilder(&tree_builder)); + if (tree_ptr->sparsetree.size() != 0) { + DecisionTree::build_treelite_tree(tree_builder, tree_ptr, + num_output_group); - TREELITE_CHECK(TreeliteCreateTreeBuilder(&tree_builder)); - if (tree_ptr->sparsetree.size() != 0) { - DecisionTree::build_treelite_tree(tree_builder, tree_ptr, - num_output_group); - - // The third argument -1 means append to the end of the tree list. - TREELITE_CHECK( - TreeliteModelBuilderInsertTree(model_builder, tree_builder, -1)); - } + // The third argument -1 means append to the end of the tree list. + TREELITE_CHECK( + TreeliteModelBuilderInsertTree(model_builder, tree_builder, -1)); } - - TREELITE_CHECK(TreeliteModelBuilderCommitModel(model_builder, model)); - TREELITE_CHECK(TreeliteDeleteModelBuilder(model_builder)); } -} -std::vector save_model(ModelHandle model) { - // create a temp file - const char* filename = std::tmpnam(nullptr); - // export the treelite model to protobuf nd save it in the temp file - TreeliteExportProtobufModel(filename, model); - // read from the temp file and obtain the model bytes - std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary); - in.seekg(0, std::ios::end); - int size_of_file = in.tellg(); - vector bytes_info(size_of_file, 0); - ifstream infile(filename, ios::in | ios::binary); - infile.read((char*)&bytes_info[0], bytes_info.size()); - return bytes_info; + TREELITE_CHECK(TreeliteModelBuilderCommitModel(model_builder, model)); + TREELITE_CHECK(TreeliteDeleteModelBuilder(model_builder)); } /** @@ -367,31 +336,28 @@ void compare_trees(tl::Tree& tree_from_concatenated_forest, " the tree present in the individual forests"); for (int each_node = 0; each_node < tree_from_concatenated_forest.num_nodes; each_node++) { - tl::Tree::Node& node_from_concat = tree_from_concatenated_forest[each_node]; - tl::Tree::Node& node_from_indiv = tree_from_individual_forest[each_node]; - ASSERT(node_from_concat.is_root() == node_from_indiv.is_root(), - "Error! root position mismatch between concatenated forest and the" - " individual forests "); - ASSERT(node_from_concat.parent() == node_from_indiv.parent(), - "Error! node parent mismatch between concatenated forest and the" - " individual forests "); - ASSERT(node_from_concat.is_leaf() == node_from_indiv.is_leaf(), + ASSERT(tree_from_concatenated_forest.IsLeaf(each_node) == + tree_from_individual_forest.IsLeaf(each_node), "Error! mismatch in the position of a leaf between concatenated " "forest and the" " individual forests "); - ASSERT(node_from_concat.leaf_value() == node_from_indiv.leaf_value(), + ASSERT(tree_from_concatenated_forest.LeafValue(each_node) == + tree_from_individual_forest.LeafValue(each_node), "Error! leaf value mismatch between concatenated forest and the" " individual forests "); - ASSERT(node_from_concat.cright() == node_from_indiv.cright(), + ASSERT(tree_from_concatenated_forest.RightChild(each_node) == + tree_from_individual_forest.RightChild(each_node), "Error! mismatch in the position of the node between concatenated " "forest and the" " individual forests "); - ASSERT(node_from_concat.cleft() == node_from_indiv.cleft(), + ASSERT(tree_from_concatenated_forest.LeftChild(each_node) == + tree_from_individual_forest.LeftChild(each_node), "Error! mismatch in the position of the node between concatenated " "forest and the" " individual forests "); ASSERT( - node_from_concat.split_index() == node_from_indiv.split_index(), + tree_from_concatenated_forest.SplitIndex(each_node) == + tree_from_individual_forest.SplitIndex(each_node), "Error! split index value mismatch between concatenated forest and the" " individual forests "); } @@ -462,8 +428,9 @@ ModelHandle concatenate_trees(std::vector treelite_handles) { tl::Model* concat_model = new tl::Model; for (int forest_idx = 0; forest_idx < treelite_handles.size(); forest_idx++) { tl::Model& model = *(tl::Model*)treelite_handles[forest_idx]; - concat_model->trees.insert(concat_model->trees.end(), model.trees.begin(), - model.trees.end()); + for (const tl::Tree& tree : model.trees) { + concat_model->trees.push_back(tree.Clone()); + } } concat_model->num_feature = first_model.num_feature; concat_model->num_output_group = first_model.num_output_group; @@ -785,14 +752,14 @@ template void delete_rf_metadata( template void build_treelite_forest( ModelHandle* model, const RandomForestMetaData* forest, - int num_features, int task_category, std::vector& data); + int num_features, int task_category); template void build_treelite_forest( ModelHandle* model, const RandomForestMetaData* forest, - int num_features, int task_category, std::vector& data); + int num_features, int task_category); template void build_treelite_forest( ModelHandle* model, const RandomForestMetaData* forest, - int num_features, int task_category, std::vector& data); + int num_features, int task_category); template void build_treelite_forest( ModelHandle* model, const RandomForestMetaData* forest, - int num_features, int task_category, std::vector& data); + int num_features, int task_category); } // End namespace ML diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 9fafc3513c..9f7c9f4f50 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -450,7 +450,7 @@ class TreeliteFilTest : public BaseFilTest { int node_to_treelite(tlf::TreeBuilder* builder, int* pkey, int root, int node) { int key = (*pkey)++; - TL_CPP_CHECK(builder->CreateNode(key)); + builder->CreateNode(key); int feature; float threshold; fil::val_t output; @@ -461,13 +461,13 @@ class TreeliteFilTest : public BaseFilTest { switch (ps.leaf_payload_type) { case fil::leaf_value_t::FLOAT_SCALAR: // default is fil::FLOAT_SCALAR - TL_CPP_CHECK(builder->SetLeafNode(key, output.f)); + builder->SetLeafNode(key, output.f); break; case fil::leaf_value_t::INT_CLASS_LABEL: std::vector vec(ps.num_classes); for (int i = 0; i < ps.num_classes; ++i) vec[i] = i == output.idx ? 1.0f : 0.0f; - TL_CPP_CHECK(builder->SetLeafVectorNode(key, vec)); + builder->SetLeafVectorNode(key, vec); } } else { int left = root + 2 * (node - root) + 1; @@ -494,8 +494,8 @@ class TreeliteFilTest : public BaseFilTest { } int left_key = node_to_treelite(builder, pkey, root, left); int right_key = node_to_treelite(builder, pkey, root, right); - TL_CPP_CHECK(builder->SetNumericalTestNode( - key, feature, ps.op, threshold, default_left, left_key, right_key)); + builder->SetNumericalTestNode(key, feature, ps.op, threshold, + default_left, left_key, right_key); } return key; } @@ -527,14 +527,14 @@ class TreeliteFilTest : public BaseFilTest { int key_counter = 0; int root = i_tree * tree_num_nodes(); int root_key = node_to_treelite(tree_builder, &key_counter, root, root); - TL_CPP_CHECK(tree_builder->SetRootNode(root_key)); + tree_builder->SetRootNode(root_key); // InsertTree() consumes tree_builder TL_CPP_CHECK(model_builder->InsertTree(tree_builder)); } // commit the model std::unique_ptr model(new tl::Model); - TL_CPP_CHECK(model_builder->CommitModel(model.get())); + model_builder->CommitModel(model.get()); // init FIL forest with the model fil::treelite_params_t params; diff --git a/cpp/test/sg/rf_treelite_test.cu b/cpp/test/sg/rf_treelite_test.cu index b7d3162f79..eb8d254fa7 100644 --- a/cpp/test/sg/rf_treelite_test.cu +++ b/cpp/test/sg/rf_treelite_test.cu @@ -342,7 +342,6 @@ class RfConcatTestClf : public RfTreeliteTestCommon { for (int i = 0; i < 3; i++) { ModelHandle model; - std::vector vec_data; this->rf_params.n_trees = this->rf_params.n_trees + i; @@ -350,7 +349,7 @@ class RfConcatTestClf : public RfTreeliteTestCommon { this->params.n_rows, this->params.n_cols, this->labels_d, labels_map.size(), this->rf_params); build_treelite_forest(&model, this->all_forest_info[i], - this->params.n_cols, this->task_category, vec_data); + this->params.n_cols, this->task_category); this->treelite_indiv_handles.push_back(model); } @@ -412,14 +411,14 @@ class RfConcatTestReg : public RfTreeliteTestCommon { for (int i = 0; i < 3; i++) { ModelHandle model; - std::vector vec_data; + this->rf_params.n_trees = this->rf_params.n_trees + i; fit(*(this->handle), this->all_forest_info[i], this->data_d, this->params.n_rows, this->params.n_cols, this->labels_d, this->rf_params); build_treelite_forest(&model, this->all_forest_info[i], - this->params.n_cols, this->task_category, vec_data); + this->params.n_cols, this->task_category); CUDA_CHECK(cudaStreamSynchronize(this->stream)); this->treelite_indiv_handles.push_back(model); } diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index 058e1c1b95..4e3d34d994 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -44,7 +44,6 @@ if has_treelite(): import treelite - import treelite.runtime if has_umap(): import umap @@ -193,11 +192,10 @@ def _treelite_format_hook(data): from cuml.common.import_utils import has_treelite if has_treelite(): - import treelite - import treelite.runtime + import treelite_runtime else: raise ImportError("No treelite package found") - return treelite.runtime.Batch.from_npy2d(data[0]), data[1] + return treelite_runtime.Batch.from_npy2d(data[0]), data[1] def all_algorithms(): diff --git a/python/cuml/benchmark/bench_helper_funcs.py b/python/cuml/benchmark/bench_helper_funcs.py index ccb81ee9ab..58c75fee81 100644 --- a/python/cuml/benchmark/bench_helper_funcs.py +++ b/python/cuml/benchmark/bench_helper_funcs.py @@ -149,7 +149,7 @@ def _build_treelite_classifier(m, data, args, tmpdir): from cuml.common.import_utils import has_treelite, has_xgboost if has_treelite(): import treelite - import treelite.runtime + import treelite_runtime else: raise ImportError("No treelite package found") if has_xgboost(): @@ -168,10 +168,11 @@ def _build_treelite_classifier(m, data, args, tmpdir): bst.load_model(model_path) tl_model = treelite.Model.from_xgboost(bst) tl_model.export_lib( - toolchain="gcc", libpath=model_path+"treelite.so", + toolchain="gcc", libpath=os.path.join(tmpdir, 'treelite.so'), params={'parallel_comp': 40}, verbose=False ) - return treelite.runtime.Predictor(model_path+"treelite.so", verbose=False) + return treelite_runtime.Predictor(os.path.join(tmpdir, 'treelite.so'), + verbose=False) def _treelite_fil_accuracy_score(y_true, y_pred): diff --git a/python/cuml/dask/ensemble/base.py b/python/cuml/dask/ensemble/base.py index df7394da86..fe94321db0 100644 --- a/python/cuml/dask/ensemble/base.py +++ b/python/cuml/dask/ensemble/base.py @@ -107,17 +107,19 @@ def _concat_treelite_models(self): to create a single model. The concatenated model is then converted to bytes format. """ - model_protobuf_futures = list() + model_serialized_futures = list() for w in self.workers: - model_protobuf_futures.append( - dask.delayed(_get_protobuf_bytes) + model_serialized_futures.append( + dask.delayed(_get_serialized_model) (self.rfs[w])) - mod_bytes = self.client.compute(model_protobuf_futures, sync=True) + mod_bytes = self.client.compute(model_serialized_futures, sync=True) last_worker = w all_tl_mod_handles = [] model = self.rfs[last_worker].result() - all_tl_mod_handles = [model._tl_model_handles(pbuf_bytes) - for pbuf_bytes in mod_bytes] + all_tl_mod_handles = [ + model._tl_handle_from_bytes(indiv_worker_model_bytes) + for indiv_worker_model_bytes in mod_bytes + ] model._concatenate_treelite_handle(all_tl_mod_handles) for tl_handle in all_tl_mod_handles: @@ -194,5 +196,5 @@ def _func_set_params(model, **params): return model.set_params(**params) -def _get_protobuf_bytes(model): - return model._get_protobuf_bytes() +def _get_serialized_model(model): + return model._get_serialized_model() diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd index 0531685efa..97a51ad6bd 100644 --- a/python/cuml/ensemble/randomforest_shared.pxd +++ b/python/cuml/ensemble/randomforest_shared.pxd @@ -41,8 +41,6 @@ cimport cuml.common.cuda cdef extern from "treelite/c_api.h": ctypedef void* ModelHandle ctypedef void* ModelBuilderHandle - cdef int TreeliteExportProtobufModel(const char* filename, - ModelHandle model) cdef const char* TreeliteGetLastError() cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": @@ -99,10 +97,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": cdef void build_treelite_forest[T, L](ModelHandle*, RandomForestMetaData[T, L]*, int, - int, - vector[unsigned char] &) except + - - cdef vector[unsigned char] save_model_protobuf(ModelHandle) except + + int) except + cdef void delete_rf_metadata[T, L](RandomForestMetaData[T, L]*) except + cdef void print_rf_summary[T, L](RandomForestMetaData[T, L]*) except + diff --git a/python/cuml/ensemble/randomforest_shared.pyx b/python/cuml/ensemble/randomforest_shared.pyx new file mode 100644 index 0000000000..c8fe539f59 --- /dev/null +++ b/python/cuml/ensemble/randomforest_shared.pyx @@ -0,0 +1,123 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from libcpp.vector cimport vector +from cython.operator cimport dereference as deref, preincrement as inc +from cpython.object cimport PyObject +from libc.stdint cimport uintptr_t +from typing import Tuple, Dict, List, Union +import numpy as np + +cdef extern from "treelite/tree.h" namespace "treelite": + cdef struct PyBufferFrame: + void* buf + char* format + size_t itemsize + size_t nitem + cdef cppclass Model: + vector[PyBufferFrame] GetPyBuffer() except + + void InitFromPyBuffer(vector[PyBufferFrame] frames) except + + +cdef extern from "Python.h": + Py_buffer* PyMemoryView_GET_BUFFER(PyObject* mview) + +cdef class PyBufferFrameWrapper: + cdef PyBufferFrame _handle + cdef Py_ssize_t shape[1] + cdef Py_ssize_t strides[1] + + def __cinit__(self): + pass + + def __dealloc__(self): + pass + + def __getbuffer__(self, Py_buffer* buffer, int flags): + cdef Py_ssize_t itemsize = self._handle.itemsize + + self.shape[0] = self._handle.nitem + self.strides[0] = itemsize + + buffer.buf = self._handle.buf + buffer.format = self._handle.format + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self._handle.nitem * itemsize + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass + +cdef PyBufferFrameWrapper MakePyBufferFrameWrapper(PyBufferFrame handle): + cdef PyBufferFrameWrapper wrapper = PyBufferFrameWrapper() + wrapper._handle = handle + return wrapper + +cdef list _get_frames(ModelHandle model): + return [memoryview(MakePyBufferFrameWrapper(v)) + for v in (model).GetPyBuffer()] + +cdef ModelHandle _init_from_frames(vector[PyBufferFrame] frames) except *: + cdef Model* model_obj = new Model() + model_obj.InitFromPyBuffer(frames) + return model_obj + + +def get_frames(model: uintptr_t) -> List[memoryview]: + return _get_frames( model) + + +def init_from_frames(frames: List[np.ndarray], + format_str: List[str], itemsize: List[int]) -> uintptr_t: + cdef vector[PyBufferFrame] cpp_frames + cdef Py_buffer* buf + cdef PyBufferFrame cpp_frame + format_bytes = [s.encode('utf-8') for s in format_str] + for i, frame in enumerate(frames): + x = memoryview(frame) + buf = PyMemoryView_GET_BUFFER(x) + cpp_frame.buf = buf.buf + cpp_frame.format = format_bytes[i] + cpp_frame.itemsize = itemsize[i] + cpp_frame.nitem = buf.len // itemsize[i] + cpp_frames.push_back(cpp_frame) + return _init_from_frames(cpp_frames) + + +def treelite_serialize( + model: uintptr_t +) -> Dict[str, Union[List[str], List[np.ndarray]]]: + frames = get_frames(model) + header = {'format_str': [x.format for x in frames], + 'itemsize': [x.itemsize for x in frames]} + return {'header': header, 'frames': [np.asarray(x) for x in frames]} + + +def treelite_deserialize( + payload: Dict[str, Union[List[str], List[bytes]]] +) -> uintptr_t: + header, frames = payload['header'], payload['frames'] + return init_from_frames(frames, header['format_str'], header['itemsize']) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index bb38f03ad9..8b5a873e3c 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -19,7 +19,6 @@ # cython: embedsignature = True # cython: language_level = 3 -import ctypes import cudf import cupy as cp import math @@ -42,6 +41,8 @@ from cuml.common.handle cimport cumlHandle from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model from cuml.ensemble.randomforest_shared cimport * +from cuml.ensemble.randomforest_shared import treelite_serialize, \ + treelite_deserialize from cuml.fil.fil import TreeliteModel from cuml.common import input_to_cuml_array, rmm_cupy_ary from cuml.common import get_cudf_column_ptr, zeros @@ -304,7 +305,7 @@ class RandomForestClassifier(Base): "random_seed is set") self.rf_forest = 0 self.rf_forest64 = 0 - self.model_pbuf_bytes = bytearray() + self.treelite_serialized_model = None """ TODO: @@ -321,7 +322,7 @@ class RandomForestClassifier(Base): cdef size_t params_t64 if self.n_cols: # only if model has been fit previously - self._get_protobuf_bytes() # Ensure we have this cached + self._get_serialized_model() # Ensure we have this cached if self.rf_forest: params_t = self.rf_forest rf_forest = \ @@ -336,7 +337,7 @@ class RandomForestClassifier(Base): state['n_cols'] = self.n_cols state["verbose"] = self.verbose - state["model_pbuf_bytes"] = self.model_pbuf_bytes + state["treelite_serialized_model"] = self.treelite_serialized_model state["treelite_handle"] = None state['handle'] = self.handle @@ -358,7 +359,7 @@ class RandomForestClassifier(Base): rf_forest64.rf_params = state["rf_params64"] state["rf_forest64"] = rf_forest64 - self.model_pbuf_bytes = state["model_pbuf_bytes"] + self.treelite_serialized_model = state["treelite_serialized_model"] self.__dict__.update(state) def __del__(self): @@ -381,7 +382,7 @@ class RandomForestClassifier(Base): TreeliteModel.free_treelite_model(self.treelite_handle) self.treelite_handle = None - self.model_pbuf_bytes = bytearray() + self.treelite_serialized_model = None self.n_cols = None def _get_max_feat_val(self): @@ -405,12 +406,11 @@ class RandomForestClassifier(Base): delete the returned model.""" if self.treelite_handle is not None: return self.treelite_handle # Cached version - - cdef ModelHandle cuml_model_ptr = NULL + cdef ModelHandle tl_handle = NULL cdef RandomForestMetaData[float, int] *rf_forest = \ self.rf_forest - assert len(self.model_pbuf_bytes) > 0 or self.rf_forest, \ + assert self.treelite_serialized_model or self.rf_forest, \ "Attempting to create treelite from un-fit forest." if self.num_classes > 2: @@ -418,51 +418,39 @@ class RandomForestClassifier(Base): "classification models is currently not " "implemented. Please check cuml issue " "#1679 for more information.") - cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes - cdef vector[unsigned char] model_pbuf_vec - with cython.boundscheck(False): - model_pbuf_vec.assign(& model_pbuf_mv[0], - & model_pbuf_mv[model_pbuf_mv.shape[0]]) - - task_category = CLASSIFICATION_MODEL - build_treelite_forest( - & cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_pbuf_vec) - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value + if self.treelite_serialized_model: # bytes -> Treelite + tl_handle = treelite_deserialize( + self.treelite_serialized_model) + else: # RF -> Treelite + task_category = CLASSIFICATION_MODEL + build_treelite_forest( + &tl_handle, + rf_forest, + self.n_cols, + task_category) + self.treelite_handle = tl_handle return self.treelite_handle - def _get_protobuf_bytes(self): + def _get_serialized_model(self): """ - Returns the self.model_pbuf_bytes. + Returns the self.treelite_serialized_model. Cuml RF model gets converted to treelite protobuf bytes by: 1. converting the cuml RF model to a treelite model. The treelite models handle (pointer) is returned - 2. The treelite model handle is used to convert the treelite model - to a treelite protobuf model which is stored in a temporary file. - The protobuf model information is read from the temporary file and - the byte information is returned. - The treelite handle is stored `self.treelite_handle` and the treelite - protobuf model bytes are stored in `self.model_pbuf_bytes`. If either - of information is already present in the model then the respective - step is skipped. + 2. The treelite model handle is converted to bytes. + The treelite model bytes are stored in + `self.treelite_serialized_model`. If the model bytes are present, we + can skip _obtain_treelite_handle(). """ - if self.model_pbuf_bytes: - return self.model_pbuf_bytes + if self.treelite_serialized_model: + return self.treelite_serialized_model elif self.treelite_handle: fit_mod_ptr = self.treelite_handle else: fit_mod_ptr = self._obtain_treelite_handle() cdef uintptr_t model_ptr = fit_mod_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - return self.model_pbuf_bytes + self.treelite_serialized_model = treelite_serialize(model_ptr) + return self.treelite_serialized_model def convert_to_treelite_model(self): """ @@ -533,19 +521,11 @@ class RandomForestClassifier(Base): TODO : Move functions duplicated in the RF classifier and regressor to a shared file. Cuml issue #1854 has been created to track this. """ - def _tl_model_handles(self, model_bytes): - cdef ModelHandle cuml_model_ptr = NULL - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest - task_category = CLASSIFICATION_MODEL - build_treelite_forest(& cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_bytes) - mod_handle = cuml_model_ptr - - return ctypes.c_void_p(mod_handle).value + def _tl_handle_from_bytes(self, treelite_serialized_model): + if not treelite_serialized_model: + raise ValueError( + '_tl_handle_from_bytes() requires non-empty serialized model') + return treelite_deserialize(treelite_serialized_model) def _concatenate_treelite_handle(self, treelite_handle): cdef ModelHandle concat_model_handle = NULL @@ -561,12 +541,7 @@ class RandomForestClassifier(Base): concat_model_handle = concatenate_trees(deref(model_handles)) cdef uintptr_t concat_model_ptr = concat_model_handle self.treelite_handle = concat_model_ptr - - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( concat_model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + self.treelite_serialized_model = treelite_serialize(concat_model_ptr) # Fix up some instance variables that should match the new TL model tl_model = TreeliteModel.from_treelite_model_handle( @@ -710,7 +685,6 @@ class RandomForestClassifier(Base): num_classes, convert_dtype, fil_sparse_format, predict_proba): out_type = self._get_output_type(X) - cdef ModelHandle cuml_model_ptr = NULL _, n_rows, n_cols, dtype = \ input_to_cuml_array(X, order='F', check_cols=self.n_cols) @@ -1160,7 +1134,7 @@ class RandomForestClassifier(Base): ----------- params : dict of new params """ - self.model_pbuf_bytes = [] + self.treelite_serialized_model = None if not params: return self diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index a648ef1d10..293d4a7ffb 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -19,7 +19,6 @@ # cython: embedsignature = True # cython: language_level = 3 -import ctypes import cudf import math import numpy as np @@ -39,6 +38,8 @@ from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model from cuml.ensemble.randomforest_shared cimport * +from cuml.ensemble.randomforest_shared import treelite_serialize, \ + treelite_deserialize from cuml.fil.fil import TreeliteModel from cuml.common import input_to_cuml_array, input_to_dev_array, \ zeros, get_cudf_column_ptr @@ -277,10 +278,10 @@ class RandomForestRegressor(Base): self.max_depth = max_depth self.max_features = max_features self.bootstrap = bootstrap + self.treelite_handle = None self.n_bins = n_bins self.n_cols = None self.dtype = None - self.treelite_handle = None self.accuracy_metric = accuracy_metric self.quantile_per_tree = quantile_per_tree self.n_streams = handle.getNumInternalStreams() @@ -290,7 +291,7 @@ class RandomForestRegressor(Base): " the exact same results at this time.") self.rf_forest = None self.rf_forest64 = None - self.model_pbuf_bytes = bytearray() + self.treelite_serialized_model = None """ TODO: @@ -305,7 +306,7 @@ class RandomForestRegressor(Base): cdef size_t params_t64 if self.n_cols: # only if model has been fit previously - self._get_protobuf_bytes() # Ensure we have this cached + self._get_serialized_model() # Ensure we have this cached if self.rf_forest: params_t = self.rf_forest rf_forest = \ @@ -320,7 +321,7 @@ class RandomForestRegressor(Base): state['n_cols'] = self.n_cols state["verbose"] = self.verbose - state["model_pbuf_bytes"] = self.model_pbuf_bytes + state["treelite_serialized_model"] = self.treelite_serialized_model state['handle'] = self.handle state["treelite_handle"] = None @@ -342,7 +343,7 @@ class RandomForestRegressor(Base): rf_forest64.rf_params = state["rf_params64"] state["rf_forest64"] = rf_forest64 - self.model_pbuf_bytes = state["model_pbuf_bytes"] + self.treelite_serialized_model = state["treelite_serialized_model"] self.__dict__.update(state) def __del__(self): @@ -365,7 +366,7 @@ class RandomForestRegressor(Base): TreeliteModel.free_treelite_model(self.treelite_handle) self.treelite_handle = None - self.model_pbuf_bytes = bytearray() + self.treelite_serialized_model = None self.n_cols = None def _get_max_feat_val(self): @@ -391,58 +392,45 @@ class RandomForestRegressor(Base): delete the returned model.""" if self.treelite_handle is not None: return self.treelite_handle # Cached version - - cdef ModelHandle cuml_model_ptr = NULL + cdef ModelHandle tl_handle = NULL cdef RandomForestMetaData[float, float] *rf_forest = \ self.rf_forest - assert len(self.model_pbuf_bytes) > 0 or self.rf_forest, \ + assert self.treelite_serialized_model or self.rf_forest, \ "Attempting to create treelite from un-fit forest." - cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes - cdef vector[unsigned char] model_pbuf_vec - with cython.boundscheck(False): - model_pbuf_vec.assign(& model_pbuf_mv[0], - & model_pbuf_mv[model_pbuf_mv.shape[0]]) - - task_category = REGRESSION_MODEL - build_treelite_forest( - & cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_pbuf_vec) - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value + if self.treelite_serialized_model: # bytes -> Treelite + tl_handle = treelite_deserialize( + self.treelite_serialized_model) + else: # RF -> Treelite + task_category = REGRESSION_MODEL + build_treelite_forest( + &tl_handle, + rf_forest, + self.n_cols, + task_category) + self.treelite_handle = tl_handle return self.treelite_handle - def _get_protobuf_bytes(self): + def _get_serialized_model(self): """ - Returns the self.model_pbuf_bytes. + Returns the self.treelite_serialized_model. Cuml RF model gets converted to treelite protobuf bytes by: 1. converting the cuml RF model to a treelite model. The treelite models handle (pointer) is returned - 2. The treelite model handle is used to convert the treelite model - to a treelite protobuf model which is stored in a temporary file. - The protobuf model information is read from the temporary file and - the byte information is returned. - The treelite handle is stored `self.treelite_handle` and the treelite - protobuf model bytes are stored in `self.model_pbuf_bytes`. If either - of information is already present in the model then the respective - step is skipped. + 2. The treelite model handle is converted to bytes. + The treelite model bytes are stored in + `self.treelite_serialized_model`. If the model bytes are present, we + can skip _obtain_treelite_handle(). """ - if self.model_pbuf_bytes: - return self.model_pbuf_bytes + if self.treelite_serialized_model: + return self.treelite_serialized_model elif self.treelite_handle: fit_mod_ptr = self.treelite_handle else: fit_mod_ptr = self._obtain_treelite_handle() cdef uintptr_t model_ptr = fit_mod_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - return self.model_pbuf_bytes + self.treelite_serialized_model = treelite_serialize(model_ptr) + return self.treelite_serialized_model def convert_to_treelite_model(self): """ @@ -510,19 +498,11 @@ class RandomForestRegressor(Base): TODO : Move functions duplicated in the RF classifier and regressor to a shared file. Cuml issue #1854 has been created to track this. """ - def _tl_model_handles(self, model_bytes): - task_category = REGRESSION_MODEL - cdef ModelHandle tl_model_ptr = NULL - cdef RandomForestMetaData[float, float] *rf_forest = \ - self.rf_forest - build_treelite_forest(& tl_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_bytes) - mod_handle = tl_model_ptr - - return ctypes.c_void_p(mod_handle).value + def _tl_handle_from_bytes(self, treelite_serialized_model): + if not treelite_serialized_model: + raise ValueError( + '_tl_handle_from_bytes() requires non-empty serialized model') + return treelite_deserialize(treelite_serialized_model) def _concatenate_treelite_handle(self, treelite_handle): cdef ModelHandle concat_model_handle = NULL @@ -538,11 +518,7 @@ class RandomForestRegressor(Base): concat_model_handle = concatenate_trees(deref(model_handles)) cdef uintptr_t concat_model_ptr = concat_model_handle self.treelite_handle = concat_model_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( concat_model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + self.treelite_serialized_model = treelite_serialize(concat_model_ptr) # Fix up some instance variables that should match the new TL model tl_model = TreeliteModel.from_treelite_model_handle( @@ -659,7 +635,6 @@ class RandomForestRegressor(Base): def _predict_model_on_gpu(self, X, algo, convert_dtype, fil_sparse_format): out_type = self._get_output_type(X) - cdef ModelHandle cuml_model_ptr = NULL _, n_rows, n_cols, dtype = \ input_to_cuml_array(X, order='F', check_cols=self.n_cols) @@ -933,7 +908,7 @@ class RandomForestRegressor(Base): ----------- params : dict of new params """ - self.model_pbuf_bytes = [] + self.treelite_serialized_model = None if not params: return self for key, value in params.items():