diff --git a/fate_client b/fate_client index 72594a7c4c..bbf204c382 160000 --- a/fate_client +++ b/fate_client @@ -1 +1 @@ -Subproject commit 72594a7c4cf5d8f21c0baf7acb8fc303b32482b3 +Subproject commit bbf204c382e4a8f77fec7706509c9cf32fa85446 diff --git a/python/fate/components/components/__init__.py b/python/fate/components/components/__init__.py index a33a031132..e0dd6442e0 100644 --- a/python/fate/components/components/__init__.py +++ b/python/fate/components/components/__init__.py @@ -80,7 +80,7 @@ def homo_lr(self): @_lazy_cpn def hetero_sbt(self): - from .hetero_sbt import hetero_sbt + from .hetero_secureboost import hetero_sbt return hetero_sbt diff --git a/python/fate/components/components/hetero_sbt.py b/python/fate/components/components/hetero_secureboost.py similarity index 91% rename from python/fate/components/components/hetero_sbt.py rename to python/fate/components/components/hetero_secureboost.py index 95361fc631..e7f88e3d78 100644 --- a/python/fate/components/components/hetero_sbt.py +++ b/python/fate/components/components/hetero_secureboost.py @@ -16,7 +16,6 @@ import logging from fate.arch import Context -from fate.components.components.utils import consts from fate.components.core import GUEST, HOST, Role, cpn, params from fate.ml.ensemble import HeteroSecureBoostGuest, HeteroSecureBoostHost, BINARY_BCE, MULTI_CE, REGRESSION_L2 from fate.components.components.utils.tools import add_dataset_type @@ -41,6 +40,8 @@ def train( desc="max tree num"), learning_rate: cpn.parameter(type=params.confloat(gt=0), default=0.3, desc='decay factor of each tree'), max_depth: cpn.parameter(type=params.conint(gt=0), default=3, desc='max depth of a tree'), + complete_secure: cpn.parameter(type=params.conint(ge=0), default=0, desc='number of trees to use guest features only in the complete secure mode, ' + '0 means no complete secure'), max_bin: cpn.parameter(type=params.conint(gt=0), default=32, desc='max bin number of feature binning'), objective: cpn.parameter(type=params.string_choice(choice=[BINARY_BCE, MULTI_CE, REGRESSION_L2]), default=BINARY_BCE, \ desc='objective function, available: {}'.format([BINARY_BCE, MULTI_CE, REGRESSION_L2])), @@ -71,12 +72,13 @@ def train( # initialize encrypt kit ctx.cipher.set_phe(ctx.device, he_param.dict()) - booster = HeteroSecureBoostGuest(num_trees=num_trees, max_depth=max_depth, learning_rate=learning_rate, max_bin=max_bin, + booster = HeteroSecureBoostGuest(num_trees=num_trees, max_depth=max_depth, complete_secure=complete_secure, + learning_rate=learning_rate, max_bin=max_bin, l2=l2, min_impurity_split=min_impurity_split, min_sample_split=min_sample_split, min_leaf_node=min_leaf_node, min_child_weight=min_child_weight, objective=objective, num_class=num_class, gh_pack=gh_pack, split_info_pack=split_info_pack, hist_sub=hist_sub ) - if train_model_input is not None: + if train_model_input: booster.from_model(train_model_input) logger.info('sbt input model loaded, will start warmstarting') booster.fit(ctx, train_data, validate_data) @@ -90,7 +92,8 @@ def train( elif role.is_host: - booster = HeteroSecureBoostHost(num_trees=num_trees, max_depth=max_depth, learning_rate=learning_rate, max_bin=max_bin, hist_sub=hist_sub) + booster = HeteroSecureBoostHost(num_trees=num_trees, max_depth=max_depth, complete_secure=complete_secure, + max_bin=max_bin, hist_sub=hist_sub) if train_model_input is not None: booster.from_model(train_model_input) logger.info('sbt input model loaded, will start warmstarting') diff --git a/python/fate/ml/ensemble/algo/secureboost/hetero/_base.py b/python/fate/ml/ensemble/algo/secureboost/hetero/_base.py index e5a1173741..f184435e42 100644 --- a/python/fate/ml/ensemble/algo/secureboost/hetero/_base.py +++ b/python/fate/ml/ensemble/algo/secureboost/hetero/_base.py @@ -35,17 +35,20 @@ def _update_feature_importance(self, fi_dict: Dict[int, FeatureImportance]): else: self._global_feature_importance[fid] = self._global_feature_importance[fid] + fi - def _sum_leaf_weights(self, leaf_pos: DataFrame, trees, learing_rate: float, loss_func): - def _compute_score(leaf_pos: np.array, trees: List[List[Node]], learning_rate: float): - score = 0 - leaf_pos = leaf_pos["sample_pos"] - for node_idx, tree in zip(leaf_pos, trees): + def _sum_leaf_weights(self, leaf_pos: DataFrame, trees, learing_rate: float, loss_func, num_dim=1): + def _compute_score(leaf_pos_: np.array, trees_: List[List[Node]], learning_rate: float, num_dim_=1): + score = np.zeros(num_dim_) + leaf_pos_ = leaf_pos_["sample_pos"] + tree_idx = 0 + for node_idx, tree in zip(leaf_pos_, trees_): recovered_idx = -(node_idx + 1) - score += tree[recovered_idx].weight * learning_rate - return score + score[tree_idx % num_dim_] += tree[recovered_idx].weight * learning_rate + tree_idx += 1 + + return float(score[0]) if num_dim_ == 1 else [score] tree_list = [tree.get_nodes() for tree in trees] - apply_func = functools.partial(_compute_score, trees=tree_list, learning_rate=learing_rate) + apply_func = functools.partial(_compute_score, trees_=tree_list, learning_rate=learing_rate, num_dim_=num_dim) predict_score = leaf_pos.create_frame() predict_score["score"] = leaf_pos.apply_row(apply_func) return loss_func.predict(predict_score) diff --git a/python/fate/ml/ensemble/algo/secureboost/hetero/guest.py b/python/fate/ml/ensemble/algo/secureboost/hetero/guest.py index 60da01117a..348239747e 100644 --- a/python/fate/ml/ensemble/algo/secureboost/hetero/guest.py +++ b/python/fate/ml/ensemble/algo/secureboost/hetero/guest.py @@ -67,15 +67,23 @@ def select_func(s: pd.Series, idx): return target_gh -def _accumulate_scores(acc_scores: DataFrame, new_scores: DataFrame, learning_rate: float, multi_class=False): +def _accumulate_scores(acc_scores: DataFrame, new_scores: DataFrame, learning_rate: float, + multi_class=False, class_num=None, dim=0): + def _extend_score(s: pd.Series, class_num, dim): + new_s = pd.Series() + new_s['score'] = np.zeros(class_num) + new_s['score'][dim] = s['score'] + return new_s + + new_scores = new_scores.loc(acc_scores.get_indexer(target="sample_id"), preserve_order=True) if not multi_class: acc_scores = acc_scores + new_scores * learning_rate else: - pass + extend_scores = new_scores.apply_row(lambda s: _extend_score(s, class_num, dim), columns=["score"]) + acc_scores = acc_scores + extend_scores * learning_rate return acc_scores - class HeteroSecureBoostGuest(HeteroBoostingTree): def __init__( self, @@ -84,7 +92,7 @@ def __init__( complete_secure=0, learning_rate=0.3, objective="binary:bce", - num_class=3, + num_class=1, max_bin=32, l2=0.1, l1=0, @@ -213,6 +221,10 @@ def get_task_info(self): classes = [0, 1] elif task_type == REGRESSION: classes = None + elif task_type == MULTI: + classes = [i for i in range(self.num_class)] + else: + raise RuntimeError(f"unknown task type {task_type}") return task_type, classes def fit(self, ctx: Context, train_data: DataFrame, validate_data: DataFrame = None) -> None: @@ -279,8 +291,10 @@ def fit(self, ctx: Context, train_data: DataFrame, validate_data: DataFrame = No assert len(scores) == len( self._accumulate_scores ), f"tree predict scores length {len(scores)} not equal to accumulate scores length {len(self._accumulate_scores)}." - scores = scores.loc(self._accumulate_scores.get_indexer(target="sample_id"), preserve_order=True) - self._accumulate_scores = self._accumulate_scores + scores * self.learning_rate + self._accumulate_scores = _accumulate_scores( + self._accumulate_scores, scores, self.learning_rate, self.objective == MULTI_CE, class_num=self.num_class, + dim=tree_dim + ) self._trees.append(tree) self._saved_tree.append(tree.get_model()) self._update_feature_importance(tree.get_feature_importance()) @@ -314,7 +328,10 @@ def predict(self, ctx: Context, predict_data: DataFrame, predict_leaf=False, ret leaf_pos = predict_leaf_guest(ctx, self._trees, predict_data) if predict_leaf: return leaf_pos - result = self._sum_leaf_weights(leaf_pos, self._trees, self.learning_rate, self._loss_func) + result = self._sum_leaf_weights(leaf_pos, self._trees, self.learning_rate, self._loss_func, + num_dim=self._tree_dim) + + print('result df is {}'.format(result.as_pd_df())) if task_type == REGRESSION: logger.debug("regression task, add init score") diff --git a/python/fate/ml/ensemble/algo/secureboost/test/test_hetero_sbt_multi.py b/python/fate/ml/ensemble/algo/secureboost/test/test_hetero_sbt_multi.py index 269454bdb1..99f19091b2 100644 --- a/python/fate/ml/ensemble/algo/secureboost/test/test_hetero_sbt_multi.py +++ b/python/fate/ml/ensemble/algo/secureboost/test/test_hetero_sbt_multi.py @@ -41,9 +41,11 @@ def create_ctx(local): if __name__ == "__main__": party = sys.argv[1] - max_depth = 3 + max_depth = 2 num_tree = 1 - from sklearn.metrics import roc_auc_score as auc + + # import acc from sklearn + from sklearn.metrics import accuracy_score as acc if party == "guest": ctx = create_ctx(guest) @@ -52,19 +54,23 @@ def create_ctx(local): reader = PandasReader(sample_id_name="sample_id", match_id_name="id", label_name="y", dtype="float32") data_guest = reader.to_frame(ctx, df) - + print('num tree is {}'.format(num_tree)) trees = HeteroSecureBoostGuest( num_tree, max_depth=max_depth, num_class=4, objective="multi:ce" ) trees.fit(ctx, data_guest) - # pred = trees.get_train_predict().as_pd_df() - # pred['sample_id'] = pred.sample_id.astype(int) + pred = trees.get_train_predict().as_pd_df() + pred['sample_id'] = pred.sample_id.astype(int) + true_label = pred['label'] + pred_label = pred['predict_result'] + acc_ = acc(true_label, pred_label) + print('acc is {}'.format(acc_)) # df = pd.merge(df, pred, on='sample_id') # load tree # tree_dict = pickle.load(open('guest_tree.pkl', 'rb')) # trees.from_model(tree_dict) - # pred_ = trees.predict(ctx, data_guest).as_pd_df() + pred_ = trees.predict(ctx, data_guest).as_pd_df() # print(auc(df.y, df.score)) # print(auc(pred_.label, pred_.predict_score)) # pred_.sample_id = pred_.sample_id.astype(int) @@ -87,7 +93,7 @@ def create_ctx(local): # load tree # tree_dict = pickle.load(open('host_tree.pkl', 'rb')) # trees.from_model(tree_dict) - # trees.predict(ctx, data_host) + trees.predict(ctx, data_host) # fit again # new_tree = HeteroSecureBoostHost(1, max_depth=3) diff --git a/python/fate/ml/ensemble/learner/decision_tree/tree_core/loss.py b/python/fate/ml/ensemble/learner/decision_tree/tree_core/loss.py index 7918dddb93..79c180f66d 100644 --- a/python/fate/ml/ensemble/learner/decision_tree/tree_core/loss.py +++ b/python/fate/ml/ensemble/learner/decision_tree/tree_core/loss.py @@ -105,12 +105,10 @@ def compute_grad(gh: DataFrame, label: DataFrame, score: DataFrame): label_name = label.schema.label_name label = label.loc(score.get_indexer('sample_id'), preserve_order=True) - print('len g {} len label {} len score {}'.format(len(gh), len(label), len(score))) new_label = label.create_frame() new_label[label_name] = label.label stack_df = DataFrame.hstack([score, new_label]) stack_df = stack_df.loc(gh.get_indexer('sample_id'), preserve_order=True) - print('cwj', stack_df.as_pd_df()) def grad(s): grads = [i for i in s["score"]] grads[s[label_name]] -= 1 diff --git a/python/fate/ml/ensemble/learner/decision_tree/tree_core/splitter.py b/python/fate/ml/ensemble/learner/decision_tree/tree_core/splitter.py index 318f47309d..147c35199b 100644 --- a/python/fate/ml/ensemble/learner/decision_tree/tree_core/splitter.py +++ b/python/fate/ml/ensemble/learner/decision_tree/tree_core/splitter.py @@ -116,15 +116,15 @@ def truncate(f, n=TREE_DECIMAL_ROUND): return np.floor(f * 10 ** n) / 10 ** n def node_gain(self, g, h): + g, h = self.truncate(g), self.truncate(h) if isinstance(h, np.ndarray): h[h == 0] = np.nan score = g * g / (h + self.l2) return score def node_weight(self, sum_grad, sum_hess): - weight = -(sum_grad / (sum_hess + self.l2)) - return weight + return self.truncate(weight) def _extract_hist(self, histogram, pack_info=None): tensor_hist: dict = histogram.extract_data() @@ -179,7 +179,6 @@ def _compute_min_leaf_mask(self, l_cnt, r_cnt): def _compute_gains(self, g, h, cnt, g_sum, h_sum, cnt_sum, hist_mask=None): l_g, l_h, l_cnt = g, h, cnt - r_g, r_h = g_sum - l_g, h_sum - l_h r_cnt = cnt_sum - l_cnt @@ -196,10 +195,10 @@ def _compute_gains(self, g, h, cnt, g_sum, h_sum, cnt_sum, hist_mask=None): mask = union_mask_0 mask = torch.logical_or(mask, union_mask_1) rs = self.node_gain(l_g, l_h) + self.node_gain(r_g, r_h) - self.node_gain(g_sum, h_sum) + rs = self.truncate(rs) rs[torch.isnan(rs)] = float("-inf") rs[rs < self.min_impurity_split] = float("-inf") rs[mask] = float("-inf") - return rs def _find_best_splits( diff --git a/python/fate/ml/utils/predict_tools.py b/python/fate/ml/utils/predict_tools.py index f73cf48ae6..7b38869a47 100644 --- a/python/fate/ml/utils/predict_tools.py +++ b/python/fate/ml/utils/predict_tools.py @@ -87,7 +87,6 @@ def compute_predict_details(dataframe: DataFrame, task_type: Literal['binary', ' enable_type_align_checking=False) elif task_type == MULTI: - def handle_multi(v: pd.Series): predict_result = np.argmax(v[PREDICT_SCORE]) assert len(v[PREDICT_SCORE]) == len(classes), 'predict score length is not equal to classes length,\