Skip to content

Commit

Permalink
Add support for multi-classification task
Browse files Browse the repository at this point in the history
Update components
Signed-off-by: weijingchen <talkingwallace@sohu.com>

Signed-off-by: cwj <talkingwallace@sohu.com>
  • Loading branch information
talkingwallace committed Oct 10, 2023
1 parent f47cca1 commit ffecd40
Show file tree
Hide file tree
Showing 9 changed files with 60 additions and 35 deletions.
2 changes: 1 addition & 1 deletion python/fate/components/components/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def homo_lr(self):

@_lazy_cpn
def hetero_sbt(self):
    """Import and return the hetero SecureBoost component on demand.

    The import lives inside the function body, so the component module is
    only loaded when this accessor is actually called.
    """
    # The component is provided by the `hetero_secureboost` module; the older
    # `.hetero_sbt` import path is no longer used.
    from .hetero_secureboost import hetero_sbt

    return hetero_sbt

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import logging

from fate.arch import Context
from fate.components.components.utils import consts
from fate.components.core import GUEST, HOST, Role, cpn, params
from fate.ml.ensemble import HeteroSecureBoostGuest, HeteroSecureBoostHost, BINARY_BCE, MULTI_CE, REGRESSION_L2
from fate.components.components.utils.tools import add_dataset_type
Expand All @@ -41,6 +40,8 @@ def train(
desc="max tree num"),
learning_rate: cpn.parameter(type=params.confloat(gt=0), default=0.3, desc='decay factor of each tree'),
max_depth: cpn.parameter(type=params.conint(gt=0), default=3, desc='max depth of a tree'),
complete_secure: cpn.parameter(type=params.conint(ge=0), default=0, desc='number of trees to use guest features only in the complete secure mode, '
'0 means no complete secure'),
max_bin: cpn.parameter(type=params.conint(gt=0), default=32, desc='max bin number of feature binning'),
objective: cpn.parameter(type=params.string_choice(choice=[BINARY_BCE, MULTI_CE, REGRESSION_L2]), default=BINARY_BCE, \
desc='objective function, available: {}'.format([BINARY_BCE, MULTI_CE, REGRESSION_L2])),
Expand Down Expand Up @@ -71,12 +72,13 @@ def train(
# initialize encrypt kit
ctx.cipher.set_phe(ctx.device, he_param.dict())

booster = HeteroSecureBoostGuest(num_trees=num_trees, max_depth=max_depth, learning_rate=learning_rate, max_bin=max_bin,
booster = HeteroSecureBoostGuest(num_trees=num_trees, max_depth=max_depth, complete_secure=complete_secure,
learning_rate=learning_rate, max_bin=max_bin,
l2=l2, min_impurity_split=min_impurity_split, min_sample_split=min_sample_split,
min_leaf_node=min_leaf_node, min_child_weight=min_child_weight, objective=objective, num_class=num_class,
gh_pack=gh_pack, split_info_pack=split_info_pack, hist_sub=hist_sub
)
if train_model_input is not None:
if train_model_input:
booster.from_model(train_model_input)
logger.info('sbt input model loaded, will start warmstarting')
booster.fit(ctx, train_data, validate_data)
Expand All @@ -90,7 +92,8 @@ def train(

elif role.is_host:

booster = HeteroSecureBoostHost(num_trees=num_trees, max_depth=max_depth, learning_rate=learning_rate, max_bin=max_bin, hist_sub=hist_sub)
booster = HeteroSecureBoostHost(num_trees=num_trees, max_depth=max_depth, complete_secure=complete_secure,
max_bin=max_bin, hist_sub=hist_sub)
if train_model_input is not None:
booster.from_model(train_model_input)
logger.info('sbt input model loaded, will start warmstarting')
Expand Down
19 changes: 11 additions & 8 deletions python/fate/ml/ensemble/algo/secureboost/hetero/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,20 @@ def _update_feature_importance(self, fi_dict: Dict[int, FeatureImportance]):
else:
self._global_feature_importance[fid] = self._global_feature_importance[fid] + fi

def _sum_leaf_weights(self, leaf_pos: DataFrame, trees, learing_rate: float, loss_func, num_dim=1):
    """Accumulate scaled leaf weights per sample and convert them to predictions.

    Args:
        leaf_pos: frame whose rows expose a "sample_pos" entry that is
            iterated in lockstep with ``trees`` (one position per tree).
        trees: trees providing ``get_nodes()``; the returned node lists are
            indexed with the recovered leaf index.
        learing_rate: factor applied to every leaf weight. The misspelled
            parameter name is kept for backward compatibility.
        loss_func: object whose ``predict`` turns the raw score frame into
            the final result.
        num_dim: number of score dimensions; tree ``i`` contributes to
            dimension ``i % num_dim``.

    Returns:
        The value returned by ``loss_func.predict`` for the score frame.
    """

    # NOTE(review): `leaf_pos_` is indexed by the "sample_pos" key, so it is
    # presumably a row object (e.g. pd.Series) rather than an ndarray - confirm.
    def _compute_score(leaf_pos_: np.array, trees_: List[List[Node]], learning_rate: float, num_dim_=1):
        score = np.zeros(num_dim_)
        for tree_idx, (node_idx, tree) in enumerate(zip(leaf_pos_["sample_pos"], trees_)):
            # `-(node_idx + 1)` maps the stored position back to an index
            # into the tree's node list.
            recovered_idx = -(node_idx + 1)
            score[tree_idx % num_dim_] += tree[recovered_idx].weight * learning_rate

        # One-dimensional scores collapse to a plain float; otherwise the
        # score vector is wrapped in a single-element list.
        return float(score[0]) if num_dim_ == 1 else [score]

    tree_list = [tree.get_nodes() for tree in trees]
    apply_func = functools.partial(_compute_score, trees_=tree_list, learning_rate=learing_rate, num_dim_=num_dim)
    predict_score = leaf_pos.create_frame()
    predict_score["score"] = leaf_pos.apply_row(apply_func)
    return loss_func.predict(predict_score)
Expand Down
31 changes: 24 additions & 7 deletions python/fate/ml/ensemble/algo/secureboost/hetero/guest.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,23 @@ def select_func(s: pd.Series, idx):
return target_gh


def _accumulate_scores(acc_scores: DataFrame, new_scores: DataFrame, learning_rate: float,
                       multi_class=False, class_num=None, dim=0):
    """Add the learning-rate-scaled scores of a new tree to the running scores.

    Args:
        acc_scores: running score frame; its "sample_id" indexer defines the
            row order that ``new_scores`` is aligned to.
        new_scores: scores produced by the newly built tree.
        learning_rate: factor applied to the new scores before adding.
        multi_class: when True, each new score is first expanded into a
            ``class_num``-long vector that is zero everywhere except ``dim``.
        class_num: length of the expanded vector; required when
            ``multi_class`` is True.
        dim: position inside the expanded vector that receives the score.

    Returns:
        ``acc_scores`` plus the scaled (and, for multi-class, expanded) scores.

    Raises:
        ValueError: if ``multi_class`` is True but ``class_num`` is None.
    """

    def _extend_score(s: pd.Series, class_num, dim):
        # Build the vector first, then wrap it, instead of assigning into an
        # empty dtype-less Series through chained indexing.
        extended = np.zeros(class_num)
        extended[dim] = s['score']
        return pd.Series([extended], index=['score'])

    if multi_class and class_num is None:
        # Fail early with a clear message rather than inside np.zeros(None).
        raise ValueError("class_num must be provided when multi_class is True")

    new_scores = new_scores.loc(acc_scores.get_indexer(target="sample_id"), preserve_order=True)
    if not multi_class:
        return acc_scores + new_scores * learning_rate

    extend_scores = new_scores.apply_row(lambda s: _extend_score(s, class_num, dim), columns=["score"])
    return acc_scores + extend_scores * learning_rate


class HeteroSecureBoostGuest(HeteroBoostingTree):
def __init__(
self,
Expand All @@ -84,7 +92,7 @@ def __init__(
complete_secure=0,
learning_rate=0.3,
objective="binary:bce",
num_class=3,
num_class=1,
max_bin=32,
l2=0.1,
l1=0,
Expand Down Expand Up @@ -213,6 +221,10 @@ def get_task_info(self):
classes = [0, 1]
elif task_type == REGRESSION:
classes = None
elif task_type == MULTI:
classes = [i for i in range(self.num_class)]
else:
raise RuntimeError(f"unknown task type {task_type}")
return task_type, classes

def fit(self, ctx: Context, train_data: DataFrame, validate_data: DataFrame = None) -> None:
Expand Down Expand Up @@ -279,8 +291,10 @@ def fit(self, ctx: Context, train_data: DataFrame, validate_data: DataFrame = No
assert len(scores) == len(
self._accumulate_scores
), f"tree predict scores length {len(scores)} not equal to accumulate scores length {len(self._accumulate_scores)}."
scores = scores.loc(self._accumulate_scores.get_indexer(target="sample_id"), preserve_order=True)
self._accumulate_scores = self._accumulate_scores + scores * self.learning_rate
self._accumulate_scores = _accumulate_scores(
self._accumulate_scores, scores, self.learning_rate, self.objective == MULTI_CE, class_num=self.num_class,
dim=tree_dim
)
self._trees.append(tree)
self._saved_tree.append(tree.get_model())
self._update_feature_importance(tree.get_feature_importance())
Expand Down Expand Up @@ -314,7 +328,10 @@ def predict(self, ctx: Context, predict_data: DataFrame, predict_leaf=False, ret
leaf_pos = predict_leaf_guest(ctx, self._trees, predict_data)
if predict_leaf:
return leaf_pos
result = self._sum_leaf_weights(leaf_pos, self._trees, self.learning_rate, self._loss_func)
result = self._sum_leaf_weights(leaf_pos, self._trees, self.learning_rate, self._loss_func,
num_dim=self._tree_dim)

print('result df is {}'.format(result.as_pd_df()))

if task_type == REGRESSION:
logger.debug("regression task, add init score")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,11 @@ def create_ctx(local):

if __name__ == "__main__":
party = sys.argv[1]
max_depth = 3
max_depth = 2
num_tree = 1
from sklearn.metrics import roc_auc_score as auc

# import acc from sklearn
from sklearn.metrics import accuracy_score as acc

if party == "guest":
ctx = create_ctx(guest)
Expand All @@ -52,19 +54,23 @@ def create_ctx(local):
reader = PandasReader(sample_id_name="sample_id", match_id_name="id", label_name="y", dtype="float32")

data_guest = reader.to_frame(ctx, df)

print('num tree is {}'.format(num_tree))
trees = HeteroSecureBoostGuest(
num_tree, max_depth=max_depth, num_class=4, objective="multi:ce"
)
trees.fit(ctx, data_guest)
# pred = trees.get_train_predict().as_pd_df()
# pred['sample_id'] = pred.sample_id.astype(int)
pred = trees.get_train_predict().as_pd_df()
pred['sample_id'] = pred.sample_id.astype(int)
true_label = pred['label']
pred_label = pred['predict_result']
acc_ = acc(true_label, pred_label)
print('acc is {}'.format(acc_))
# df = pd.merge(df, pred, on='sample_id')

# load tree
# tree_dict = pickle.load(open('guest_tree.pkl', 'rb'))
# trees.from_model(tree_dict)
# pred_ = trees.predict(ctx, data_guest).as_pd_df()
pred_ = trees.predict(ctx, data_guest).as_pd_df()
# print(auc(df.y, df.score))
# print(auc(pred_.label, pred_.predict_score))
# pred_.sample_id = pred_.sample_id.astype(int)
Expand All @@ -87,7 +93,7 @@ def create_ctx(local):
# load tree
# tree_dict = pickle.load(open('host_tree.pkl', 'rb'))
# trees.from_model(tree_dict)
# trees.predict(ctx, data_host)
trees.predict(ctx, data_host)

# fit again
# new_tree = HeteroSecureBoostHost(1, max_depth=3)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,10 @@ def compute_grad(gh: DataFrame, label: DataFrame, score: DataFrame):

label_name = label.schema.label_name
label = label.loc(score.get_indexer('sample_id'), preserve_order=True)
print('len g {} len label {} len score {}'.format(len(gh), len(label), len(score)))
new_label = label.create_frame()
new_label[label_name] = label.label
stack_df = DataFrame.hstack([score, new_label])
stack_df = stack_df.loc(gh.get_indexer('sample_id'), preserve_order=True)
print('cwj', stack_df.as_pd_df())
def grad(s):
grads = [i for i in s["score"]]
grads[s[label_name]] -= 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,15 +116,15 @@ def truncate(f, n=TREE_DECIMAL_ROUND):
return np.floor(f * 10 ** n) / 10 ** n

# Gain score of a node, computed as g * g / (h + self.l2). When h is a numpy
# array, its zero entries are overwritten in place with NaN before dividing.
# NOTE(review): this text is a flattened diff view, so it is unclear from here
# whether the truncate line below was added or removed by the commit - confirm
# against the repository before relying on it.
def node_gain(self, g, h):
g, h = self.truncate(g), self.truncate(h)
if isinstance(h, np.ndarray):
h[h == 0] = np.nan
score = g * g / (h + self.l2)
return score

def node_weight(self, sum_grad, sum_hess):
    """Return the truncated leaf weight for the given gradient/hessian sums.

    The raw weight is ``-(sum_grad / (sum_hess + self.l2))``; it is passed
    through ``self.truncate`` before being returned.
    """
    weight = -(sum_grad / (sum_hess + self.l2))
    return self.truncate(weight)

def _extract_hist(self, histogram, pack_info=None):
tensor_hist: dict = histogram.extract_data()
Expand Down Expand Up @@ -179,7 +179,6 @@ def _compute_min_leaf_mask(self, l_cnt, r_cnt):

def _compute_gains(self, g, h, cnt, g_sum, h_sum, cnt_sum, hist_mask=None):
l_g, l_h, l_cnt = g, h, cnt

r_g, r_h = g_sum - l_g, h_sum - l_h
r_cnt = cnt_sum - l_cnt

Expand All @@ -196,10 +195,10 @@ def _compute_gains(self, g, h, cnt, g_sum, h_sum, cnt_sum, hist_mask=None):
mask = union_mask_0
mask = torch.logical_or(mask, union_mask_1)
rs = self.node_gain(l_g, l_h) + self.node_gain(r_g, r_h) - self.node_gain(g_sum, h_sum)
rs = self.truncate(rs)
rs[torch.isnan(rs)] = float("-inf")
rs[rs < self.min_impurity_split] = float("-inf")
rs[mask] = float("-inf")

return rs

def _find_best_splits(
Expand Down
1 change: 0 additions & 1 deletion python/fate/ml/utils/predict_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ def compute_predict_details(dataframe: DataFrame, task_type: Literal['binary', '
enable_type_align_checking=False)

elif task_type == MULTI:

def handle_multi(v: pd.Series):
predict_result = np.argmax(v[PREDICT_SCORE])
assert len(v[PREDICT_SCORE]) == len(classes), 'predict score length is not equal to classes length,\
Expand Down

0 comments on commit ffecd40

Please sign in to comment.