From 21a7d82d2ae0aff9a7eb9138b27ab9c52ff9ade6 Mon Sep 17 00:00:00 2001
From: Yu Wu
Date: Fri, 8 Sep 2023 13:01:28 +0800
Subject: [PATCH] fix federated metrics computation with categorical features(#4660)

disable cumsum for event count(#4660)
update he param

Signed-off-by: Yu Wu
---
 .../lr/pipeline-lr-binary.py                  |  7 +++---
 .../fate/components/core/params/_he_param.py  |  2 +-
 .../feature_binning/hetero_feature_binning.py | 24 +++++++++----------
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/examples/benchmark_quality/lr/pipeline-lr-binary.py b/examples/benchmark_quality/lr/pipeline-lr-binary.py
index 7446a760bd..6afb78b25c 100644
--- a/examples/benchmark_quality/lr/pipeline-lr-binary.py
+++ b/examples/benchmark_quality/lr/pipeline-lr-binary.py
@@ -16,12 +16,13 @@
 
 import argparse
 
+from fate_test.utils import parse_summary_result
+
 from fate_client.pipeline import FateFlowPipeline
 from fate_client.pipeline.components.fate import CoordinatedLR, PSI
 from fate_client.pipeline.components.fate import Evaluation
 from fate_client.pipeline.interface import DataWarehouseChannel
 from fate_client.pipeline.utils import test_utils
-from fate_test.utils import extract_data, parse_summary_result
 
 
 def main(config="../../config.yaml", param="./breast_config.yaml", namespace=""):
@@ -88,14 +89,14 @@ def main(config="../../config.yaml", param="./breast_config.yaml", namespace="")
     pipeline.compile()
     pipeline.fit()
 
-    lr_0_data = pipeline.get_task_info("lr_0").get_output_data()["train_output_data"]
+    """lr_0_data = pipeline.get_task_info("lr_0").get_output_data()["train_output_data"]
     lr_1_data = pipeline.get_task_info("lr_1").get_output_data()["test_output_data"]
     lr_0_score = extract_data(lr_0_data, "predict_result")
     lr_0_label = extract_data(lr_0_data, "y")
     lr_1_score = extract_data(lr_1_data, "predict_result")
     lr_1_label = extract_data(lr_1_data, "y")
     lr_0_score_label = extract_data(lr_0_data, "predict_result", keep_id=True)
-    lr_1_score_label = extract_data(lr_1_data, "predict_result", keep_id=True)
+    lr_1_score_label = extract_data(lr_1_data, "predict_result", keep_id=True)"""
 
     result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_output_metric()[0]["data"])
     print(f"result_summary: {result_summary}")
diff --git a/python/fate/components/core/params/_he_param.py b/python/fate/components/core/params/_he_param.py
index b9b116b8c5..d7abd1cebb 100644
--- a/python/fate/components/core/params/_he_param.py
+++ b/python/fate/components/core/params/_he_param.py
@@ -20,7 +20,7 @@
 
 
 class HEParam(pydantic.BaseModel):
-    kind: string_choice(["paillier"])
+    kind: string_choice(["paillier", "ou", "mock"])
     key_length: int = 1024
diff --git a/python/fate/ml/feature_binning/hetero_feature_binning.py b/python/fate/ml/feature_binning/hetero_feature_binning.py
index 516459f9c3..e8a069012e 100644
--- a/python/fate/ml/feature_binning/hetero_feature_binning.py
+++ b/python/fate/ml/feature_binning/hetero_feature_binning.py
@@ -182,12 +182,12 @@ def compute_federated_metrics(self, ctx: Context, binned_data):
         coder = ctx.guest.get("coder")
         columns = binned_data.schema.columns.to_list()
         # logger.info(f"self.bin_col: {self.bin_col}")
-        anonymous_col_bin = [binned_data.schema.anonymous_columns[columns.index(col)] for col in self.bin_col]
+        to_compute_col = self.bin_col + self.category_col
+        anonymous_col_bin = [binned_data.schema.anonymous_columns[columns.index(col)] for col in to_compute_col]
         ctx.guest.put("anonymous_col_bin", anonymous_col_bin)
         encrypt_y = ctx.guest.get("enc_y")
         # event count:
-        to_compute_col = self.bin_col + self.category_col
         feature_bin_sizes = [self._bin_obj._bin_count_dict[col] for col in self.bin_col]
         if self.category_col:
             for col in self.category_col:
@@ -200,19 +200,20 @@ def compute_federated_metrics(self, ctx: Context, binned_data):
         hist_targets = binned_data.create_frame()
         hist_targets["event_count"] = encrypt_y
         hist_targets["non_event_count"] = 1
-        hist_schema = {"event_count": {"type": "paillier",
+        hist_schema = {"event_count": {"type": "ciphertext",
                                        "stride": 1,
                                        "pk": pk,
                                        "evaluator": evaluator,
                                        "coder": coder
                                        },
-                       "non_event_count": {"type": "tensor",
+                       "non_event_count": {"type": "plaintext",
                                            "stride": 1,
                                            "dtype": torch.int32}
                        }
         hist = HistogramBuilder(num_node=1,
                                 feature_bin_sizes=feature_bin_sizes,
-                                value_schemas=hist_schema)
+                                value_schemas=hist_schema,
+                                enable_cumsum=False)
         event_non_event_count_hist = to_compute_data.distributed_hist_stat(histogram_builder=hist,
                                                                            targets=hist_targets)
         event_non_event_count_hist.i_sub_on_key("non_event_count", "event_count")
@@ -293,7 +294,7 @@ def fit(self, ctx: Context, train_data, validate_data=None, skip_none=False):
 
         if self.method == "quantile":
             q = list(np.arange(0, 1, 1 / self.n_bins)) + [1.0]
-            split_pt_df = select_data.quantile(q=q, relative_error=self.relative_error)
+            split_pt_df = select_data.quantile(q=q, relative_error=self.relative_error).drop(0)
         elif self.method == "bucket":
             split_pt_df = select_data.qcut(q=self.n_bins)
         elif self.method == "manual":
@@ -311,7 +312,6 @@ def __get_col_bin_count(col):
         self._bin_count_dict = bin_count.to_dict()
 
     def bucketize_data(self, train_data):
-        # logger.debug(f"split pt dict: {self._split_pt_dict}")
         binned_df = train_data.bucketize(boundaries=self._split_pt_dict)
         return binned_df
 
@@ -328,11 +328,9 @@ def compute_all_col_metrics(self, event_non_event_count_hist, columns):
             col_event_count = pd.Series(
                 {bin_num: int(bin_count.data) for bin_num, bin_count in event_count_dict[col_name].items()}
             )
-            col_event_count = col_event_count - col_event_count.shift(1).fillna(0)
             col_non_event_count = pd.Series(
                 {bin_num: int(bin_count.data) for bin_num, bin_count in non_event_count_dict[col_name].items()}
            )
-            col_non_event_count = col_non_event_count - col_non_event_count.shift(1).fillna(0)
             if total_event_count is None:
                 total_event_count = col_event_count.sum() or 1
                 total_non_event_count = col_non_event_count.sum() or 1
@@ -377,17 +375,17 @@ def compute_metrics(self, binned_data):
         hist_targets = binned_data.create_frame()
         hist_targets["event_count"] = binned_data.label
         hist_targets["non_event_count"] = 1
-        hist_schema = {"event_count": {"type": "tensor",
+        hist_schema = {"event_count": {"type": "plaintext",
                                        "stride": 1,
                                        "dtype": torch.int32},
-                       "non_event_count": {"type": "tensor",
+                       "non_event_count": {"type": "plaintext",
                                            "stride": 1,
                                            "dtype": torch.int32}
                        }
         hist = HistogramBuilder(num_node=1,
                                 feature_bin_sizes=feature_bin_sizes,
-                                value_schemas=hist_schema)
-        df = to_compute_data.as_pd_df()
+                                value_schemas=hist_schema,
+                                enable_cumsum=False)
         event_non_event_count_hist = to_compute_data.distributed_hist_stat(histogram_builder=hist,
                                                                            targets=hist_targets)
         event_non_event_count_hist.i_sub_on_key("non_event_count", "event_count")
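
With to_compute_col = self.bin_col + self.category_col, the guest now requests anonymous column names for categorical columns as well as the continuous ones, so categorical features are included in the federated metric computation. A rough standalone illustration of that lookup, using plain Python lists and made-up column names (the real values come from binned_data.schema):

    # hypothetical column names; in FATE they come from binned_data.schema
    columns = ["x0", "x1", "x2"]
    anonymous_columns = ["host_0_x0", "host_0_x1", "host_0_x2"]
    bin_col, category_col = ["x0", "x1"], ["x2"]

    # same lookup as in compute_federated_metrics, now over both column groups
    to_compute_col = bin_col + category_col
    anonymous_col_bin = [anonymous_columns[columns.index(col)] for col in to_compute_col]
    assert anonymous_col_bin == ["host_0_x0", "host_0_x1", "host_0_x2"]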
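
Passing enable_cumsum=False makes the histogram report per-bin event/non-event counts directly, which is why the .shift(1) de-cumulation in compute_all_col_metrics is dropped. A minimal pandas sketch (not FATE code) of the equivalence between the old and new paths:

    import pandas as pd

    # per-bin event counts, as produced with enable_cumsum=False
    per_bin = pd.Series({0: 5, 1: 3, 2: 7})
    # cumulative counts, as the histogram produced before this patch
    cumulative = per_bin.cumsum()

    # the removed de-cumulation step recovers the same per-bin counts
    recovered = cumulative - cumulative.shift(1).fillna(0)
    assert (recovered == per_bin).all()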
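
The HEParam change widens the accepted kind values from "paillier" only to "paillier", "ou", and "mock". A loose sketch of the kind of check string_choice enforces, written as a plain helper rather than the actual FATE validator:

    def check_he_kind(kind: str, choices=("paillier", "ou", "mock")) -> str:
        # hypothetical stand-in for string_choice: reject values outside the allowed set
        if kind not in choices:
            raise ValueError(f"kind must be one of {choices}, got {kind!r}")
        return kind

    check_he_kind("ou")      # accepted after this patch
    check_he_kind("mock")    # accepted after this patch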