Commit
fix median & allow quantile in statistics (#4663)
edit lr bq examples (#5008)

Signed-off-by: Yu Wu <yolandawu131@gmail.com>
nemirorox committed Aug 14, 2023
1 parent b643c22 commit 8afbae4
Showing 12 changed files with 75 additions and 37 deletions.
8 changes: 4 additions & 4 deletions examples/benchmark_quality/lr/default_credit_config.yaml
@@ -2,7 +2,7 @@ data_guest: "default_credit_hetero_guest"
 data_host: "default_credit_hetero_host"
 idx: "id"
 label_name: "y"
-epochs: 30
+epochs: 16
 init_param:
   fit_intercept: True
   method: "zeros"
@@ -15,8 +15,8 @@ learning_rate_scheduler:
 optimizer:
   method: "rmsprop"
   penalty: "L2"
-  alpha: 0.001
+  alpha: 0.01
   optimizer_params:
-    lr: 0.17
-batch_size: 3200
+    lr: 0.22
+batch_size: 2000
 early_stop: "diff"
@@ -7,5 +7,5 @@ fit_intercept: True
 method: "rmsprop"
 penalty: "L2"
 eta0: 0.1
-alpha: 0.5
+alpha: 0.05
 batch_size: 5000
2 changes: 1 addition & 1 deletion examples/benchmark_quality/lr/give_credit_config.yaml
@@ -17,5 +17,5 @@ optimizer:
   alpha: 0.01
   optimizer_params:
     lr: 0.25
-batch_size: 5500
+batch_size: null
 early_stop: "diff"
36 changes: 18 additions & 18 deletions examples/benchmark_quality/lr/lr_benchmark.yaml
@@ -206,21 +206,21 @@ hetero_lr-binary-1-default-credit:
 #    conf: "./epsilon_5k_config.yaml"
 #  compare_setting:
 #    relative_tol: 0.01
-hetero_lr-binary-3-give-credit:
-  local:
-    script: "./sklearn-lr-binary.py"
-    conf: "./give_credit_lr_sklearn_config.yaml"
-  FATE-hetero-lr:
-    script: "./pipeline-lr-binary.py"
-    conf: "./give_credit_config.yaml"
-  compare_setting:
-    relative_tol: 0.01
-multi-vehicle:
-  local:
-    script: "./sklearn-lr-multi.py"
-    conf: "./vehicle_lr_sklearn_config.yaml"
-  FATE-hetero-lr:
-    script: "./pipeline-lr-multi.py"
-    conf: "./vehicle_config.yaml"
-  compare_setting:
-    relative_tol: 0.01
+#hetero_lr-binary-3-give-credit:
+#  local:
+#    script: "./sklearn-lr-binary.py"
+#    conf: "./give_credit_lr_sklearn_config.yaml"
+#  FATE-hetero-lr:
+#    script: "./pipeline-lr-binary.py"
+#    conf: "./give_credit_config.yaml"
+#  compare_setting:
+#    relative_tol: 0.01
+#multi-vehicle:
+#  local:
+#    script: "./sklearn-lr-multi.py"
+#    conf: "./vehicle_lr_sklearn_config.yaml"
+#  FATE-hetero-lr:
+#    script: "./pipeline-lr-multi.py"
+#    conf: "./vehicle_config.yaml"
+#  compare_setting:
+#    relative_tol: 0.01
1 change: 0 additions & 1 deletion examples/benchmark_quality/lr/pipeline-lr-binary.py
@@ -87,7 +87,6 @@ def main(config="../../config.yaml", param="./breast_config.yaml", namespace="")
     if config.timeout:
         pipeline.conf.set("timeout", config.timeout)
     pipeline.compile()
-    print(pipeline.get_dag())
     pipeline.fit()

     lr_0_data = pipeline.get_task_info("lr_0").get_output_data()["train_output_data"]
1 change: 0 additions & 1 deletion examples/benchmark_quality/lr/pipeline-lr-multi.py
@@ -85,7 +85,6 @@ def main(config="../../config.yaml", param="./vehicle_config.yaml", namespace=""
         pipeline.conf.set("timeout", config.timeout)

     pipeline.compile()
-    print(pipeline.get_dag())
     pipeline.fit()

     lr_0_data = pipeline.get_component("lr_0").get_output_data()["train_output_data"]
2 changes: 1 addition & 1 deletion examples/benchmark_quality/lr/sklearn-lr-binary.py
@@ -76,7 +76,7 @@ def main(config="../../config.yaml", param="./breast_lr_sklearn_config.yaml"):
     fpr, tpr, thresholds = roc_curve(y_test, y_prob)

     ks = max(tpr - fpr)
-    result = {"auc": auc_score, "recall": recall, "binary_precision": pr, "accuracy": acc}
+    result = {"auc": auc_score, "recall": recall, "precision": pr, "accuracy": acc}
     print(result)
     print(f"coef_: {lm_fit.coef_}, intercept_: {lm_fit.intercept_}, n_iter: {lm_fit.n_iter_}")
     return {}, result
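For reference, a runnable miniature of the block above with the renamed key (toy labels and scores invented for illustration; the sklearn calls are the ones the script already uses). The rename from "binary_precision" to "precision" presumably keeps the local result keys aligned with the FATE side of the benchmark comparison.

from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             roc_auc_score, roc_curve)

# toy data, for illustration only
y_test = [0, 0, 1, 1, 1, 0]
y_prob = [0.1, 0.4, 0.8, 0.7, 0.6, 0.3]
y_pred = [int(p >= 0.5) for p in y_prob]

auc_score = roc_auc_score(y_test, y_prob)
recall = recall_score(y_test, y_pred)
pr = precision_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
ks = max(tpr - fpr)

# "binary_precision" is now plain "precision"
result = {"auc": auc_score, "recall": recall, "precision": pr, "accuracy": acc}
print(result, ks)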
4 changes: 2 additions & 2 deletions examples/pipeline/statistics/test_statistics.py
@@ -40,15 +40,15 @@ def main(config="../../config.yaml", namespace=""):
                                       namespace=f"experiment{namespace}"))

     statistics_0 = Statistics("statistics_0", input_data=psi_0.outputs["output_data"],
-                              metrics=["mean", "std", "min", "max"])
+                              metrics=["mean", "std", "min", "max", "25%", "median", "75%"])

     pipeline.add_task(psi_0)
     pipeline.add_task(statistics_0)

     # pipeline.add_task(hetero_feature_binning_0)
     pipeline.compile()
-    print(pipeline.get_dag())
     pipeline.fit()
+    # print(f"statistics_0 output model: {pipeline.get_task_info('statistics_0').get_output_model()}")


 if __name__ == "__main__":
7 changes: 5 additions & 2 deletions python/fate/components/components/statistics.py
@@ -25,7 +25,8 @@ def statistics(
     role: Role,
     input_data: cpn.dataframe_input(roles=[GUEST, HOST]),
     metrics: cpn.parameter(
-        type=Union[List[params.statistic_metrics_param()], params.statistic_metrics_param()],
+        type=Union[List[Union[params.statistic_metrics_param(), params.legal_percentile()]],
+                   params.statistic_metrics_param(), params.legal_percentile()],
         default=["mean", "std", "min", "max"],
         desc="metrics to be computed, default ['count', 'mean', 'std', 'min', 'max']",
     ),
@@ -37,6 +38,8 @@
         default=True,
         desc="If False, the calculations of skewness and kurtosis are corrected for statistical bias.",
     ),
+    relative_error: cpn.parameter(type=params.confloat(gt=0, le=1), default=1e-3,
+                                  desc="float, error rate for quantile"),
     skip_col: cpn.parameter(
         type=List[str],
         default=None,
@@ -60,7 +63,7 @@ def statistics(
     for metric in metrics:
         if metric == "describe":
             raise ValueError(f"'describe' should not be combined with additional metric names.")
-    stat_computer = FeatureStatistics(list(set(metrics)), ddof, bias)
+    stat_computer = FeatureStatistics(list(set(metrics)), ddof, bias, relative_error)
     input_data = input_data[select_cols]
     stat_computer.fit(sub_ctx, input_data)
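With the widened metrics type and the new relative_error parameter, a task definition can mix named statistics with percentile strings. A minimal sketch, assuming the fate_client import path below and the psi_0 wiring from the test script above; that relative_error is settable from the pipeline layer follows from the cpn.parameter declaration but is not shown in this commit:

# sketch only: psi_0 is assumed to be defined as in test_statistics.py above
from fate_client.pipeline.components.fate import Statistics  # import path assumed

statistics_0 = Statistics(
    "statistics_0",
    input_data=psi_0.outputs["output_data"],
    metrics=["mean", "std", "min", "max", "25%", "median", "75%"],
    relative_error=1e-3,  # default; bounds the rank error of the quantile sketch
)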
2 changes: 1 addition & 1 deletion python/fate/components/core/params/__init__.py
@@ -27,6 +27,6 @@
 )
 from ._init_param import InitParam, init_param
 from ._learning_rate import LRSchedulerParam, lr_scheduler_param
-from ._metrics import metrics_param, statistic_metrics_param
+from ._metrics import metrics_param, statistic_metrics_param, legal_percentile
 from ._optimizer import OptimizerParam, optimizer_param
 from ._penalty import penalty_param
24 changes: 23 additions & 1 deletion python/fate/components/core/params/_metrics.py
@@ -13,9 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import re
 from typing import Type

-from ._fields import StringChoice
+from ._fields import StringChoice, Parameter


 class Metrics(StringChoice):
@@ -68,3 +69,24 @@ def metrics_param(auc=True, ks=True, accuracy=True, mse=True) -> Type[str]:
         choice={k for k, v in choice.items() if v},
     )
     return type("Metrics", (Metrics,), namespace)
+
+
+class LegalPercentile(str, Parameter):
+    legal_percentile = r"^(100|\d{1,2})%$"
+
+    @classmethod
+    def __get_validators__(cls):
+        yield cls.percentile_validator
+
+    @classmethod
+    def percentile_validator(cls, v):
+        if re.match(cls.legal_percentile, v):
+            return v
+        raise ValueError(f"provided `{v}` not in legal percentile format")
+
+
+def legal_percentile() -> Type[str]:
+    namespace = dict(
+        legal_percentile=LegalPercentile.legal_percentile,
+    )
+    return type("LegalPercentile", (LegalPercentile,), namespace)
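A quick standalone check of what the validator accepts, using only the regex (the Parameter/pydantic plumbing is FATE-internal):

import re

legal_percentile = r"^(100|\d{1,2})%$"  # same pattern as LegalPercentile above

for v in ["0%", "25%", "50%", "100%", "101%", "2.5%", "median"]:
    print(v, "->", "ok" if re.match(legal_percentile, v) else "rejected")
# 0%, 25%, 50%, 100% pass; 101%, 2.5%, and plain metric names are rejected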
23 changes: 19 additions & 4 deletions python/fate/ml/statistics/statistics.py
@@ -14,6 +14,7 @@
 # limitations under the License.

 import logging
+import re
 from typing import List

 import pandas as pd
@@ -25,9 +26,9 @@


 class FeatureStatistics(Module):
-    def __init__(self, metrics: List[str] = None, ddof=1, bias=True):
+    def __init__(self, metrics: List[str] = None, ddof=1, bias=True, relative_error=1e-3):
         self.metrics = metrics
-        self.summary = StatisticsSummary(ddof, bias)
+        self.summary = StatisticsSummary(ddof, bias, relative_error)

     def fit(self, ctx: Context, input_data, validate_data=None) -> None:
         self.summary.compute_metrics(input_data, self.metrics)
@@ -49,28 +50,39 @@ def from_model(cls, model) -> "FeatureStatistics":


 class StatisticsSummary(Module):
-    def __init__(self, ddof=1, bias=True):
+    def __init__(self, ddof=1, bias=True, relative_error=1e-3):
         """if metrics is not None:
             if len(metrics) == 1 and metrics[0] == "describe":
                 self.inner_metric_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
             else:
                 self.inner_metric_names = metrics"""
         self.ddof = ddof
         self.bias = bias
+        self.relative_error = relative_error
         self.inner_metric_names = []
         self.metrics_summary = None
         self._count = None
         self._nan_count = None
         self._mean = None
         self._describe = None
+        self._quantile = None
+        self._q_pts = None

     def get_from_describe(self, data, metric):
         if self._describe is None:
             self._describe = data.describe(ddof=self.ddof, unbiased=~self.bias)
         return self._describe[metric]

+    def get_from_quantile_summary(self, data, metric):
+        query_q = int(metric[:-1]) / 100
+        if self._quantile is None:
+            self._quantile = data.quantile(q=self._q_pts, relative_error=self.relative_error)
+        return self._quantile.loc[query_q]
+
     def compute_metrics(self, data, metrics):
         res = pd.DataFrame(columns=data.schema.columns)
+        q_metrics = [metric for metric in metrics if re.match(r"^(100|\d{1,2})%$", metric)]
+        self._q_pts = [int(metric[:-1]) / 100 for metric in q_metrics]
         for metric in metrics:
             metric_val = None
             """if metric == "describe":
@@ -80,12 +92,15 @@ def compute_metrics(self, data, metrics):
                 return"""
             if metric in ["sum", "min", "max", "mean", "std", "var"]:
                 metric_val = self.get_from_describe(data, metric)
+            if metric in q_metrics:
+                metric_val = self.get_from_quantile_summary(data, metric)
             elif metric == "count":
                 if self._count is None:
                     self._count = data.count()
                 metric_val = self._count
             elif metric == "median":
-                metric_val = data.median()
+                metric_val = data.quantile(q=0.5, relative_error=self.relative_error)
+                metric_val = metric_val.loc[0.5]
             elif metric == "coefficient_of_variation":
                 metric_val = self.get_from_describe(data, "variation")
             elif metric == "missing_count":
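The median fix and the new percentile metrics share one access pattern: compute quantiles at the requested fractions in a single pass, then index the result by fraction. A minimal pandas sketch of that pattern (toy data; FATE's distributed DataFrame adds the relative_error argument because its quantile comes from an approximate sketch, whereas pandas computes exactly):

import pandas as pd

df = pd.DataFrame({"x1": range(1, 101), "x2": range(100, 0, -1)})  # toy columns

# "25%", "median", "75%" all reduce to quantile points computed in one pass
q_pts = [0.25, 0.5, 0.75]
quantiles = df.quantile(q=q_pts)   # DataFrame indexed by the quantile fraction

median = quantiles.loc[0.5]        # same .loc[0.5] lookup as the fixed "median" branch
print(quantiles)
print(median)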
