From 20b2c9fc6dee26f2b952210ed5f8058279f1e949 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 26 Jul 2023 16:37:24 +0800 Subject: [PATCH 01/30] add fate-test(#5008) Signed-off-by: Yu Wu --- examples/pipeline/coordinated_lr/config.yaml | 10 + .../coordinated_lr_testsuite.yaml | 42 ++ .../pipeline/coordinated_lr/test_lr_sid.py | 89 +++ .../pipeline/coordinated_lr/test_lr_sid_cv.py | 60 ++ .../coordinated_lr/test_lr_sid_warm_start.py | 90 +++ examples/pipeline/test_lr_sid.py | 78 --- examples/pipeline/test_lr_sid_cv.py | 38 -- examples/pipeline/test_lr_sid_warm_start.py | 81 --- .../pipeline/test_single_lr_multi_host.py | 93 +++ examples/pipeline/test_upload_sid.py | 16 +- python/fate_test/__init__.py | 0 python/fate_test/fate_test/__init__.py | 15 + python/fate_test/fate_test/_ascii.py | 48 ++ python/fate_test/fate_test/_client.py | 76 +++ python/fate_test/fate_test/_config.py | 269 ++++++++ python/fate_test/fate_test/_flow_client.py | 376 +++++++++++ python/fate_test/fate_test/_io.py | 70 +++ python/fate_test/fate_test/_parser.py | 587 ++++++++++++++++++ .../fate_test/fate_test/scripts/__init__.py | 15 + .../fate_test/fate_test/scripts/_options.py | 67 ++ python/fate_test/fate_test/scripts/_utils.py | 188 ++++++ .../fate_test/scripts/benchmark_cli.py | 151 +++++ python/fate_test/fate_test/scripts/cli.py | 67 ++ .../fate_test/fate_test/scripts/config_cli.py | 79 +++ .../fate_test/fate_test/scripts/data_cli.py | 435 +++++++++++++ .../fate_test/scripts/generate_mock_data.py | 345 ++++++++++ .../fate_test/scripts/performance_cli.py | 368 +++++++++++ .../fate_test/scripts/quick_test_cli.py | 95 +++ .../fate_test/scripts/testsuite_cli.py | 165 +++++ python/fate_test/fate_test/utils.py | 348 +++++++++++ python/fate_test/pyproject.toml | 44 ++ python/fate_test/setup.py | 40 ++ 32 files changed, 4240 insertions(+), 205 deletions(-) create mode 100644 examples/pipeline/coordinated_lr/config.yaml create mode 100644 examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml 
create mode 100644 examples/pipeline/coordinated_lr/test_lr_sid.py create mode 100644 examples/pipeline/coordinated_lr/test_lr_sid_cv.py create mode 100644 examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py delete mode 100644 examples/pipeline/test_lr_sid.py delete mode 100644 examples/pipeline/test_lr_sid_cv.py delete mode 100644 examples/pipeline/test_lr_sid_warm_start.py create mode 100644 examples/pipeline/test_single_lr_multi_host.py create mode 100644 python/fate_test/__init__.py create mode 100644 python/fate_test/fate_test/__init__.py create mode 100644 python/fate_test/fate_test/_ascii.py create mode 100644 python/fate_test/fate_test/_client.py create mode 100644 python/fate_test/fate_test/_config.py create mode 100644 python/fate_test/fate_test/_flow_client.py create mode 100644 python/fate_test/fate_test/_io.py create mode 100644 python/fate_test/fate_test/_parser.py create mode 100644 python/fate_test/fate_test/scripts/__init__.py create mode 100644 python/fate_test/fate_test/scripts/_options.py create mode 100644 python/fate_test/fate_test/scripts/_utils.py create mode 100644 python/fate_test/fate_test/scripts/benchmark_cli.py create mode 100644 python/fate_test/fate_test/scripts/cli.py create mode 100644 python/fate_test/fate_test/scripts/config_cli.py create mode 100644 python/fate_test/fate_test/scripts/data_cli.py create mode 100644 python/fate_test/fate_test/scripts/generate_mock_data.py create mode 100644 python/fate_test/fate_test/scripts/performance_cli.py create mode 100644 python/fate_test/fate_test/scripts/quick_test_cli.py create mode 100644 python/fate_test/fate_test/scripts/testsuite_cli.py create mode 100644 python/fate_test/fate_test/utils.py create mode 100644 python/fate_test/pyproject.toml create mode 100644 python/fate_test/setup.py diff --git a/examples/pipeline/coordinated_lr/config.yaml b/examples/pipeline/coordinated_lr/config.yaml new file mode 100644 index 0000000000..394a5b7802 --- /dev/null +++ 
b/examples/pipeline/coordinated_lr/config.yaml @@ -0,0 +1,10 @@ +parties: # parties default id + guest: + - 9999 + host: + - 9998 + - 9999 + arbiter: + - 9998 + +data_base_dir: "" # path to project base where data is located \ No newline at end of file diff --git a/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml b/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml new file mode 100644 index 0000000000..2de8a25b4f --- /dev/null +++ b/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml @@ -0,0 +1,42 @@ +data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host_sid + namespace: experiment + role: host_0 +tasks: + normal-lr: + script: test_lr_sid.py + lr-cv: + script: test_lr_sid_cv.py + lr-warm-start: + script: test_lr_sid_warm_start.py diff --git a/examples/pipeline/coordinated_lr/test_lr_sid.py b/examples/pipeline/coordinated_lr/test_lr_sid.py new file mode 100644 index 0000000000..9c7b31fb62 --- /dev/null +++ b/examples/pipeline/coordinated_lr/test_lr_sid.py @@ -0,0 +1,89 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="./config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + + intersect_0 = Intersection("intersect_0", method="raw") + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", + namespace=f"{namespace}experiment")) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host_sid", + namespace=f"{namespace}experiment")) + lr_0 = CoordinatedLR("lr_0", + epochs=4, + batch_size=None, + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, + init_param={"fit_intercept": True, "method": "zeros"}, + train_data=intersect_0.outputs["output_data"], + learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 1.0, + "total_iters": 100}}) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="y", + runtime_roles=["guest"], + default_eval_setting="binary", + input_data=lr_0.outputs["train_output_data"]) + + pipeline.add_task(intersect_0) + pipeline.add_task(lr_0) + + 
pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + pipeline.deploy([intersect_0, lr_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + deployed_pipeline.intersect_0.guest.component_setting( + input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", + namespace=f"{namespace}experiment")) + deployed_pipeline.intersect_0.hosts[0].component_setting( + input_data=DataWarehouseChannel(name="breast_hetero_host_sid", + namespace=f"{namespace}experiment")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + # print(f"predict lr_0 data: {pipeline.get_task_info('lr_0').get_output_data()}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("-config", type=str, default="./config.yaml", + help="config file") + parser.add_argument("-namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_lr/test_lr_sid_cv.py b/examples/pipeline/coordinated_lr/test_lr_sid_cv.py new file mode 100644 index 0000000000..badfed7a39 --- /dev/null +++ b/examples/pipeline/coordinated_lr/test_lr_sid_cv.py @@ -0,0 +1,60 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="./config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + + intersect_0 = Intersection("intersect_0", method="raw") + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", + namespace=f"{namespace}experiment")) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host_sid", + namespace=f"{namespace}experiment")) + lr_0 = CoordinatedLR("lr_0", + epochs=2, + batch_size=100, + optimizer={"method": "sgd", "optimizer_params": {"lr": 0.01}}, + init_param={"fit_intercept": True}, + cv_data=intersect_0.outputs["output_data"], + cv_param={"n_splits": 3}) + + pipeline.add_task(intersect_0) + pipeline.add_task(lr_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("-config", type=str, default="./config.yaml", + help="config file") + parser.add_argument("-namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py b/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py new file mode 100644 index 0000000000..b9bf8401ef --- /dev/null +++ 
b/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py @@ -0,0 +1,90 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="./config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + + intersect_0 = Intersection("intersect_0", method="raw") + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", + namespace=f"{namespace}experiment")) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host_sid", + namespace=f"{namespace}experiment")) + lr_0 = CoordinatedLR("lr_0", + epochs=4, + batch_size=None, + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, + init_param={"fit_intercept": True, "method": "zeros"}, + train_data=intersect_0.outputs["output_data"], + learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 
1.0, + "total_iters": 100}}) + lr_1 = CoordinatedLR("lr_1", train_data=intersect_0.outputs["output_data"], + warm_start_model=lr_0.outputs["output_model"], + epochs=2, + batch_size=None, + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, + ) + + lr_2 = CoordinatedLR("lr_2", epochs=6, + batch_size=None, + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, + init_param={"fit_intercept": True, "method": "zeros"}, + train_data=intersect_0.outputs["output_data"], + learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 1.0, + "total_iters": 100}}) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="y", + runtime_roles=["guest"], + default_eval_setting="binary", + input_data=[lr_1.outputs["train_output_data"], lr_2.outputs["train_output_data"]]) + + pipeline.add_task(intersect_0) + pipeline.add_task(lr_0) + pipeline.add_task(lr_1) + pipeline.add_task(lr_2) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + print(f"lr_1 model: {pipeline.get_task_info('lr_1').get_output_model()}") + # print(f"train lr_1 data: {pipeline.get_task_info('lr_1').get_output_data()}") + + print(f"lr_2 model: {pipeline.get_task_info('lr_2').get_output_model()}") + # print(f"train lr_2 data: {pipeline.get_task_info('lr_2').get_output_data()}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("-config", type=str, default="./config.yaml", + help="config file") + parser.add_argument("-namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/test_lr_sid.py b/examples/pipeline/test_lr_sid.py deleted file mode 100644 index e8569d3b1a..0000000000 --- a/examples/pipeline/test_lr_sid.py +++ /dev/null @@ -1,78 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, Intersection -from fate_client.pipeline.components.fate import Evaluation -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -intersect_0 = Intersection("intersect_0", method="raw") -intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) -lr_0 = CoordinatedLR("lr_0", - epochs=4, - batch_size=None, - optimizer={"method": "rprop", "optimizer_params": {"lr": 0.01}}, - init_param={"fit_intercept": True, "method": "zeros"}, - train_data=intersect_0.outputs["output_data"]) -lr_1 = CoordinatedLR("lr_1", test_data=intersect_0.outputs["output_data"], - input_model=lr_0.outputs["output_model"]) - -"""lr_0.guest.component_setting(train_data=DataWarehouseChannel(name="breast_hetero_guest_sid", - namespace="experiment")) -lr_0.hosts[0].component_setting(train_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace="experiment"))""" - -evaluation_0 = Evaluation("evaluation_0", - label_column_name="y", - runtime_roles=["guest"], - default_eval_setting="binary", - input_data=lr_0.outputs["train_output_data"]) - 
-# pipeline.add_task(feature_scale_0) -# pipeline.add_task(feature_scale_1) -pipeline.add_task(intersect_0) -pipeline.add_task(lr_0) -# pipeline.add_task(evaluation_0) -# pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() -print(f"lr_0 model: {pipeline.get_task_info('lr_0').get_output_model()}") -print(f"train lr_0 data: {pipeline.get_task_info('lr_0').get_output_data()}") - -# print(pipeline.get_task_info("statistics_0").get_output_model()) -# print(f"evaluation metrics: ") -# print(pipeline.get_task_info("evaluation_0").get_output_metric()) - -pipeline.deploy([intersect_0, lr_0]) - -predict_pipeline = FateFlowPipeline() - -deployed_pipeline = pipeline.get_deployed_pipeline() -deployed_pipeline.intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -deployed_pipeline.intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) - -predict_pipeline.add_task(deployed_pipeline) -predict_pipeline.compile() -# print("\n\n\n") -# print(predict_pipeline.compile().get_dag()) -predict_pipeline.predict() -print(f"predict lr_0 data: {pipeline.get_task_info('lr_0').get_output_data()}") diff --git a/examples/pipeline/test_lr_sid_cv.py b/examples/pipeline/test_lr_sid_cv.py deleted file mode 100644 index 2f136a1d60..0000000000 --- a/examples/pipeline/test_lr_sid_cv.py +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, Intersection -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -intersect_0 = Intersection("intersect_0", method="raw") -intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) -lr_0 = CoordinatedLR("lr_0", - epochs=2, - batch_size=100, - optimizer={"method": "sgd", "optimizer_params": {"lr": 0.01}}, - init_param={"fit_intercept": True}, - cv_data=intersect_0.outputs["output_data"], - cv_param={"n_splits": 3}) - -pipeline.add_task(intersect_0) -pipeline.add_task(lr_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() diff --git a/examples/pipeline/test_lr_sid_warm_start.py b/examples/pipeline/test_lr_sid_warm_start.py deleted file mode 100644 index bbd548313d..0000000000 --- a/examples/pipeline/test_lr_sid_warm_start.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, Intersection -from fate_client.pipeline.components.fate import Evaluation -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -intersect_0 = Intersection("intersect_0", method="raw") -intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) -lr_0 = CoordinatedLR("lr_0", - epochs=3, - batch_size=None, - optimizer={"method": "sgd", "optimizer_params": {"lr": 0.01}}, - init_param={"fit_intercept": True, "method": "zeros"}, - train_data=intersect_0.outputs["output_data"]) -lr_1 = CoordinatedLR("lr_1", train_data=intersect_0.outputs["output_data"], - warm_start_model=lr_0.outputs["output_model"], - epochs=2, - batch_size=200) - -"""lr_0.guest.component_setting(train_data=DataWarehouseChannel(name="breast_hetero_guest_sid", - namespace="experiment")) -lr_0.hosts[0].component_setting(train_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace="experiment"))""" - -evaluation_0 = Evaluation("evaluation_0", - runtime_roles=["guest"], - input_data=lr_0.outputs["train_output_data"]) - -# pipeline.add_task(feature_scale_0) -# pipeline.add_task(feature_scale_1) -pipeline.add_task(intersect_0) -pipeline.add_task(lr_0) -pipeline.add_task(lr_1) -# 
pipeline.add_task(evaluation_0) -# pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() -print(f"lr_0 model: {pipeline.get_task_info('lr_0').get_output_model()}") -# print(f"lr_0 data: {pipeline.get_task_info('lr_0').get_output_data()}") -print(f"\nlr_1 model: {pipeline.get_task_info('lr_1').get_output_model()}") - -"""# print(pipeline.get_task_info("statistics_0").get_output_model()) -print(pipeline.get_task_info("lr_0").get_output_model()) -print(pipeline.get_task_info("lr_0").get_output_metrics()) -print(f"evaluation metrics: ") -print(pipeline.get_task_info("evaluation_0").get_output_metrics()) - -pipeline.deploy([intersect_0, lr_0]) - -predict_pipeline = FateFlowPipeline() - -deployed_pipeline = pipeline.get_deployed_pipeline() -deployed_pipeline.intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -deployed_pipeline.intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) - -predict_pipeline.add_task(deployed_pipeline) -predict_pipeline.compile() -# print("\n\n\n") -# print(predict_pipeline.compile().get_dag()) -predict_pipeline.predict()""" diff --git a/examples/pipeline/test_single_lr_multi_host.py b/examples/pipeline/test_single_lr_multi_host.py new file mode 100644 index 0000000000..cd332ad64e --- /dev/null +++ b/examples/pipeline/test_single_lr_multi_host.py @@ -0,0 +1,93 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="./config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host + arbiter = parties.arbiter[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + + intersect_0 = Intersection("intersect_0", method="raw") + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"{namespace}experiment_sid")) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"{namespace}experiment_sid")) + intersect_0.hosts[1].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"{namespace}experiment_sid")) + lr_0 = CoordinatedLR("lr_0", + epochs=4, + batch_size=None, + early_stop="weight_diff", + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, + init_param={"fit_intercept": True, "method": "zeros"}, + train_data=intersect_0.outputs["output_data"], + learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 1.0, + "total_iters": 100}}) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="y", + 
runtime_roles=["guest"], + default_eval_setting="binary", + input_data=lr_0.outputs["train_output_data"]) + + pipeline.add_task(intersect_0) + pipeline.add_task(lr_0) + pipeline.add_task(evaluation_0) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + pipeline.deploy([intersect_0, lr_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + deployed_pipeline.intersect_0.guest.component_setting( + input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"{namespace}experiment_sid")) + deployed_pipeline.intersect_0.hosts[[0, 1]].component_setting( + input_data=DataWarehouseChannel(name="breast_hetero_host_sid", + namespace=f"{namespace}experiment_sid")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + # print(f"predict lr_0 data: {pipeline.get_task_info('lr_0').get_output_data()}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("-config", type=str, default="./config.yaml", + help="config file") + parser.add_argument("-namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/test_upload_sid.py b/examples/pipeline/test_upload_sid.py index 3d32f4757e..dfc82d1b18 100644 --- a/examples/pipeline/test_upload_sid.py +++ b/examples/pipeline/test_upload_sid.py @@ -31,10 +31,10 @@ 'tag_with_value': False, 'weight_type': 'float64'} -pipeline.transform_local_file_to_dataframe( # file="${abs_path_of_data_guest}", - meta=meta, head=True, extend_sid=False, - namespace="experiment", - name="breast_hetero_guest_sid") +pipeline.transform_local_file_to_dataframe("/Users/yuwu/PycharmProjects/FATE/examples/data/breast_hetero_guest_sid.csv", + meta=meta, head=True, extend_sid=False, + 
namespace="experiment_sid", + name="breast_hetero_guest") meta = {'delimiter': ',', 'dtype': 'float64', @@ -47,7 +47,7 @@ 'tag_with_value': False, 'weight_type': 'float64'} -pipeline.transform_local_file_to_dataframe( # file="${abs_path_of_data_guest}", - meta=meta, head=True, extend_sid=False, - namespace="experiment", - name="breast_hetero_host_sid") +pipeline.transform_local_file_to_dataframe("/Users/yuwu/PycharmProjects/FATE/examples/data/breast_hetero_host_sid.csv", + meta=meta, head=True, extend_sid=False, + namespace="experiment_sid", + name="breast_hetero_host") diff --git a/python/fate_test/__init__.py b/python/fate_test/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/fate_test/fate_test/__init__.py b/python/fate_test/fate_test/__init__.py new file mode 100644 index 0000000000..878d3a9c5d --- /dev/null +++ b/python/fate_test/fate_test/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/fate_test/fate_test/_ascii.py b/python/fate_test/fate_test/_ascii.py new file mode 100644 index 0000000000..ac3ba1244f --- /dev/null +++ b/python/fate_test/fate_test/_ascii.py @@ -0,0 +1,48 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +HEAD = """\ + +████████╗███████╗███████╗████████╗███████╗██╗ ██╗██╗████████╗███████╗ +╚══██╔══╝██╔════╝██╔════╝╚══██╔══╝██╔════╝██║ ██║██║╚══██╔══╝██╔════╝ + ██║ █████╗ ███████╗ ██║ ███████╗██║ ██║██║ ██║ █████╗ + ██║ ██╔══╝ ╚════██║ ██║ ╚════██║██║ ██║██║ ██║ ██╔══╝ + ██║ ███████╗███████║ ██║ ███████║╚██████╔╝██║ ██║ ███████╗ + ╚═╝ ╚══════╝╚══════╝ ╚═╝ ╚══════╝ ╚═════╝ ╚═╝ ╚═╝ ╚══════╝ + +""" + +BENCHMARK = """\ + +██████╗ ███████╗███╗ ██╗ ██████╗██╗ ██╗███╗ ███╗ █████╗ ██████╗ ██╗ ██╗ +██╔══██╗██╔════╝████╗ ██║██╔════╝██║ ██║████╗ ████║██╔══██╗██╔══██╗██║ ██╔╝ +██████╔╝█████╗ ██╔██╗ ██║██║ ███████║██╔████╔██║███████║██████╔╝█████╔╝ +██╔══██╗██╔══╝ ██║╚██╗██║██║ ██╔══██║██║╚██╔╝██║██╔══██║██╔══██╗██╔═██╗ +██████╔╝███████╗██║ ╚████║╚██████╗██║ ██║██║ ╚═╝ ██║██║ ██║██║ ██║██║ ██╗ +╚═════╝ ╚══════╝╚═╝ ╚═══╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ +""" + +TAIL = """\ + + ██╗ ██╗ █████╗ ██╗ ██╗███████╗ ███████╗██╗ ██╗███╗ ██╗ + ██║ ██║██╔══██╗██║ ██║██╔════╝ ██╔════╝██║ ██║████╗ ██║ + ███████║███████║██║ ██║█████╗ █████╗ ██║ ██║██╔██╗ ██║ + ██╔══██║██╔══██║╚██╗ ██╔╝██╔══╝ ██╔══╝ ██║ ██║██║╚██╗██║ + ██║ ██║██║ ██║ ╚████╔╝ ███████╗ ██║ ╚██████╔╝██║ ╚████║ + ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═══╝ ╚══════╝ ╚═╝ ╚═════╝ ╚═╝ ╚═══╝ + +""" diff --git a/python/fate_test/fate_test/_client.py b/python/fate_test/fate_test/_client.py new file mode 100644 index 0000000000..84d623c4c3 --- /dev/null +++ b/python/fate_test/fate_test/_client.py @@ -0,0 +1,76 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sshtunnel + +from fate_test._flow_client import FLOWClient +from fate_test._io import LOGGER +from fate_test._parser import Config + + +class Clients(object): + def __init__(self, config: Config): + self._flow_clients = {} + self._tunnel_id_to_flow_clients = {} + self._role_str_to_service_id = {} + self._tunnel_id_to_tunnel = config.tunnel_id_to_tunnel + + for service_id, service in config.service_id_to_service.items(): + if isinstance(service, Config.service): + self._flow_clients[service_id] = FLOWClient( + service.address, config.data_base_dir, config.cache_directory) + + elif isinstance(service, Config.tunnel_service): + self._flow_clients[service_id] = FLOWClient(None, config.data_base_dir, config.cache_directory) + self._tunnel_id_to_flow_clients.setdefault(service.tunnel_id, []).append( + (service.index, self._flow_clients[service_id])) + + for party, service_id in config.party_to_service_id.items(): + for role_str in config.parties.party_to_role_string(party): + self._role_str_to_service_id[role_str] = service_id + + def __getitem__(self, role_str: str) -> 'FLOWClient': + if role_str not in self._role_str_to_service_id: + raise RuntimeError(f"no flow client found binding to {role_str}") + return self._flow_clients[self._role_str_to_service_id[role_str]] + + def __enter__(self): + # open ssh tunnels and create flow clients for remote + self._tunnels = [] + for tunnel_id, tunnel_conf in 
self._tunnel_id_to_tunnel.items(): + tunnel = sshtunnel.SSHTunnelForwarder(ssh_address_or_host=tunnel_conf.ssh_address, + ssh_username=tunnel_conf.ssh_username, + ssh_password=tunnel_conf.ssh_password, + ssh_pkey=tunnel_conf.ssh_priv_key, + remote_bind_addresses=tunnel_conf.services_address) + tunnel.start() + self._tunnels.append(tunnel) + for index, flow_client in self._tunnel_id_to_flow_clients[tunnel_id]: + flow_client.set_address(f"127.0.0.1:{tunnel.local_bind_ports[index]}") + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + for tunnel in self._tunnels: + try: + tunnel.stop() + except Exception as e: + LOGGER.exception(e) + + def contains(self, role_str): + return role_str in self._role_str_to_service_id + + def all_roles(self): + return sorted(self._role_str_to_service_id.keys()) diff --git a/python/fate_test/fate_test/_config.py b/python/fate_test/fate_test/_config.py new file mode 100644 index 0000000000..7b26b69c3c --- /dev/null +++ b/python/fate_test/fate_test/_config.py @@ -0,0 +1,269 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import json +import os +import typing +from collections import namedtuple +from pathlib import Path + +from ruamel import yaml + +template = """\ +# base dir for data upload conf eg, data_base_dir={FATE} +# also used for accessing local files when running standalone mode +# examples/data/breast_hetero_guest.csv -> $data_base_dir/examples/data/breast_hetero_guest.csv +data_base_dir: path(FATE) + +# directory dedicated to fate_test job file storage, default cache location={FATE}/examples/cache/ +cache_directory: examples/cache/ +# directory stores performance benchmark suites, default location={FATE}/examples/benchmark_performance +performance_template_directory: examples/benchmark_performance/ +# directory stores flow test config, default location={FATE}/examples/flow_test_template/hetero_lr/flow_test_config.yaml +# st_config_directory: examples/flow_test_template/hetero_lr/flow_test_config.yaml + +# directory stores testsuite file with min_test data sets to upload, +# default location={FATE}/examples/data/upload_config/min_test_data_testsuite.json +min_test_data_config: examples/data/upload_config/min_test_data_testsuite.json +# directory stores testsuite file with all example data sets to upload, +# default location={FATE}/examples/data/upload_config/all_examples_data_testsuite.json +all_examples_data_config: examples/data/upload_config/all_examples_data_testsuite.json + +# directory where FATE code locates, default installation location={FATE}/fate +# python/federatedml -> $fate_base/python/federatedml +fate_base: path(FATE)/fate + +# whether to delete data in suites after all jobs done +clean_data: true + +# work mode: 0 for standalone, 1 for cluster +work_mode: 0 + +# participating parties' id and correponding flow service ip & port information +parties: + guest: [9999] + host: [10000, 9999] + arbiter: [10000] +services: + - flow_services: + - {address: 127.0.0.1:9380, parties: [9999, 10000]} + serving_setting: + address: 127.0.0.1:8059 + + ssh_tunnel: # 
optional + enable: false + ssh_address: : + ssh_username: + ssh_password: # optional + ssh_priv_key: "~/.ssh/id_rsa" + + +# what is ssh_tunnel? +# to open the ssh tunnel(s) if the remote service +# cannot be accessed directly from the location where the test suite is run! +# +# +---------------------+ +# | ssh address | +# | ssh username | +# | ssh password/ | +# +--------+ | ssh priv_key | +----------------+ +# |local ip+----------ssh tuunel-------------->+remote local ip | +# +--------+ | | +----------------+ +# | | +# request local ip:port +----- as if --------->request remote's local ip:port from remote side +# | | +# | | +# +---------------------+ +# + +""" + +data_base_dir = Path(__file__).resolve().parents[3] +if (data_base_dir / 'examples').is_dir(): + template = template.replace('path(FATE)', str(data_base_dir)) + +_default_config = Path(__file__).resolve().parent / 'fate_test_config.yaml' + +data_switch = None +use_local_data = 1 +data_alter = dict() +deps_alter = dict() +jobs_num = 0 +jobs_progress = 0 +non_success_jobs = [] + + +def create_config(path: Path, override=False): + if path.exists() and not override: + raise FileExistsError(f"{path} exists") + + with path.open("w") as f: + f.write(template) + + +def default_config(): + if not _default_config.exists(): + create_config(_default_config) + return _default_config + + +class Parties(object): + def __init__(self, **kwargs): + """ + mostly, accept guest, host and arbiter + """ + self._role_to_parties = kwargs + + self._party_to_role_string = {} + for role in kwargs: + parties = kwargs[role] + setattr(self, role, parties) + for i, party in enumerate(parties): + if party not in self._party_to_role_string: + self._party_to_role_string[party] = set() + self._party_to_role_string[party].add(f"{role.lower()}_{i}") + + @staticmethod + def from_dict(d: typing.MutableMapping[str, typing.List[int]]): + return Parties(**d) + + def party_to_role_string(self, party): + return self._party_to_role_string[party] + + 
def extract_role(self, counts: typing.MutableMapping[str, int]): + roles = {} + for role, num in counts.items(): + if role not in self._role_to_parties and num > 0: + raise ValueError(f"{role} not found in config") + else: + if len(self._role_to_parties[role]) < num: + raise ValueError(f"require {num} {role} parties, only {len(self._role_to_parties[role])} in config") + roles[role] = self._role_to_parties[role][:num] + return roles + + def extract_initiator_role(self, role): + initiator_role = role.strip() + if len(self._role_to_parties[initiator_role]) < 1: + raise ValueError(f"role {initiator_role} has empty party list") + party_id = self._role_to_parties[initiator_role][0] + return dict(role=initiator_role, party_id=party_id) + + +class Config(object): + service = namedtuple("service", ["address"]) + tunnel_service = namedtuple("tunnel_service", ["tunnel_id", "index"]) + tunnel = namedtuple("tunnel", ["ssh_address", "ssh_username", "ssh_password", "ssh_priv_key", "services_address"]) + + def __init__(self, config): + self.data_base_dir = config["data_base_dir"] + self.cache_directory = os.path.join(config["data_base_dir"], config["cache_directory"]) + self.perf_template_dir = os.path.join(config["data_base_dir"], config["performance_template_directory"]) + # self.flow_test_config_dir = os.path.join(config["data_base_dir"], config["flow_test_config_directory"]) + self.min_test_data_config = os.path.join(config["data_base_dir"], config["min_test_data_config"]) + self.all_examples_data_config = os.path.join(config["data_base_dir"], config["all_examples_data_config"]) + self.fate_base = config["fate_base"] + self.clean_data = config.get("clean_data", True) + self.parties = Parties.from_dict(config["parties"]) + self.role = config["parties"] + self.serving_setting = config["services"][0] + self.party_to_service_id = {} + self.service_id_to_service = {} + self.tunnel_id_to_tunnel = {} + self.extend_sid = None + self.auto_increasing_sid = None + self.work_mode = 
config.get("work_mode", 0) + + tunnel_id = 0 + service_id = 0 + os.makedirs(os.path.dirname(self.cache_directory), exist_ok=True) + for service_config in config["services"]: + flow_services = service_config["flow_services"] + if service_config.get("ssh_tunnel", {}).get("enable", False): + tunnel_id += 1 + services_address = [] + for index, flow_service in enumerate(flow_services): + service_id += 1 + address_host, address_port = flow_service["address"].split(":") + address_port = int(address_port) + services_address.append((address_host, address_port)) + self.service_id_to_service[service_id] = self.tunnel_service(tunnel_id, index) + for party in flow_service["parties"]: + self.party_to_service_id[party] = service_id + tunnel_config = service_config["ssh_tunnel"] + ssh_address_host, ssh_address_port = tunnel_config["ssh_address"].split(":") + self.tunnel_id_to_tunnel[tunnel_id] = self.tunnel((ssh_address_host, int(ssh_address_port)), + tunnel_config["ssh_username"], + tunnel_config["ssh_password"], + tunnel_config["ssh_priv_key"], + services_address) + else: + for flow_service in flow_services: + service_id += 1 + address = flow_service["address"] + self.service_id_to_service[service_id] = self.service(address) + for party in flow_service["parties"]: + self.party_to_service_id[party] = service_id + + @staticmethod + def load(path: typing.Union[str, Path], **kwargs): + if isinstance(path, str): + path = Path(path) + config = {} + if path is not None: + with path.open("r") as f: + config.update(yaml.safe_load(f)) + + if config["data_base_dir"] == "path(FATE)": + raise ValueError("Invalid 'data_base_dir'.") + config["data_base_dir"] = path.resolve().joinpath(config["data_base_dir"]).resolve() + + config.update(kwargs) + return Config(config) + + @staticmethod + def load_from_file(path: typing.Union[str, Path]): + """ + Loads conf content from json or yaml file. 
Used to read in parameter configuration + Parameters + ---------- + path: str, path to conf file, should be absolute path + + Returns + ------- + dict, parameter configuration in dictionary format + + """ + if isinstance(path, str): + path = Path(path) + config = {} + if path is not None: + file_type = path.suffix + with path.open("r") as f: + if file_type == ".yaml": + config.update(yaml.safe_load(f)) + elif file_type == ".json": + config.update(json.load(f)) + else: + raise ValueError(f"Cannot load conf from file type {file_type}") + return config + + +def parse_config(config): + try: + config_inst = Config.load(config) + except Exception as e: + raise RuntimeError(f"error parse config from {config}") from e + return config_inst diff --git a/python/fate_test/fate_test/_flow_client.py b/python/fate_test/fate_test/_flow_client.py new file mode 100644 index 0000000000..0cfafb9d8e --- /dev/null +++ b/python/fate_test/fate_test/_flow_client.py @@ -0,0 +1,376 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os +import time +import typing +from datetime import timedelta +from pathlib import Path + +from fate_client.flow_sdk import FlowClient +from fate_test._parser import Data + +from fate_test import _config + + +class FLOWClient(object): + + def __init__(self, + address: typing.Optional[str], + data_base_dir: typing.Optional[Path], + cache_directory: typing.Optional[Path], + role: str, + party_id: int): + self.address = address + self.version = "2.0.0-beta" + self._client = FlowClient(self.address.split(':')[0], self.address.split(':')[1], self.version) + self._data_base_dir = data_base_dir + self._cache_directory = cache_directory + self.data_size = 0 + self.role = role + self.party_id = party_id + + def set_address(self, address): + self.address = address + + def upload_data(self, data: Data, callback=None, output_path=None): + response = self._upload_data(data, output_path=output_path) + try: + code = response["code"] + if code != 0: + raise ValueError(f"Return code {code}!=0") + + namespace = response["data"]["namespace"] + name = response["data"]["name"] + job_id = response["job_id"] + except BaseException: + raise ValueError(f"Upload data fails, response={response}") + + # self.monitor_status(job_id, role=self.role, party_id=self.party_id) + self._awaiting(job_id, self.role, self.party_id, ) + return dict(namespace=namespace, name=name) + + def delete_data(self, data: Data): + # @todo: use client.table.delete(table=, namespace=) + try: + table_name = data.config['table_name'] if data.config.get( + 'table_name', None) is not None else data.config.get('name') + self._delete_data(table_name=table_name, namespace=data.config['namespace']) + except Exception as e: + raise RuntimeError(f"delete data failed") from e + + def output_data_table(self, job_id, role, party_id, component_name): + result = self._output_data_table(job_id=job_id, role=role, party_id=party_id, component_name=component_name) + return result + + def table_info(self, table_name, 
namespace): + result = self._table_info(table_name=table_name, namespace=namespace) + return result + + def add_notes(self, job_id, role, party_id, notes): + self._add_notes(job_id=job_id, role=role, party_id=party_id, notes=notes) + + """def check_connection(self): + try: + version = self._http.request(method="POST", url=f"{self._base}version/get", json={"module": "FATE"}, + timeout=2).json() + except Exception: + import traceback + traceback.print_exc() + raise + fate_version = version.get("data", {}).get("FATE") + if fate_version: + return fate_version, self.address + + raise EnvironmentError(f"connection not ok")""" + + def _awaiting(self, job_id, role, party_id, callback=None): + while True: + response = self._query_job(job_id, role=role, party_id=party_id) + if response.status.is_done(): + return response.status + if callback is not None: + callback(response) + time.sleep(1) + + def _upload_data(self, data, output_path=None, verbose=0, destroy=1): + conf = data.conf + # if conf.get("engine", {}) != "PATH": + if output_path is not None: + conf['file'] = os.path.join(os.path.abspath(output_path), os.path.basename(conf.get('file'))) + else: + if _config.data_switch is not None: + conf['file'] = os.path.join(str(self._cache_directory), os.path.basename(conf.get('file'))) + else: + conf['file'] = os.path.join(str(self._data_base_dir), conf.get('file')) + path = Path(conf.get('file')) + if not path.exists(): + raise Exception('The file is obtained from the fate flow client machine, but it does not exist, ' + f'please check the path: {path}') + response = self._client.data.upload(file=data.file, + head=data.head, + meta=data.meta, + extend_sid=data.extend_sid, + partitions=data.partitions) + return response + + def _table_info(self, table_name, namespace): + param = { + 'table_name': table_name, + 'namespace': namespace + } + response = self.flow_client(request='table/info', param=param) + return response + + def _delete_data(self, table_name, namespace): + param = 
{ + 'table_name': table_name, + 'namespace': namespace + } + response = self.flow_client(request='table/delete', param=param) + return response + + def _submit_job(self, conf, dsl): + param = { + 'job_dsl': self._save_json(dsl, 'submit_dsl.json'), + 'job_runtime_conf': self._save_json(conf, 'submit_conf.json') + } + response = SubmitJobResponse(self.flow_client(request='job/submit', param=param)) + return response + + def _deploy_model(self, model_id, model_version, dsl=None): + post_data = {'model_id': model_id, + 'model_version': model_version, + 'predict_dsl': dsl} + response = self.flow_client(request='model/deploy', param=post_data) + result = {} + try: + retcode = response['retcode'] + retmsg = response['retmsg'] + if retcode != 0 or retmsg != 'success': + raise RuntimeError(f"deploy model error: {response}") + result["model_id"] = response["data"]["model_id"] + result["model_version"] = response["data"]["model_version"] + except Exception as e: + raise RuntimeError(f"deploy model error: {response}") from e + + return result + + def _output_data_table(self, job_id, role, party_id, component_name): + post_data = {'job_id': job_id, + 'role': role, + 'party_id': party_id, + 'component_name': component_name} + response = self.flow_client(request='component/output_data_table', param=post_data) + result = {} + try: + retcode = response['retcode'] + retmsg = response['retmsg'] + if retcode != 0 or retmsg != 'success': + raise RuntimeError(f"deploy model error: {response}") + result["name"] = response["data"][0]["table_name"] + result["namespace"] = response["data"][0]["table_namespace"] + except Exception as e: + raise RuntimeError(f"output data table error: {response}") from e + return result + + def _get_summary(self, job_id, role, party_id, component_name): + post_data = {'job_id': job_id, + 'role': role, + 'party_id': party_id, + 'component_name': component_name} + response = self.flow_client(request='component/get_summary', param=post_data) + try: + retcode = 
response['retcode'] + retmsg = response['retmsg'] + result = {} + if retcode != 0 or retmsg != 'success': + raise RuntimeError(f"deploy model error: {response}") + result["summary_dir"] = retmsg # 获取summary文件位置 + except Exception as e: + raise RuntimeError(f"output data table error: {response}") from e + return result + + """def _query_job(self, job_id, role): + param = { + 'job_id': job_id, + 'role': role + } + response = QueryJobResponse(self.flow_client(request='job/query', param=param)) + return response""" + + def _query_job(self, job_id, role, party_id): + response = self._client.job.query(job_id, role, party_id) + try: + code = response["code"] + if code != 0: + raise ValueError(f"Return code {code}!=0") + + data = response["data"][0] + return data + except BaseException: + raise ValueError(f"query job is failed, response={response}") + + def get_version(self): + response = self._post(url='version/get', json={"module": "FATE"}) + try: + retcode = response['retcode'] + retmsg = response['retmsg'] + if retcode != 0 or retmsg != 'success': + raise RuntimeError(f"get version error: {response}") + fate_version = response["data"]["FATE"] + except Exception as e: + raise RuntimeError(f"get version error: {response}") from e + return fate_version + + def _add_notes(self, job_id, role, party_id, notes): + data = dict(job_id=job_id, role=role, party_id=party_id, notes=notes) + response = AddNotesResponse(self._post(url='job/update', json=data)) + return response + + def _table_bind(self, data): + response = self._post(url='table/bind', json=data) + try: + retcode = response['retcode'] + retmsg = response['retmsg'] + if retcode != 0 or retmsg != 'success': + raise RuntimeError(f"table bind error: {response}") + except Exception as e: + raise RuntimeError(f"table bind error: {response}") from e + return response + + +class Status(object): + def __init__(self, status: str): + self.status = status + + def is_done(self): + return self.status.lower() in ['complete', 
'success', 'canceled', 'failed', "timeout"] + + def is_success(self): + return self.status.lower() in ['complete', 'success'] + + def __str__(self): + return self.status + + def __repr__(self): + return self.__str__() + + +"""class QueryJobResponse(object): + def __init__(self, response: dict): + try: + status = Status(response.get('data')[0]["f_status"]) + progress = response.get('data')[0]['f_progress'] + except Exception as e: + raise RuntimeError(f"query job error, response: {response}") from e + self.status = status + self.progress = progress""" + + +class UploadDataResponse(object): + def __init__(self, response: dict): + try: + self.job_id = response["jobId"] + except Exception as e: + raise RuntimeError(f"upload error, response: {response}") from e + self.status: typing.Optional[Status] = None + + +class AddNotesResponse(object): + def __init__(self, response: dict): + try: + retcode = response['retcode'] + retmsg = response['retmsg'] + if retcode != 0 or retmsg != 'success': + raise RuntimeError(f"add notes error: {response}") + except Exception as e: + raise RuntimeError(f"add notes error: {response}") from e + + +"""class SubmitJobResponse(object): + def __init__(self, response: dict): + try: + self.job_id = response["jobId"] + self.model_info = response["data"]["model_info"] + except Exception as e: + raise RuntimeError(f"submit job error, response: {response}") from e + self.status: typing.Optional[Status] = None +""" + + +class DataProgress(object): + def __init__(self, role_str): + self.role_str = role_str + self.start = time.time() + self.show_str = f"[{self.elapse()}] {self.role_str}" + self.job_id = "" + + def elapse(self): + return f"{timedelta(seconds=int(time.time() - self.start))}" + + def submitted(self, job_id): + self.job_id = job_id + self.show_str = f"[{self.elapse()}]{self.job_id} {self.role_str}" + + def update(self): + self.show_str = f"[{self.elapse()}]{self.job_id} {self.role_str}" + + def show(self): + return self.show_str + + 
+class JobProgress(object): + def __init__(self, name): + self.name = name + self.start = time.time() + self.show_str = f"[{self.elapse()}] {self.name}" + self.job_id = "" + self.progress_tracking = "" + + def elapse(self): + return f"{timedelta(seconds=int(time.time() - self.start))}" + + def set_progress_tracking(self, progress_tracking): + self.progress_tracking = progress_tracking + " " + + def submitted(self, job_id): + self.job_id = job_id + self.show_str = f"{self.progress_tracking}[{self.elapse()}]{self.job_id} submitted {self.name}" + + def running(self, status, progress): + if progress is None: + progress = 0 + self.show_str = f"{self.progress_tracking}[{self.elapse()}]{self.job_id} {status} {progress:3}% {self.name}" + + def exception(self, exception_id): + self.show_str = f"{self.progress_tracking}[{self.elapse()}]{self.name} exception({exception_id}): {self.job_id}" + + def final(self, status): + self.show_str = f"{self.progress_tracking}[{self.elapse()}]{self.job_id} {status} {self.name}" + + def show(self): + return self.show_str + + +class JobStatus(object): + WAITING = 'waiting' + READY = 'ready' + RUNNING = "running" + CANCELED = "canceled" + TIMEOUT = "timeout" + FAILED = "failed" + PASS = "pass" + SUCCESS = "success" diff --git a/python/fate_test/fate_test/_io.py b/python/fate_test/fate_test/_io.py new file mode 100644 index 0000000000..edfaeee964 --- /dev/null +++ b/python/fate_test/fate_test/_io.py @@ -0,0 +1,70 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +import click +import loguru + +from fate_test._ascii import HEAD, TAIL, BENCHMARK + + +# noinspection PyPep8Naming +class echo(object): + _file = None + + @classmethod + def set_file(cls, file): + cls._file = file + + @classmethod + def echo(cls, message, **kwargs): + click.secho(message, **kwargs) + click.secho(message, file=cls._file, **kwargs) + + @classmethod + def file(cls, message, **kwargs): + click.secho(message, file=cls._file, **kwargs) + + @classmethod + def stdout(cls, message, **kwargs): + click.secho(message, **kwargs) + + @classmethod + def stdout_newline(cls): + click.secho("") + + @classmethod + def welcome(cls, banner_type="testsuite"): + if banner_type == "testsuite": + cls.echo(HEAD) + elif banner_type == "benchmark": + cls.echo(BENCHMARK) + + @classmethod + def farewell(cls): + cls.echo(TAIL) + + @classmethod + def flush(cls): + import sys + sys.stdout.flush() + + +def set_logger(name): + loguru.logger.remove() + loguru.logger.add(name, level='ERROR', delay=True) + return loguru.logger + + +LOGGER = loguru.logger diff --git a/python/fate_test/fate_test/_parser.py b/python/fate_test/fate_test/_parser.py new file mode 100644 index 0000000000..3ab001da29 --- /dev/null +++ b/python/fate_test/fate_test/_parser.py @@ -0,0 +1,587 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import json +import typing +from collections import deque +from pathlib import Path + +import click +import prettytable +from fate_test._config import Parties, Config +from fate_test._io import echo +from fate_test.utils import TxtStyle + +from fate_test import _config + + +# noinspection PyPep8Naming +class chain_hook(object): + def __init__(self): + self._hooks = [] + + def add_hook(self, hook): + self._hooks.append(hook) + return self + + def add_extend_namespace_hook(self, namespace): + self.add_hook(_namespace_hook(namespace)) + return self + + def add_replace_hook(self, mapping): + self.add_hook(_replace_hook(mapping)) + + def hook(self, d): + return self._chain_hooks(self._hooks, d) + + @staticmethod + def _chain_hooks(hook_funcs, d): + for hook_func in hook_funcs: + if d is None: + return + d = hook_func(d) + return d + + +DATA_JSON_HOOK = chain_hook() +CONF_JSON_HOOK = chain_hook() +DSL_JSON_HOOK = chain_hook() + + +class Data(object): + def __init__(self, config: dict, role_str: str): + self.config = config + self.file = config.get("file", "") + self.meta = config.get("meta", {}) + self.partitions = config.get("partitions", 4) + self.head = config.get("head", True) + self.extend_sid = config.get("extend_sid", True) + self.namespace = config.get("namespace", "") + self.table_name = config.get("table_name", "") + self.role_str = role_str + + @staticmethod + def load(config, path: Path): + kwargs = {} + for field_name in config.keys(): + if field_name not in ["file", "role"]: + kwargs[field_name] = config[field_name] + # if config.get("engine", {}) != "PATH": + file_path = path.parent.joinpath(config["file"]).resolve() + if not file_path.exists(): + kwargs["file"] = config["file"] + else: + kwargs["file"] = file_path + role_str = config.get("role") if config.get("role") != "guest" else "guest_0" + return Data(config=kwargs, role_str=role_str) + + def update(self, config: Config): + if config.extend_sid is not None: + self.extend_sid = config.extend_sid 
+ if config.meta is not None: + self.meta.update(config.meta) + + +class JobConf(object): + def __init__(self, initiator: dict, role: dict, job_parameters=None, **kwargs): + self.initiator = initiator + self.role = role + self.job_parameters = job_parameters if job_parameters else {} + self.others_kwargs = kwargs + + def as_dict(self): + return dict( + initiator=self.initiator, + role=self.role, + job_parameters=self.job_parameters, + **self.others_kwargs, + ) + + @staticmethod + def load(path: Path): + with path.open("r") as f: + kwargs = json.load(f, object_hook=CONF_JSON_HOOK.hook) + return JobConf(**kwargs) + + @property + def dsl_version(self): + return self.others_kwargs.get("dsl_version", 1) + + def update( + self, + parties: Parties, + timeout, + job_parameters, + component_parameters, + ): + self.initiator = parties.extract_initiator_role(self.initiator["role"]) + self.role = parties.extract_role( + {role: len(parties) for role, parties in self.role.items()} + ) + if timeout > 0: + self.update_job_common_parameters(timeout=timeout) + + if timeout > 0: + self.update_job_common_parameters(timeout=timeout) + + for key, value in job_parameters.items(): + self.update_parameters(parameters=self.job_parameters, key=key, value=value) + for key, value in component_parameters.items(): + if self.dsl_version == 1: + self.update_parameters( + parameters=self.others_kwargs.get("algorithm_parameters"), + key=key, + value=value, + ) + else: + self.update_parameters( + parameters=self.others_kwargs.get("component_parameters"), + key=key, + value=value, + ) + + def update_parameters(self, parameters, key, value): + if isinstance(parameters, dict): + for keys in parameters: + if keys == key: + parameters.get(key).update(value), + elif isinstance(parameters[keys], dict): + self.update_parameters(parameters[keys], key, value) + + def update_job_common_parameters(self, **kwargs): + if self.dsl_version == 1: + self.job_parameters.update(**kwargs) + else: + 
self.job_parameters.setdefault("common", {}).update(**kwargs) + + def update_job_type(self, job_type="predict"): + if self.dsl_version == 1: + if self.job_parameters.get("job_type", None) is None: + self.job_parameters.update({"job_type": job_type}) + else: + if self.job_parameters.setdefault("common", {}).get("job_type", None) is None: + self.job_parameters.setdefault("common", {}).update({"job_type": job_type}) + + def update_component_parameters(self, key, value, parameters=None): + if parameters is None: + if self.dsl_version == 1: + parameters = self.others_kwargs.get("algorithm_parameters") + else: + parameters = self.others_kwargs.get("component_parameters") + if isinstance(parameters, dict): + for keys in parameters: + if keys == key: + if isinstance(value, dict): + parameters[keys].update(value) + else: + parameters.update({key: value}) + elif ( + isinstance(parameters[keys], dict) and parameters[keys] is not None + ): + self.update_component_parameters(key, value, parameters[keys]) + + def get_component_parameters(self, keys): + if len(keys) == 0: + return self.others_kwargs.get("component_parameters") if self.dsl_version == 2 else self.others_kwargs.get( + "role_parameters") + if self.dsl_version == 1: + parameters = self.others_kwargs.get("role_parameters") + else: + parameters = self.others_kwargs.get("component_parameters").get("role") + + for key in keys: + parameters = parameters[key] + return parameters + + +class JobDSL(object): + def __init__(self, components: dict, provider=None): + self.components = components + self.provider = provider + + @staticmethod + def load(path: Path, provider): + with path.open("r") as f: + kwargs = json.load(f, object_hook=DSL_JSON_HOOK.hook) + if provider is not None: + kwargs["provider"] = provider + return JobDSL(**kwargs) + + def as_dict(self): + if self.provider is None: + return dict(components=self.components) + else: + return dict(components=self.components, provider=self.provider) + + +class Job(object): + 
def __init__( + self, + job_name: str, + job_conf: JobConf, + job_dsl: typing.Optional[JobDSL], + pre_works: list, + ): + self.job_name = job_name + self.job_conf = job_conf + self.job_dsl = job_dsl + self.pre_works = pre_works + + @classmethod + def load(cls, job_name, job_configs, base: Path, provider): + job_conf = JobConf.load(base.joinpath(job_configs.get("conf")).resolve()) + job_dsl = job_configs.get("dsl", None) + if job_dsl is not None: + job_dsl = JobDSL.load(base.joinpath(job_dsl).resolve(), provider) + + pre_works = [] + pre_works_value = {} + deps_dict = {} + + if job_configs.get("model_deps", None): + pre_works.append(job_configs["model_deps"]) + deps_dict["model_deps"] = {'name': job_configs["model_deps"]} + elif job_configs.get("deps", None): + pre_works.append(job_configs["deps"]) + deps_dict["model_deps"] = {'name': job_configs["deps"]} + if job_configs.get("data_deps", None): + deps_dict["data_deps"] = {'data': job_configs["data_deps"]} + pre_works.append(list(job_configs["data_deps"].keys())[0]) + deps_dict["data_deps"].update({'name': list(job_configs["data_deps"].keys())}) + if job_configs.get("cache_deps", None): + pre_works.append(job_configs["cache_deps"]) + deps_dict["cache_deps"] = {'name': job_configs["cache_deps"]} + if job_configs.get("model_loader_deps", None): + pre_works.append(job_configs["model_loader_deps"]) + deps_dict["model_loader_deps"] = {'name': job_configs["model_loader_deps"]} + + pre_works_value.update(deps_dict) + _config.deps_alter[job_name] = pre_works_value + + return Job( + job_name=job_name, job_conf=job_conf, job_dsl=job_dsl, pre_works=pre_works + ) + + @property + def submit_params(self): + return dict( + conf=self.job_conf.as_dict(), + dsl=self.job_dsl.as_dict() if self.job_dsl else None, + ) + + def set_pre_work(self, name, **kwargs): + self.job_conf.update_job_common_parameters(**kwargs) + self.job_conf.update_job_type("predict") + + def set_input_data(self, hierarchys, table_info): + for table_name, hierarchy 
in zip(table_info, hierarchys): + key = list(table_name.keys())[0] + value = table_name[key] + self.job_conf.update_component_parameters( + key=key, + value=value, + parameters=self.job_conf.get_component_parameters(hierarchy), + ) + + def is_submit_ready(self): + return len(self.pre_works) == 0 + + +class PipelineJob(object): + def __init__(self, job_name: str, script_path: Path): + self.job_name = job_name + self.script_path = script_path + + +class Testsuite(object): + def __init__( + self, + dataset: typing.List[Data], + # jobs: typing.List[Job], + pipeline_jobs: typing.List[PipelineJob], + path: Path, + ): + self.dataset = dataset + # self.jobs = jobs + self.pipeline_jobs = pipeline_jobs + self.path = path + self.suite_name = Path(self.path).stem + + self._dependency: typing.MutableMapping[str, typing.List[Job]] = {} + self._final_status: typing.MutableMapping[str, FinalStatus] = {} + self._ready_jobs = deque() + """for job in self.jobs: + for name in job.pre_works: + self._dependency.setdefault(name, []).append(job) + + self._final_status[job.job_name] = FinalStatus(job.job_name) + if job.is_submit_ready(): + self._ready_jobs.appendleft(job)""" + + for job in self.pipeline_jobs: + self._final_status[job.job_name] = FinalStatus(job.job_name) + + @staticmethod + def load(path: Path, provider): + with path.open("r") as f: + testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) + + dataset = [] + for d in testsuite_config.get("data"): + if "use_local_data" not in d: + d.update({"use_local_data": _config.use_local_data}) + dataset.append(Data.load(d, path)) + """jobs = [] + for job_name, job_configs in testsuite_config.get("tasks", {}).items(): + jobs.append( + Job.load(job_name=job_name, job_configs=job_configs, base=path.parent, provider=provider) + )""" + + pipeline_jobs = [] + if testsuite_config.get("tasks", None) is not None and provider is not None: + echo.echo('[Warning] Pipeline does not support parameter: provider-> {}'.format(provider)) + for 
job_name, job_configs in testsuite_config.get("tasks", {}).items(): + script_path = path.parent.joinpath(job_configs["script"]).resolve() + pipeline_jobs.append(PipelineJob(job_name, script_path)) + + testsuite = Testsuite(dataset, pipeline_jobs, path) + return testsuite + + def jobs_iter(self) -> typing.Generator[Job, None, None]: + while self._ready_jobs: + yield self._ready_jobs.pop() + + @staticmethod + def style_table(txt): + colored_txt = txt.replace("success", f"{TxtStyle.TRUE_VAL}success{TxtStyle.END}") + colored_txt = colored_txt.replace("failed", f"{TxtStyle.FALSE_VAL}failed{TxtStyle.END}") + colored_txt = colored_txt.replace("not submitted", f"{TxtStyle.FALSE_VAL}not submitted{TxtStyle.END}") + return colored_txt + + def pretty_final_summary(self, time_consuming, suite_file=None): + """table = prettytable.PrettyTable( + ["job_name", "job_id", "status", "time_consuming", "exception_id", "rest_dependency"] + )""" + table = prettytable.PrettyTable() + table.set_style(prettytable.ORGMODE) + field_names = ["job_name", "job_id", "status", "time_consuming", "exception_id", "rest_dependency"] + table.field_names = field_names + for status in self.get_final_status().values(): + if status.status != "success": + status.suite_file = suite_file + _config.non_success_jobs.append(status) + if status.exception_id != "-": + exception_id_txt = f"{TxtStyle.FALSE_VAL}{status.exception_id}{TxtStyle.END}" + else: + exception_id_txt = f"{TxtStyle.FIELD_VAL}{status.exception_id}{TxtStyle.END}" + table.add_row( + [ + f"{TxtStyle.FIELD_VAL}{status.name}{TxtStyle.END}", + f"{TxtStyle.FIELD_VAL}{status.job_id}{TxtStyle.END}", + self.style_table(status.status), + f"{TxtStyle.FIELD_VAL}{time_consuming.pop(0) if status.job_id != '-' else '-'}{TxtStyle.END}", + f"{exception_id_txt}", + f"{TxtStyle.FIELD_VAL}{','.join(status.rest_dependency)}{TxtStyle.END}", + ] + ) + + return table.get_string(title=f"{TxtStyle.TITLE}Testsuite Summary: {self.suite_name}{TxtStyle.END}") + + def 
model_in_dep(self, name): + return name in self._dependency + + def get_dependent_jobs(self, name): + return self._dependency[name] + + def remove_dependency(self, name): + del self._dependency[name] + + def feed_dep_info(self, job, name, model_info=None, table_info=None, cache_info=None, model_loader_info=None): + if model_info is not None: + job.set_pre_work(name, **model_info) + if table_info is not None: + job.set_input_data(table_info["hierarchy"], table_info["table_info"]) + if cache_info is not None: + job.set_input_data(cache_info["hierarchy"], cache_info["cache_info"]) + if model_loader_info is not None: + job.set_input_data(model_loader_info["hierarchy"], model_loader_info["model_loader_info"]) + if name in job.pre_works: + job.pre_works.remove(name) + if job.is_submit_ready(): + self._ready_jobs.appendleft(job) + + def reflash_configs(self, config: Config): + failed = [] + for job in self.jobs: + try: + job.job_conf.update( + config.parties, None, {}, {} + ) + except ValueError as e: + failed.append((job, e)) + return failed + + def update_status( + self, job_name, job_id: str = None, status: str = None, exception_id: str = None + ): + for k, v in locals().items(): + if k != "job_name" and v is not None: + setattr(self._final_status[job_name], k, v) + + def get_final_status(self): + for name, jobs in self._dependency.items(): + for job in jobs: + self._final_status[job.job_name].rest_dependency.append(name) + return self._final_status + + +class FinalStatus(object): + def __init__( + self, + name: str, + job_id: str = "-", + status: str = "not submitted", + exception_id: str = "-", + rest_dependency: typing.List[str] = None, + ): + self.name = name + self.job_id = job_id + self.status = status + self.exception_id = exception_id + self.rest_dependency = rest_dependency or [] + self.suite_file = None + + +class BenchmarkJob(object): + def __init__(self, job_name: str, script_path: Path, conf_path: Path): + self.job_name = job_name + self.script_path = 
script_path + self.conf_path = conf_path + + +class BenchmarkPair(object): + def __init__( + self, pair_name: str, jobs: typing.List[BenchmarkJob], compare_setting: dict + ): + self.pair_name = pair_name + self.jobs = jobs + self.compare_setting = compare_setting + + +class BenchmarkSuite(object): + def __init__( + self, dataset: typing.List[Data], pairs: typing.List[BenchmarkPair], path: Path + ): + self.dataset = dataset + self.pairs = pairs + self.path = path + + @staticmethod + def load(path: Path): + with path.open("r") as f: + testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) + + dataset = [] + for d in testsuite_config.get("data"): + dataset.append(Data.load(d, path)) + + pairs = [] + for pair_name, pair_configs in testsuite_config.items(): + if pair_name == "data": + continue + jobs = [] + for job_name, job_configs in pair_configs.items(): + if job_name == "compare_setting": + continue + script_path = path.parent.joinpath(job_configs["script"]).resolve() + if job_configs.get("conf"): + conf_path = path.parent.joinpath(job_configs["conf"]).resolve() + else: + conf_path = "" + jobs.append( + BenchmarkJob( + job_name=job_name, script_path=script_path, conf_path=conf_path + ) + ) + compare_setting = pair_configs.get("compare_setting") + if compare_setting and not isinstance(compare_setting, dict): + raise ValueError( + f"expected 'compare_setting' type is dict, received {type(compare_setting)} instead." 
+ ) + pairs.append( + BenchmarkPair( + pair_name=pair_name, jobs=jobs, compare_setting=compare_setting + ) + ) + suite = BenchmarkSuite(dataset=dataset, pairs=pairs, path=path) + return suite + + +def non_success_summary(): + status = {} + for job in _config.non_success_jobs: + if job.status not in status.keys(): + status[job.status] = prettytable.PrettyTable( + ["testsuite_name", "job_name", "job_id", "status", "exception_id", "rest_dependency"] + ) + + status[job.status].add_row( + [ + job.suite_file, + job.name, + job.job_id, + job.status, + job.exception_id, + ",".join(job.rest_dependency), + ] + ) + for k, v in status.items(): + echo.echo("\n" + "#" * 60) + echo.echo(v.get_string(title=f"{k} job record"), fg='red') + + +def _namespace_hook(namespace): + def _hook(d): + if d is None: + return d + if "namespace" in d and namespace: + d["namespace"] = f"{d['namespace']}_{namespace}" + return d + + return _hook + + +def _replace_hook(mapping: dict): + def _hook(d): + for k, v in mapping.items(): + if k in d: + d[k] = v + return d + + return _hook + + +class JsonParamType(click.ParamType): + name = "json_string" + + def convert(self, value, param, ctx): + try: + return json.loads(value) + except ValueError: + self.fail(f"{value} is not a valid json string", param, ctx) + + +JSON_STRING = JsonParamType() diff --git a/python/fate_test/fate_test/scripts/__init__.py b/python/fate_test/fate_test/scripts/__init__.py new file mode 100644 index 0000000000..878d3a9c5d --- /dev/null +++ b/python/fate_test/fate_test/scripts/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/fate_test/fate_test/scripts/_options.py b/python/fate_test/fate_test/scripts/_options.py new file mode 100644 index 0000000000..ae30f748a0 --- /dev/null +++ b/python/fate_test/fate_test/scripts/_options.py @@ -0,0 +1,67 @@ +import time + +import click +from fate_test._config import parse_config, default_config +from fate_test.scripts._utils import _set_namespace + + +class SharedOptions(object): + _options = { + "config": (('-c', '--config'), + dict(type=click.Path(exists=True), help=f"Manual specify config file", default=None), + default_config().__str__()), + "namespace": (('-n', '--namespace'), + dict(type=str, help=f"Manual specify fate_test namespace", default=None), + time.strftime('%Y%m%d%H%M%S')), + "namespace_mangling": (('-nm', '--namespace-mangling',), + dict(type=bool, is_flag=True, help="Mangling data namespace", default=None), + False), + "yes": (('-y', '--yes',), dict(type=bool, is_flag=True, help="Skip double check", default=None), + False), + # "extend_sid": (('--extend_sid',), + # dict(type=bool, is_flag=True, help="whether to append uuid as sid when uploading data", + # default=None), None), + # "auto_increasing_sid": (('--auto_increasing_sid',), + # dict(type=bool, is_flag=True, help="whether to generate sid value starting at 0", + # default=None), None), + # "mode": (('--mode',), dict(type=click.Choice(["cluster", "standalone"]), default="cluster", + # help="job mode, choose from 'cluster' or 'standalone'"), None) + } + + def __init__(self): + self._options_kwargs = {} + + def __getitem__(self, item): + return 
self._options_kwargs[item] + + def get(self, k, default=None): + v = self._options_kwargs.get(k, default) + if v is None and k in self._options: + v = self._options[k][2] + return v + + def update(self, **kwargs): + for k, v in kwargs.items(): + if v is not None: + self._options_kwargs[k] = v + + def post_process(self): + # add defaults here + for k, v in self._options.items(): + if self._options_kwargs.get(k, None) is None: + self._options_kwargs[k] = v[2] + + # update config + config = parse_config(self._options_kwargs['config']) + self._options_kwargs['config'] = config + + _set_namespace(self._options_kwargs['namespace_mangling'], self._options_kwargs['namespace']) + + @classmethod + def get_shared_options(cls, hidden=False): + def shared_options(f): + for name, option in cls._options.items(): + f = click.option(*option[0], **dict(option[1], hidden=hidden))(f) + return f + + return shared_options diff --git a/python/fate_test/fate_test/scripts/_utils.py b/python/fate_test/fate_test/scripts/_utils.py new file mode 100644 index 0000000000..c087300515 --- /dev/null +++ b/python/fate_test/fate_test/scripts/_utils.py @@ -0,0 +1,188 @@ +import glob as glob_ +import importlib +import os +import time +import uuid +from pathlib import Path + +import click +from fate_test._client import Clients +from fate_test._config import Config +from fate_test._flow_client import DataProgress, UploadDataResponse, QueryJobResponse +from fate_test._io import echo, LOGGER, set_logger +from fate_test._parser import Testsuite, BenchmarkSuite, DATA_JSON_HOOK, CONF_JSON_HOOK, DSL_JSON_HOOK + +from fate_test import _config + + +def _big_data_task(includes, guest_data_size, host_data_size, guest_feature_num, host_feature_num, host_data_type, + config_inst, encryption_type, match_rate, sparsity, force, split_host, output_path, parallelize): + from fate_test.scripts import generate_mock_data + + def _find_testsuite_files(path): + suffix = ["testsuite.json", "benchmark.json"] + if 
isinstance(path, str): + path = Path(path) + if path.is_file(): + if path.name.endswith(suffix[0]) or path.name.endswith(suffix[1]): + paths = [path] + else: + LOGGER.warning(f"{path} is file, but not end with `{suffix}`, skip") + paths = [] + return [p.resolve() for p in paths] + else: + os.path.abspath(path) + paths = glob_.glob(f"{path}/*{suffix[0]}") + glob_.glob(f"{path}/*{suffix[1]}") + return [Path(p) for p in paths] + + for include in includes: + if isinstance(include, str): + include_paths = Path(include) + include_paths = _find_testsuite_files(include_paths) + for include_path in include_paths: + generate_mock_data.get_big_data(guest_data_size, host_data_size, guest_feature_num, host_feature_num, + include_path, host_data_type, config_inst, encryption_type, + match_rate, sparsity, force, split_host, output_path, parallelize) + + +def _load_testsuites(includes, excludes, glob, provider=None, suffix="testsuite.json", suite_type="testsuite"): + def _find_testsuite_files(path): + if isinstance(path, str): + path = Path(path) + if path.is_file(): + if path.name.endswith(suffix): + paths = [path] + else: + LOGGER.warning(f"{path} is file, but not end with `{suffix}`, skip") + paths = [] + else: + paths = path.glob(f"**/*{suffix}") + return [p.resolve() for p in paths] + + excludes_set = set() + for exclude in excludes: + excludes_set.update(_find_testsuite_files(exclude)) + + suite_paths = set() + for include in includes: + if isinstance(include, str): + include = Path(include) + + # glob + if glob is not None and include.is_dir(): + include_list = include.glob(glob) + else: + include_list = [include] + for include_path in include_list: + for suite_path in _find_testsuite_files(include_path): + if suite_path not in excludes_set: + suite_paths.add(suite_path) + suites = [] + for suite_path in suite_paths: + try: + if suite_type == "testsuite": + suite = Testsuite.load(suite_path.resolve(), provider) + elif suite_type == "benchmark": + suite = 
BenchmarkSuite.load(suite_path.resolve()) + else: + raise ValueError(f"Unsupported suite type: {suite_type}. Only accept type 'testsuite' or 'benchmark'.") + except Exception as e: + echo.stdout(f"load suite {suite_path} failed: {e}") + else: + suites.append(suite) + return suites + + +@LOGGER.catch +def _upload_data(clients: Clients, suite, config: Config, output_path=None): + with click.progressbar(length=len(suite.dataset), + label="dataset", + show_eta=False, + show_pos=True, + width=24) as bar: + for i, data in enumerate(suite.dataset): + data.update(config) + table_name = data.config['table_name'] if data.config.get( + 'table_name', None) is not None else data.config.get('name') + data_progress = DataProgress(f"{data.role_str}<-{data.config['namespace']}.{table_name}") + + def update_bar(n_step): + bar.item_show_func = lambda x: data_progress.show() + time.sleep(0.1) + bar.update(n_step) + + def _call_back(resp): + if isinstance(resp, UploadDataResponse): + data_progress.submitted(resp.job_id) + echo.file(f"[dataset]{resp.job_id}") + if isinstance(resp, QueryJobResponse): + data_progress.update() + update_bar(0) + + try: + echo.stdout_newline() + status, data_path = clients[data.role_str].upload_data(data, _call_back, output_path) + time.sleep(1) + data_progress.update() + if status != 'success': + raise RuntimeError(f"uploading {i + 1}th data for {suite.path} {status}") + bar.update(1) + if _config.data_switch: + from fate_test.scripts import generate_mock_data + + generate_mock_data.remove_file(data_path) + except Exception: + exception_id = str(uuid.uuid1()) + echo.file(f"exception({exception_id})") + LOGGER.exception(f"exception id: {exception_id}") + echo.echo(f"upload {i + 1}th data {data.config} to {data.role_str} fail, exception_id: {exception_id}") + # raise RuntimeError(f"exception uploading {i + 1}th data") from e + + +def _delete_data(clients: Clients, suite: Testsuite): + with click.progressbar(length=len(suite.dataset), + label="delete ", + 
show_eta=False, + show_pos=True, + width=24) as bar: + for data in suite.dataset: + # noinspection PyBroadException + try: + table_name = data.config['table_name'] if data.config.get( + 'table_name', None) is not None else data.config.get('name') + bar.item_show_func = \ + lambda x: f"delete table: name={table_name}, namespace={data.config['namespace']}" + clients[data.role_str].delete_data(data) + except Exception: + LOGGER.exception( + f"delete failed: name={table_name}, namespace={data.config['namespace']}") + + time.sleep(0.5) + bar.update(1) + echo.stdout_newline() + + +def _load_module_from_script(script_path): + module_name = str(script_path).split("/", -1)[-1].split(".")[0] + loader = importlib.machinery.SourceFileLoader(module_name, str(script_path)) + spec = importlib.util.spec_from_loader(loader.name, loader) + mod = importlib.util.module_from_spec(spec) + loader.exec_module(mod) + return mod + + +def _set_namespace(data_namespace_mangling, namespace): + Path(f"logs/{namespace}").mkdir(exist_ok=True, parents=True) + set_logger(f"logs/{namespace}/exception.log") + echo.set_file(click.open_file(f'logs/{namespace}/stdout', "a")) + + if data_namespace_mangling: + echo.echo(f"add data_namespace_mangling: _{namespace}") + DATA_JSON_HOOK.add_extend_namespace_hook(namespace) + CONF_JSON_HOOK.add_extend_namespace_hook(namespace) + + +def _add_replace_hook(replace): + DATA_JSON_HOOK.add_replace_hook(replace) + CONF_JSON_HOOK.add_replace_hook(replace) + DSL_JSON_HOOK.add_replace_hook(replace) diff --git a/python/fate_test/fate_test/scripts/benchmark_cli.py b/python/fate_test/fate_test/scripts/benchmark_cli.py new file mode 100644 index 0000000000..9030ed9818 --- /dev/null +++ b/python/fate_test/fate_test/scripts/benchmark_cli.py @@ -0,0 +1,151 @@ +import os +import re +import time +import uuid +from datetime import timedelta +from inspect import signature + +import click +from fate_test._client import Clients +from fate_test._config import Config +from 
fate_test._io import LOGGER, echo +from fate_test._parser import BenchmarkSuite +from fate_test.scripts._options import SharedOptions +from fate_test.scripts._utils import _upload_data, _delete_data, _load_testsuites, _load_module_from_script +from fate_test.utils import show_data, match_metrics + +DATA_DISPLAY_PATTERN = re.compile("^FATE") + + +@click.command(name="benchmark-quality") +@click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, metavar="", + help="include *benchmark.json under these paths") +@click.option('-e', '--exclude', type=click.Path(exists=True), multiple=True, + help="exclude *benchmark.json under these paths") +@click.option('-g', '--glob', type=str, + help="glob string to filter sub-directory of path specified by ") +@click.option('-t', '--tol', type=float, + help="tolerance (absolute error) for metrics to be considered almost equal. " + "Comparison is done by evaluating abs(a-b) <= max(relative_tol * max(abs(a), abs(b)), absolute_tol)") +@click.option('-s', '--storage-tag', type=str, + help="tag for storing metrics, for future metrics info comparison") +@click.option('-v', '--history-tag', type=str, multiple=True, + help="Extract metrics info from history tags for comparison") +@click.option('-d', '--match-details', type=click.Choice(['all', 'relative', 'absolute', 'none']), + default="all", help="Error value display in algorithm comparison") +@click.option('--skip-data', is_flag=True, default=False, + help="skip uploading data specified in benchmark conf") +@click.option("--disable-clean-data", "clean_data", flag_value=False, default=None) +@click.option("--enable-clean-data", "clean_data", flag_value=True, default=None) +@SharedOptions.get_shared_options(hidden=True) +@click.pass_context +def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, storage_tag, history_tag, match_details, + **kwargs): + """ + process benchmark suite, alias: bq + """ + ctx.obj.update(**kwargs) + 
ctx.obj.post_process() + namespace = ctx.obj["namespace"] + config_inst = ctx.obj["config"] + if ctx.obj["extend_sid"] is not None: + config_inst.extend_sid = ctx.obj["extend_sid"] + if ctx.obj["auto_increasing_sid"] is not None: + config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"] + if clean_data is None: + clean_data = config_inst.clean_data + data_namespace_mangling = ctx.obj["namespace_mangling"] + yes = ctx.obj["yes"] + + echo.welcome("benchmark") + echo.echo(f"testsuite namespace: {namespace}", fg='red') + echo.echo("loading testsuites:") + suites = _load_testsuites(includes=include, excludes=exclude, glob=glob, + suffix="benchmark.json", suite_type="benchmark") + for suite in suites: + echo.echo(f"\tdataset({len(suite.dataset)}) benchmark groups({len(suite.pairs)}) {suite.path}") + if not yes and not click.confirm("running?"): + return + with Clients(config_inst) as client: + fate_version = client["guest_0"].get_version() + for i, suite in enumerate(suites): + # noinspection PyBroadException + try: + start = time.time() + echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') + if not skip_data: + try: + _upload_data(client, suite, config_inst) + except Exception as e: + raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e + try: + _run_benchmark_pairs(config_inst, suite, tol, namespace, data_namespace_mangling, storage_tag, + history_tag, fate_version, match_details) + except Exception as e: + raise RuntimeError(f"exception occur while running benchmark jobs for {suite.path}") from e + + if not skip_data and clean_data: + _delete_data(client, suite) + echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') + + except Exception: + exception_id = uuid.uuid1() + echo.echo(f"exception in {suite.path}, exception_id={exception_id}", err=True, fg='red') + LOGGER.exception(f"exception id: {exception_id}") + finally: + 
echo.stdout_newline() + echo.farewell() + echo.echo(f"testsuite namespace: {namespace}", fg='red') + + +@LOGGER.catch +def _run_benchmark_pairs(config: Config, suite: BenchmarkSuite, tol: float, namespace: str, + data_namespace_mangling: bool, storage_tag, history_tag, fate_version, match_details): + # pipeline demo goes here + pair_n = len(suite.pairs) + fate_base = config.fate_base + PYTHONPATH = os.environ.get('PYTHONPATH') + ":" + os.path.join(fate_base, "python") + os.environ['PYTHONPATH'] = PYTHONPATH + for i, pair in enumerate(suite.pairs): + echo.echo(f"Running [{i + 1}/{pair_n}] group: {pair.pair_name}") + results = {} + # data_summary = None + job_n = len(pair.jobs) + for j, job in enumerate(pair.jobs): + try: + echo.echo(f"Running [{j + 1}/{job_n}] job: {job.job_name}") + job_name, script_path, conf_path = job.job_name, job.script_path, job.conf_path + param = Config.load_from_file(conf_path) + mod = _load_module_from_script(script_path) + input_params = signature(mod.main).parameters + # local script + if len(input_params) == 1: + data, metric = mod.main(param=param) + elif len(input_params) == 2: + data, metric = mod.main(config=config, param=param) + # pipeline script + elif len(input_params) == 3: + if data_namespace_mangling: + data, metric = mod.main(config=config, param=param, namespace=f"_{namespace}") + else: + data, metric = mod.main(config=config, param=param) + else: + data, metric = mod.main() + results[job_name] = metric + echo.echo(f"[{j + 1}/{job_n}] job: {job.job_name} Success!\n") + if data and DATA_DISPLAY_PATTERN.match(job_name): + # data_summary = data + show_data(data) + # if data_summary is None: + # data_summary = data + except Exception as e: + exception_id = uuid.uuid1() + echo.echo(f"exception while running [{j + 1}/{job_n}] job, exception_id={exception_id}", err=True, + fg='red') + LOGGER.exception(f"exception id: {exception_id}, error message: \n{e}") + continue + rel_tol = pair.compare_setting.get("relative_tol") + # 
show_data(data_summary) + match_metrics(evaluate=True, group_name=pair.pair_name, abs_tol=tol, rel_tol=rel_tol, + storage_tag=storage_tag, history_tag=history_tag, fate_version=fate_version, + cache_directory=config.cache_directory, match_details=match_details, **results) diff --git a/python/fate_test/fate_test/scripts/cli.py b/python/fate_test/fate_test/scripts/cli.py new file mode 100644 index 0000000000..8dc444c7d8 --- /dev/null +++ b/python/fate_test/fate_test/scripts/cli.py @@ -0,0 +1,67 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import click + +from fate_test.scripts._options import SharedOptions +from fate_test.scripts.benchmark_cli import run_benchmark +from fate_test.scripts.config_cli import config_group +from fate_test.scripts.data_cli import data_group +# from fate_test.scripts.flow_test_cli import flow_group +from fate_test.scripts.performance_cli import run_task +from fate_test.scripts.quick_test_cli import unittest_group +# from fate_test.scripts.secure_protocol_cli import secure_protocol_group +from fate_test.scripts.testsuite_cli import run_suite + +commands = { + "config": config_group, + "suite": run_suite, + "performance": run_task, + "benchmark-quality": run_benchmark, + "data": data_group, + "unittest": unittest_group +} + +commands_alias = { + "bq": "benchmark-quality", + "bp": "performance" +} + + +class MultiCLI(click.MultiCommand): + + def list_commands(self, ctx): + return list(commands) + + def get_command(self, ctx, name): + if name not in commands and name in commands_alias: + name = commands_alias[name] + if name not in commands: + ctx.fail("No such command '{}'.".format(name)) + return commands[name] + + +@click.command(cls=MultiCLI, help="A collection of useful tools to running FATE's test.", + context_settings=dict(help_option_names=["-h", "--help"])) +@SharedOptions.get_shared_options() +@click.pass_context +def cli(ctx, **kwargs): + ctx.ensure_object(SharedOptions) + ctx.obj.update(**kwargs) + + +if __name__ == '__main__': + cli(obj=SharedOptions()) diff --git a/python/fate_test/fate_test/scripts/config_cli.py b/python/fate_test/fate_test/scripts/config_cli.py new file mode 100644 index 0000000000..55f0b4c61a --- /dev/null +++ b/python/fate_test/fate_test/scripts/config_cli.py @@ -0,0 +1,79 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pathlib import Path + +import click +from fate_test._client import Clients +from fate_test._config import create_config, default_config, parse_config +from fate_test.scripts._options import SharedOptions + + +@click.group("config", help="fate_test config") +def config_group(): + """ + config fate_test + """ + pass + + +@config_group.command(name="new") +def _new(): + """ + create new fate_test config temperate + """ + create_config(Path("fate_test_config.yaml")) + click.echo(f"create config file: fate_test_config.yaml") + + +@config_group.command(name="edit") +@SharedOptions.get_shared_options(hidden=True) +@click.pass_context +def _edit(ctx, **kwargs): + """ + edit fate_test config file + """ + ctx.obj.update(**kwargs) + config = ctx.obj.get("config") + click.edit(filename=config) + + +@config_group.command(name="show") +def _show(): + """ + show fate_test default config path + """ + click.echo(f"default config path is {default_config()}") + + +@config_group.command(name="check") +@SharedOptions.get_shared_options(hidden=True) +@click.pass_context +def _config(ctx, **kwargs): + """ + check connection + """ + ctx.obj.update(**kwargs) + config_inst = parse_config(ctx.obj.get("config")) + with Clients(config_inst) as clients: + roles = clients.all_roles() + for r in roles: + try: + version, address = clients[r].check_connection() + except Exception as e: + click.echo(f"[X]connection fail, role is {r}, exception is {e.args}") + else: + click.echo(f"[✓]connection {address} ok, fate version is {version}, role is {r}") diff --git 
import json
import os
import re
import sys
import time
import uuid
from datetime import timedelta
from pathlib import Path

import click
# Fix: this import was commented out while `delete` below still calls
# `with Clients(config_inst) as client:` — that raised NameError at runtime.
from fate_test._client import Clients
from fate_test._config import Config
from fate_test._io import LOGGER, echo
from fate_test.scripts._options import SharedOptions
from fate_test.scripts._utils import _load_testsuites, _delete_data, _big_data_task
from ruamel import yaml

from fate_test import _config


@click.group(name="data")
def data_group():
    """
    upload or delete data in suite config files
    """
    ...


@data_group.command("upload")
@click.option('-i', '--include', required=False, type=click.Path(exists=True), multiple=True, metavar="<include>",
              help="include *benchmark.json under these paths")
@click.option('-e', '--exclude', type=click.Path(exists=True), multiple=True,
              help="exclude *benchmark.json under these paths")
@click.option("-t", "--config-type", type=click.Choice(["min_test", "all_examples"]), default="min_test",
              help="config file")
@click.option('-g', '--glob', type=str,
              help="glob string to filter sub-directory of path specified by <include>")
@click.option('-s', '--suite-type', required=False, type=click.Choice(["testsuite", "benchmark"]), default="testsuite",
              help="suite type")
@click.option('-r', '--role', type=str, default='all',
              help="role to process, default to `all`. "
                   "use option likes: `guest_0`, `host_0`, `host`")
@SharedOptions.get_shared_options(hidden=True)
@click.pass_context
def upload(ctx, include, exclude, glob, suite_type, role, config_type, **kwargs):
    """
    upload data defined in suite config files
    """
    ctx.obj.update(**kwargs)
    ctx.obj.post_process()
    namespace = ctx.obj["namespace"]
    config_inst = ctx.obj["config"]
    # Explicit CLI flags override values loaded from the config file.
    if ctx.obj["extend_sid"] is not None:
        config_inst.extend_sid = ctx.obj["extend_sid"]
    if ctx.obj["auto_increasing_sid"] is not None:
        config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]
    yes = ctx.obj["yes"]
    echo.welcome()
    echo.echo(f"testsuite namespace: {namespace}", fg='red')
    if len(include) != 0:
        # Upload the datasets declared by the matched suite files.
        echo.echo("loading testsuites:")
        suffix = "benchmark.json" if suite_type == "benchmark" else "testsuite.json"
        suites = _load_testsuites(includes=include, excludes=exclude, glob=glob,
                                  suffix=suffix, suite_type=suite_type)
        for suite in suites:
            if role != "all":
                # Keep only datasets whose role pattern matches the requested role.
                suite.dataset = [d for d in suite.dataset if re.match(d.role_str, role)]
            echo.echo(f"\tdataset({len(suite.dataset)}) {suite.path}")
        if not yes and not click.confirm("running?"):
            return
        # client_upload(suites=suites, config_inst=config_inst, namespace=namespace)
        # todo: upload with pipeline
    else:
        # No suite paths given: fall back to a predefined data-config file.
        config = get_config(config_inst)
        if config_type == 'min_test':
            config_file = config.min_test_data_config
        else:
            config_file = config.all_examples_data_config

        with open(config_file, 'r', encoding='utf-8') as f:
            upload_data = json.loads(f.read())

        echo.echo(f"\tdataset({len(upload_data['data'])}) {config_file}")
        if not yes and not click.confirm("running?"):
            return
        """with Clients(config_inst) as client:
            data_upload(client, config_inst, upload_data)"""
        # @todo: upload data with pipeline
    echo.farewell()
    echo.echo(f"testsuite namespace: {namespace}", fg='red')


@data_group.command("delete")
@click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, metavar="<include>",
              help="include *benchmark.json under these paths")
@click.option('-e', '--exclude', type=click.Path(exists=True), multiple=True,
              help="exclude *benchmark.json under these paths")
@click.option('-g', '--glob', type=str,
              help="glob string to filter sub-directory of path specified by <include>")
@click.option('-s', '--suite-type', required=True, type=click.Choice(["testsuite", "benchmark"]), help="suite type")
@SharedOptions.get_shared_options(hidden=True)
@click.pass_context
def delete(ctx, include, exclude, glob, yes, suite_type, **kwargs):
    """
    delete data defined in suite config files
    """
    ctx.obj.update(**kwargs)
    ctx.obj.post_process()
    namespace = ctx.obj["namespace"]
    config_inst = ctx.obj["config"]
    echo.welcome()
    echo.echo(f"testsuite namespace: {namespace}", fg='red')
    echo.echo("loading testsuites:")
    suffix = "benchmark.json" if suite_type == "benchmark" else "testsuite.json"

    suites = _load_testsuites(includes=include, excludes=exclude, glob=glob,
                              suffix=suffix, suite_type=suite_type)
    for suite in suites:
        echo.echo(f"\tdataset({len(suite.dataset)}) {suite.path}")
    # Fix: previously the user was asked "running?" twice (once before the
    # datasets were even listed); confirm exactly once, after listing.
    if not yes and not click.confirm("running?"):
        return
    with Clients(config_inst) as client:
        for i, suite in enumerate(suites):
            _delete_data(client, suite)
    echo.farewell()
    echo.echo(f"testsuite namespace: {namespace}", fg='red')


@data_group.command("generate")
@click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, metavar="<include>",
              help="include *testsuite.json / *benchmark.json under these paths")
@click.option('-ht', '--host-data-type', default='tag_value', type=click.Choice(['dense', 'tag', 'tag_value']),
              help="Select the format of the host data")
@click.option('-p', '--encryption-type', type=click.Choice(['sha256', 'md5']),
              help="Entry ID encryption method for, sha256 and md5")
@click.option('-m', '--match-rate', default=1.0, type=float,
              help="Intersection rate relative to guest, between [0, 1]")
@click.option('-s', '--sparsity', default=0.2, type=float,
              help="The sparsity of tag data, The value is between (0-1)")
@click.option('-ng', '--guest-data-size', type=int, default=10000,
              help="Set guest data set size, not less than 100")
@click.option('-nh', '--host-data-size', type=int,
              help="Set host data set size, not less than 100")
@click.option('-fg', '--guest-feature-num', type=int, default=20,
              help="Set guest feature dimensions")
@click.option('-fh', '--host-feature-num', type=int, default=200,
              help="Set host feature dimensions; the default is equal to the number of guest's size")
@click.option('-o', '--output-path', type=click.Path(exists=True),
              help="Customize the output path of generated data")
@click.option('--force', is_flag=True, default=False,
              help="Overwrite existing file")
@click.option('--split-host', is_flag=True, default=False,
              help="Divide the amount of host data equally among all the host tables in TestSuite")
@click.option('--upload-data', is_flag=True, default=False,
              help="Generated data will be uploaded")
@click.option('--remove-data', is_flag=True, default=False,
              help="The generated data will be deleted")
@click.option('--parallelize', is_flag=True, default=False,
              help="It is directly used to upload data, and will not generate data")
@click.option('--use-local-data', is_flag=True, default=False,
              help="The existing data of the server will be uploaded, This parameter is not recommended for "
                   "distributed applications")
@SharedOptions.get_shared_options(hidden=True)
@click.pass_context
def generate(ctx, include, host_data_type, encryption_type, match_rate, sparsity, guest_data_size,
             host_data_size, guest_feature_num, host_feature_num, output_path, force, split_host, upload_data,
             remove_data, use_local_data, parallelize, **kwargs):
    """
    create data defined in suite config files
    """
    ctx.obj.update(**kwargs)
    ctx.obj.post_process()
    namespace = ctx.obj["namespace"]
    config_inst = ctx.obj["config"]
    if ctx.obj["extend_sid"] is not None:
        config_inst.extend_sid = ctx.obj["extend_sid"]
    if ctx.obj["auto_increasing_sid"] is not None:
        config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]
    # --parallelize writes tables directly through a computing session, so a
    # separate upload pass is redundant.
    if parallelize and upload_data:
        upload_data = False
    yes = ctx.obj["yes"]
    echo.welcome()
    echo.echo(f"testsuite namespace: {namespace}", fg='red')
    echo.echo("loading testsuites:")
    if host_data_size is None:
        host_data_size = guest_data_size
    suites = _load_testsuites(includes=include, excludes=tuple(), glob=None)
    suites += _load_testsuites(includes=include, excludes=tuple(), glob=None,
                               suffix="benchmark.json", suite_type="benchmark")
    for suite in suites:
        if upload_data:
            echo.echo(f"\tdataget({len(suite.dataset)}) dataset({len(suite.dataset)}) {suite.path}")
        else:
            echo.echo(f"\tdataget({len(suite.dataset)}) {suite.path}")
    if not yes and not click.confirm("running?"):
        return

    _big_data_task(include, guest_data_size, host_data_size, guest_feature_num, host_feature_num, host_data_type,
                   config_inst, encryption_type, match_rate, sparsity, force, split_host, output_path, parallelize)
    if upload_data:
        if use_local_data:
            _config.use_local_data = 0
        _config.data_switch = remove_data
        # client_upload(suites=suites, config_inst=config_inst, namespace=namespace, output_path=output_path)
        # todo: upload with pipeline


@data_group.command("download")
@click.option("-t", "--type", type=click.Choice(["mnist"]), default="mnist",
              help="config file")
@click.option('-o', '--output-path', type=click.Path(exists=True),
              help="output path of mnist data, the default path is examples/data")
@SharedOptions.get_shared_options(hidden=True)
@click.pass_context
def download_mnists(ctx, output_path, **kwargs):
    """
    download mnist data for flow
    """
    ctx.obj.update(**kwargs)
    ctx.obj.post_process()
    namespace = ctx.obj["namespace"]
    config_inst = ctx.obj["config"]
    yes = ctx.obj["yes"]
    echo.welcome()
    echo.echo(f"testsuite namespace: {namespace}", fg='red')

    if output_path is None:
        config = get_config(config_inst)
        output_path = str(config.data_base_dir) + "/examples/data/"
    if not yes and not click.confirm("running?"):
        return
    try:
        download_mnist(Path(output_path), "mnist_train")
        download_mnist(Path(output_path), "mnist_eval", is_train=False)
    except Exception:
        exception_id = uuid.uuid1()
        echo.echo(f"exception_id={exception_id}")
        LOGGER.exception(f"exception id: {exception_id}")
    finally:
        echo.stdout_newline()
        echo.farewell()
        echo.echo(f"testsuite namespace: {namespace}", fg='red')


@data_group.command("query_schema")
@click.option('-cpn', '--component-name', required=False, type=str, help="component name", default='dataio_0')
@click.option('-j', '--job-id', required=True, type=str, help="job id")
# Fix: the help text for --role previously said "job id" (copy-paste error).
@click.option('-r', '--role', required=True, type=click.Choice(["guest", "host", "arbiter"]), help="role")
@click.option('-p', '--party-id', required=True, type=str, help="party id")
@SharedOptions.get_shared_options(hidden=True)
@click.pass_context
def query_schema(ctx, component_name, job_id, role, party_id, **kwargs):
    """
    query the meta of the output data of a component
    """
    ctx.obj.update(**kwargs)
    ctx.obj.post_process()
    namespace = ctx.obj["namespace"]
    yes = ctx.obj["yes"]
    config_inst = ctx.obj["config"]
    echo.welcome()
    echo.echo(f"testsuite namespace: {namespace}", fg='red')

    if not yes and not click.confirm("running?"):
        return
    # todo: upload data with pipeline
    """with Clients(config_inst) as client:
        query_component_output_data(client, config_inst, component_name, job_id, role, party_id)"""
    echo.farewell()
    echo.echo(f"testsuite namespace: {namespace}", fg='red')


def get_config(conf: Config):
    # Trivial accessor kept for symmetry with callers; returns the Config as-is.
    return conf


def query_component_output_data(clients, config: Config, component_name, job_id, role, party_id):
    """Query a component's output-table schema and echo the feature header."""
    roles = config.role
    clients_role = None
    # Map (role, party_id) back to the client key, e.g. "guest_0".
    for k, v in roles.items():
        if int(party_id) in v and k == role:
            clients_role = role + "_" + str(v.index(int(party_id)))
    try:
        if clients_role is None:
            raise ValueError(f"party id {party_id} does not exist")

        try:
            table_info = clients[clients_role].output_data_table(job_id=job_id, role=role, party_id=party_id,
                                                                component_name=component_name)
            table_info = clients[clients_role].table_info(table_name=table_info['name'],
                                                          namespace=table_info['namespace'])
        except Exception as e:
            raise RuntimeError(f"An exception occurred while getting data {clients_role}<-{component_name}") from e

        echo.echo("query_component_output_data result: {}".format(table_info))
        try:
            header = table_info['data']['schema']['header']
        except ValueError as e:
            raise ValueError(f"Obtain header from table error, error msg: {e}")

        # Skip the id column (header[0]) and report the feature names.
        result = []
        for idx, header_name in enumerate(header[1:]):
            result.append((idx, header_name))
        echo.echo("Queried header is {}".format(result))
    except Exception:
        exception_id = uuid.uuid1()
        echo.echo(f"exception_id={exception_id}")
        LOGGER.exception(f"exception id: {exception_id}")
    finally:
        echo.stdout_newline()


def download_mnist(base, name, is_train=True):
    """Download MNIST via torchvision and convert it to fate's vision layout."""
    import torchvision

    dataset = torchvision.datasets.MNIST(
        root=base.joinpath(".cache"), train=is_train, download=True
    )
    converted_path = base.joinpath(name)
    converted_path.mkdir(exist_ok=True)

    inputs_path = converted_path.joinpath("images")
    inputs_path.mkdir(exist_ok=True)
    targets_path = converted_path.joinpath("targets")
    config_path = converted_path.joinpath("config.yaml")
    filenames_path = converted_path.joinpath("filenames")

    with filenames_path.open("w") as filenames:
        with targets_path.open("w") as targets:
            for idx, (img, target) in enumerate(dataset):
                filename = f"{idx:05d}"
                # Fix: the three f-strings below had lost their {filename}
                # placeholder to extraction garbling ("(unknown)").
                # save img
                img.save(inputs_path.joinpath(f"{filename}.jpg"))
                # save target
                targets.write(f"{filename},{target}\n")
                # save filenames
                filenames.write(f"{filename}\n")

    config = {
        "type": "vision",
        "inputs": {"type": "images", "ext": "jpg", "PIL_mode": "L"},
        "targets": {"type": "integer"},
    }
    with config_path.open("w") as f:
        yaml.safe_dump(config, f, indent=2, default_flow_style=False)


"""def client_upload(suites, config_inst, namespace, output_path=None):
    with Clients(config_inst) as client:
        for i, suite in enumerate(suites):
            # noinspection PyBroadException
            try:
                echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red')
                try:
                    _upload_data(client, suite, config_inst, output_path)
                except Exception as e:
                    raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e
            except Exception:
                exception_id = uuid.uuid1()
                echo.echo(f"exception in {suite.path}, exception_id={exception_id}")
                LOGGER.exception(f"exception id: {exception_id}")
            finally:
                echo.stdout_newline()
        echo.farewell()
        echo.echo(f"testsuite namespace: {namespace}", fg='red')
"""


def data_upload(clients, conf: Config, upload_config):
    """Upload every dataset in upload_config through the guest_0 flow client,
    waiting for each upload job and verifying the resulting row count."""

    def _await_finish(job_id, task_name=None):
        # Poll job status once a second until it leaves running/waiting.
        deadline = time.time() + sys.maxsize
        start = time.time()
        param = dict(
            job_id=job_id,
            role=None
        )
        while True:
            stdout = clients["guest_0"].flow_client("job/query", param)
            status = stdout["data"][0]["f_status"]
            elapse_seconds = int(time.time() - start)
            date = time.strftime('%Y-%m-%d %X')
            if task_name:
                log_msg = f"[{date}][{task_name}]{status}, elapse: {timedelta(seconds=elapse_seconds)}"
            else:
                log_msg = f"[{date}]{job_id} {status}, elapse: {timedelta(seconds=elapse_seconds)}"
            if (status == "running" or status == "waiting") and time.time() < deadline:
                print(log_msg, end="\r")
                time.sleep(1)
                continue
            else:
                print(" " * 60, end="\r")  # clean line
                echo.echo(log_msg)
                return status

    task_data = upload_config["data"]
    for i, data in enumerate(task_data):
        format_msg = f"@{data['file']} >> {data['namespace']}.{data['table_name']}"
        echo.echo(f"[{time.strftime('%Y-%m-%d %X')}]uploading {format_msg}")
        try:
            data["file"] = str(os.path.join(conf.data_base_dir, data["file"]))
            param = dict(
                file=data["file"],
                head=data["head"],
                partition=data["partition"],
                table_name=data["table_name"],
                namespace=data["namespace"]
            )
            stdout = clients["guest_0"].flow_client("data/upload", param, drop=1)
            job_id = stdout.get('jobId', None)
            echo.echo(f"[{time.strftime('%Y-%m-%d %X')}]upload done {format_msg}, job_id={job_id}\n")
            if job_id is None:
                # Flow returns no jobId when the table already exists and drop was refused.
                echo.echo("table already exist. To upload again, Please add '-f 1' in start cmd")
                continue
            _await_finish(job_id)
            param = dict(
                table_name=data["table_name"],
                namespace=data["namespace"]
            )
            stdout = clients["guest_0"].flow_client("table/info", param)

            count = stdout["data"]["count"]
            if count != data["count"]:
                raise AssertionError("Count of upload file is not as expect, count is: {},"
                                     "expect is: {}".format(count, data["count"]))
            echo.echo(f"[{time.strftime('%Y-%m-%d %X')}] check_data_out {stdout} \n")
        except Exception as e:
            exception_id = uuid.uuid1()
            echo.echo(f"exception in {data['file']}, exception_id={exception_id}")
            LOGGER.exception(f"exception id: {exception_id}")
            echo.echo(f"upload {i + 1}th data {data['table_name']} fail, exception_id: {exception_id}")
            # raise RuntimeError(f"exception occur while uploading data for {data['file']}") from e
        finally:
            echo.stdout_newline()
# ==== python/fate_test/fate_test/scripts/generate_mock_data.py ====
import functools
import hashlib
import json
import os
import random
import sys
import threading
import time
import uuid

import numpy as np
import pandas as pd
from fate_test._config import Config
from fate_test._io import echo, LOGGER


def import_fate():
    # Deferred imports of the FATE runtime so that importing this module only
    # fails when fate_arch/fate_flow are actually unavailable.
    from fate_arch import storage
    from fate_flow.utils import data_utils
    from fate_arch import session
    from fate_arch.storage import StorageEngine
    from fate_arch.common.conf_utils import get_base_config
    from fate_arch.storage import EggRollStoreType
    return storage, data_utils, session, StorageEngine, get_base_config, EggRollStoreType


storage, data_utils, session, StorageEngine, get_base_config, EggRollStoreType = import_fate()

sys.setrecursionlimit(1000000)


class data_progress:
    # Shared progress state between the generator thread and the printer
    # thread started in data_save() below.
    def __init__(self, down_load, time_start):
        self.time_start = time_start
        self.down_load = down_load   # progress-bar label line
        self.time_percent = 0        # current percentage (0-100)
        self.switch = True           # printer thread keeps running while True

    def set_switch(self, switch):
        self.switch = switch

    def get_switch(self):
        return self.switch

    def set_time_percent(self, time_percent):
        self.time_percent = time_percent

    def get_time_percent(self):
        return self.time_percent

    def progress(self, percent):
        # Render the progress line in place (carriage return, no newline).
        if percent > 100:
            percent = 100
        end = time.time()
        if percent != 100:
            print(f"\r{self.down_load} %.f%s [%s] running" % (percent, '%', self.timer(end - self.time_start)),
                  flush=True, end='')
        else:
            print(f"\r{self.down_load} %.f%s [%s] success" % (percent, '%', self.timer(end - self.time_start)),
                  flush=True, end='')

    @staticmethod
    def timer(times):
        # Format a duration in seconds as HH:MM:SS.
        hours, rem = divmod(times, 3600)
        minutes, seconds = divmod(rem, 60)
        return "{:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds))


def remove_file(path):
    os.remove(path)


def id_encryption(encryption_type, start_num, end_num):
    # Produce ids for [start_num, end_num): hashed with md5/sha256, or the
    # plain decimal string for any other encryption_type value.
    if encryption_type == 'md5':
        return [hashlib.md5(bytes(str(value), encoding='utf-8')).hexdigest() for value in range(start_num, end_num)]
    elif encryption_type == 'sha256':
        return [hashlib.sha256(bytes(str(value), encoding='utf-8')).hexdigest() for value in range(start_num, end_num)]
    else:
        return [str(value) for value in range(start_num, end_num)]


def get_big_data(guest_data_size, host_data_size, guest_feature_num, host_feature_num, include_path, host_data_type,
                 conf: Config, encryption_type, match_rate, sparsity, force, split_host, output_path, parallelize):
    # Generate mock guest/host datasets described by a testsuite config file,
    # either as CSV files on disk or (with parallelize) directly into FATE
    # storage tables. `big_data_dir` is a module-level global set below.
    global big_data_dir

    def list_tag_value(feature_nums, head):
        # data = ''
        # for f in range(feature_nums):
        #     data += head[f] + ':' + str(round(np.random.randn(), 4)) + ";"
        # return data[:-1]
        return ";".join([head[k] + ':' + str(round(v, 4)) for k, v in enumerate(np.random.randn(feature_nums))])

    def list_tag(feature_nums, data_list):
        # Random sample of tags joined with ';'.
        data = ''
        for f in range(feature_nums):
            data += random.choice(data_list) + ";"
        return data[:-1]

    def _generate_tag_value_data(data_path, start_num, end_num, feature_nums, progress):
        # Append "id,<x0:v;x1:v;...>" rows to data_path in ~1% batches,
        # updating the progress bar per batch.
        data_num = end_num - start_num
        section_data_size = round(data_num / 100)
        iteration = round(data_num / section_data_size)
        head = ['x' + str(i) for i in range(feature_nums)]
        for batch in range(iteration + 1):
            progress.set_time_percent(batch)
            output_data = pd.DataFrame(columns=["id"])
            if section_data_size * (batch + 1) <= data_num:
                output_data["id"] = id_encryption(encryption_type, section_data_size * batch + start_num,
                                                 section_data_size * (batch + 1) + start_num)
                slicing_data_size = section_data_size
            elif section_data_size * batch < data_num:
                # Final, possibly shorter batch.
                output_data['id'] = id_encryption(encryption_type, section_data_size * batch + start_num, end_num)
                slicing_data_size = data_num - section_data_size * batch
            else:
                break
            feature = [list_tag_value(feature_nums, head) for i in range(slicing_data_size)]
            output_data['feature'] = feature
            output_data.to_csv(data_path, mode='a+', index=False, header=False)

    def _generate_dens_data(data_path, start_num, end_num, feature_nums, label_flag, progress):
        # Dense CSV: header row first, then batches of id[,y],x0..xn rows.
        if label_flag:
            head_1 = ['id', 'y']
        else:
            head_1 = ['id']
        data_num = end_num - start_num
        head_2 = ['x' + str(i) for i in range(feature_nums)]
        df_data_1 = pd.DataFrame(columns=head_1)
        head_data = pd.DataFrame(columns=head_1 + head_2)
        head_data.to_csv(data_path, mode='a+', index=False)
        section_data_size = round(data_num / 100)
        iteration = round(data_num / section_data_size)
        for batch in range(iteration + 1):
            progress.set_time_percent(batch)
            if section_data_size * (batch + 1) <= data_num:
                df_data_1["id"] = id_encryption(encryption_type, section_data_size * batch + start_num,
                                                section_data_size * (batch + 1) + start_num)
                slicing_data_size = section_data_size
            elif section_data_size * batch < data_num:
                df_data_1 = pd.DataFrame(columns=head_1)
                df_data_1["id"] = id_encryption(encryption_type, section_data_size * batch + start_num, end_num)
                slicing_data_size = data_num - section_data_size * batch
            else:
                break
            if label_flag:
                # Binary label drawn uniformly from {0, 1}.
                df_data_1["y"] = [round(np.random.random()) for x in range(slicing_data_size)]
            feature = np.random.randint(-10000, 10000, size=[slicing_data_size, feature_nums]) / 10000
            df_data_2 = pd.DataFrame(feature, columns=head_2)
            output_data = pd.concat([df_data_1, df_data_2], axis=1)
            output_data.to_csv(data_path, mode='a+', index=False, header=False)

    def _generate_tag_data(data_path, start_num, end_num, feature_nums, sparsity, progress):
        # Tag CSV: each row is an id plus feature_nums tags sampled from a
        # pool of size feature_nums/sparsity.
        data_num = end_num - start_num
        section_data_size = round(data_num / 100)
        iteration = round(data_num / section_data_size)
        valid_set = [x for x in range(2019120799, 2019120799 + round(feature_nums / sparsity))]
        data = list(map(str, valid_set))
        for batch in range(iteration + 1):
            progress.set_time_percent(batch)
            output_data = pd.DataFrame(columns=["id"])
            if section_data_size * (batch + 1) <= data_num:
                output_data["id"] = id_encryption(encryption_type, section_data_size * batch + start_num,
                                                 section_data_size * (batch + 1) + start_num)
                slicing_data_size = section_data_size
            elif section_data_size * batch < data_num:
                output_data["id"] = id_encryption(encryption_type, section_data_size * batch + start_num, end_num)
                slicing_data_size = data_num - section_data_size * batch
            else:
                break
            feature = [list_tag(feature_nums, data_list=data) for i in range(slicing_data_size)]
            output_data['feature'] = feature
            output_data.to_csv(data_path, mode='a+', index=False, header=False)

    def _generate_parallelize_data(start_num, end_num, feature_nums, table_name, namespace, label_flag, data_type,
                                   partition, progress):
        # Build the table inside the computing session (`sess` is the
        # enclosing with-session from the caller) instead of writing CSVs.
        def expand_id_range(k, v):
            # flatMap kernel: expand one (offset, feature_count) seed pair
            # into up to `step` (id, row-string) records.
            if label_flag:
                return [(id_encryption(encryption_type, ids, ids + 1)[0],
                         ",".join([str(round(np.random.random()))] + [str(round(i, 4)) for i in np.random.randn(v)]))
                        for ids in range(int(k), min(step + int(k), end_num))]
            else:
                if data_type == 'tag':
                    valid_set = [x for x in range(2019120799, 2019120799 + round(feature_nums / sparsity))]
                    data = list(map(str, valid_set))
                    return [(id_encryption(encryption_type, ids, ids + 1)[0],
                             ";".join([random.choice(data) for i in range(int(v))]))
                            for ids in range(int(k), min(step + int(k), data_num))]

                elif data_type == 'tag_value':
                    # NOTE(review): `f"x{i}"` uses the random float `i` itself
                    # as the tag name — looks like it was meant to be an index;
                    # confirm against expected tag_value format.
                    return [(id_encryption(encryption_type, ids, ids + 1)[0],
                             ";".join([f"x{i}" + ':' + str(round(i, 4)) for i in np.random.randn(v)]))
                            for ids in range(int(k), min(step + int(k), data_num))]
                elif data_type == 'dense':
                    return [(id_encryption(encryption_type, ids, ids + 1)[0],
                             ",".join([str(round(i, 4)) for i in np.random.randn(v)]))
                            for ids in range(int(k), min(step + int(k), data_num))]

        data_num = end_num - start_num
        step = 10000 if data_num > 10000 else int(data_num / 10)
        table_list = [(f"{i * step}", f"{feature_nums}") for i in range(int(data_num / step) + start_num)]
        table = sess.computing.parallelize(table_list, partition=partition, include_key=True)
        table = table.flatMap(functools.partial(expand_id_range))
        if label_flag:
            schema = {"sid": "id", "header": ",".join(["y"] + [f"x{i}" for i in range(feature_nums)])}
        else:
            schema = {"sid": "id", "header": ",".join([f"x{i}" for i in range(feature_nums)])}
        if data_type != "dense":
            # Sparse formats carry no fixed header.
            schema = None

        # Replace any pre-existing table of the same name/namespace.
        h_table = sess.get_table(name=table_name, namespace=namespace)
        if h_table:
            h_table.destroy()

        table_meta = sess.persistent(computing_table=table, name=table_name, namespace=namespace, schema=schema)

        storage_session = sess.storage()
        s_table = storage_session.get_table(namespace=table_meta.get_namespace(), name=table_meta.get_name())
        if s_table.count() == data_num:
            progress.set_time_percent(100)
        from fate_flow.manager.data_manager import DataTableTracker
        DataTableTracker.create_table_tracker(
            table_name=table_name,
            table_namespace=namespace,
            entity_info={}
        )

    def data_save(data_info, table_names, namespaces, partition_list):
        # Drive generation for every dataset in data_info, one at a time, with
        # a background thread printing the progress bar.
        data_count = 0
        for idx, data_name in enumerate(data_info.keys()):
            label_flag = True if 'guest' in data_info[data_name] else False
            data_type = 'dense' if 'guest' in data_info[data_name] else host_data_type
            if split_host and ('host' in data_info[data_name]):
                # Split host rows evenly across the host tables in the suite.
                host_end_num = int(np.ceil(host_data_size / len(data_info))) * (data_count + 1) if np.ceil(
                    host_data_size / len(data_info)) * (data_count + 1) <= host_data_size else host_data_size
                host_start_num = int(np.ceil(host_data_size / len(data_info))) * data_count
                data_count += 1
            else:
                host_end_num = host_data_size
                host_start_num = 0
            out_path = os.path.join(str(big_data_dir), data_name)
            if os.path.exists(out_path) and os.path.isfile(out_path) and not parallelize:
                if force:
                    remove_file(out_path)
                else:
                    echo.echo('{} Already exists'.format(out_path))
                    continue
            data_i = (idx + 1) / len(data_info)
            downLoad = f'dataget [{"#" * int(24 * data_i)}{"-" * (24 - int(24 * data_i))}] {idx + 1}/{len(data_info)}'
            start = time.time()
            progress = data_progress(downLoad, start)
            thread = threading.Thread(target=run, args=[progress])
            thread.start()

            try:
                if 'guest' in data_info[data_name]:
                    if not parallelize:
                        _generate_dens_data(out_path, guest_start_num, guest_end_num,
                                            guest_feature_num, label_flag, progress)
                    else:
                        _generate_parallelize_data(
                            guest_start_num,
                            guest_end_num,
                            guest_feature_num,
                            table_names[idx],
                            namespaces[idx],
                            label_flag,
                            data_type,
                            partition_list[idx],
                            progress)
                else:
                    if data_type == 'tag' and not parallelize:
                        _generate_tag_data(out_path, host_start_num, host_end_num, host_feature_num, sparsity, progress)
                    elif data_type == 'tag_value' and not parallelize:
                        _generate_tag_value_data(out_path, host_start_num, host_end_num, host_feature_num, progress)
                    elif data_type == 'dense' and not parallelize:
                        _generate_dens_data(out_path, host_start_num, host_end_num,
                                            host_feature_num, label_flag, progress)
                    elif parallelize:
                        _generate_parallelize_data(
                            host_start_num,
                            host_end_num,
                            host_feature_num,
                            table_names[idx],
                            namespaces[idx],
                            label_flag,
                            data_type,
                            partition_list[idx],
                            progress)
                progress.set_switch(False)
                time.sleep(1)
            except Exception:
                exception_id = uuid.uuid1()
                echo.echo(f"exception_id={exception_id}")
                LOGGER.exception(f"exception id: {exception_id}")
            finally:
                # Always stop the printer thread, even on failure.
                progress.set_switch(False)
                echo.stdout_newline()

    def run(p):
        # Printer loop executed on a background thread.
        while p.get_switch():
            time.sleep(1)
            p.progress(p.get_time_percent())

    if not match_rate > 0 or not match_rate <= 1:
        raise Exception(f"The value is between (0-1), Please check match_rate:{match_rate}")
    # Offset guest ids so that exactly match_rate of them overlap host ids.
    guest_start_num = host_data_size - int(guest_data_size * match_rate)
    guest_end_num = guest_start_num + guest_data_size

    # NOTE(review): checks with os.path.isfile but reads via Path.open —
    # include_path is presumably a pathlib.Path; confirm against callers.
    if os.path.isfile(include_path):
        with include_path.open("r") as f:
            testsuite_config = json.load(f)
    else:
        raise Exception(f'Input file error, please check{include_path}.')
    try:
        if output_path is not None:
            big_data_dir = os.path.abspath(output_path)
        else:
            big_data_dir = os.path.abspath(conf.cache_directory)
    except Exception:
        raise Exception('{}path does not exist'.format(big_data_dir))
    date_set = {}
    table_name_list = []
    table_namespace_list = []
    partition_list = []
    for upload_dict in testsuite_config.get('data'):
        # Keyed by file basename; value is the owning role string.
        date_set[os.path.basename(upload_dict.get('file'))] = upload_dict.get('role')
        table_name_list.append(upload_dict.get('table_name'))
        table_namespace_list.append(upload_dict.get('namespace'))
        partition_list.append(upload_dict.get('partition', 8))

    if parallelize:
        # data_save's nested _generate_parallelize_data reads `sess` from
        # this enclosing scope.
        with session.Session() as sess:
            session_id = str(uuid.uuid1())
            sess.init_computing(session_id)
            data_save(
                data_info=date_set,
                table_names=table_name_list,
                namespaces=table_namespace_list,
                partition_list=partition_list)
    else:
        data_save(
            data_info=date_set,
            table_names=table_name_list,
            namespaces=table_namespace_list,
            partition_list=partition_list)
    echo.echo(f'Data storage address, please check{big_data_dir}')

# ==== python/fate_test/fate_test/scripts/performance_cli.py ====
#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
+# +import glob +import json +import os +import time +import uuid +from datetime import timedelta + +import click +from fate_test._client import Clients +from fate_test._config import Config +from fate_test._flow_client import JobProgress, SubmitJobResponse, QueryJobResponse +from fate_test._io import LOGGER, echo +from fate_test._parser import JSON_STRING, Testsuite +from fate_test.scripts._options import SharedOptions +from fate_test.scripts._utils import _load_testsuites, _upload_data, _delete_data, _load_module_from_script, \ + _add_replace_hook +from fate_test.utils import TxtStyle +from prettytable import PrettyTable, ORGMODE + +from fate_test import _config + + +@click.command("performance") +@click.option('-t', '--job-type', type=click.Choice(['intersect', 'intersect_multi', 'hetero_lr', 'hetero_sbt']), + help="Select the job type, you can also set through include") +@click.option('-i', '--include', type=click.Path(exists=True), multiple=True, metavar="", + help="include *testsuite.json under these paths") +@click.option('-r', '--replace', default="{}", type=JSON_STRING, + help="a json string represents mapping for replacing fields in data/conf/dsl") +@click.option('-m', '--timeout', type=int, default=3600, + help="maximun running time of job") +@click.option('-e', '--max-iter', type=int, help="When the algorithm model is LR, the number of iterations is set") +@click.option('-d', '--max-depth', type=int, + help="When the algorithm model is SecureBoost, set the number of model layers") +@click.option('-nt', '--num-trees', type=int, help="When the algorithm model is SecureBoost, set the number of trees") +@click.option('-p', '--task-cores', type=int, help="processors per node") +@click.option('-uj', '--update-job-parameters', default="{}", type=JSON_STRING, + help="a json string represents mapping for replacing fields in conf.job_parameters") +@click.option('-uc', '--update-component-parameters', default="{}", type=JSON_STRING, + help="a json string 
represents mapping for replacing fields in conf.component_parameters") +@click.option('-s', '--storage-tag', type=str, + help="tag for storing performance time consuming, for future comparison") +@click.option('-v', '--history-tag', type=str, multiple=True, + help="Extract performance time consuming from history tags for comparison") +@click.option("--skip-data", is_flag=True, default=False, + help="skip uploading data specified in testsuite") +@click.option("--provider", type=str, + help="Select the fate version, for example: fate@1.7") +@click.option("--disable-clean-data", "clean_data", flag_value=False, default=None) +@SharedOptions.get_shared_options(hidden=True) +@click.pass_context +def run_task(ctx, job_type, include, replace, timeout, update_job_parameters, update_component_parameters, max_iter, + max_depth, num_trees, task_cores, storage_tag, history_tag, skip_data, clean_data, provider, **kwargs): + """ + Test the performance of big data tasks, alias: bp + """ + ctx.obj.update(**kwargs) + ctx.obj.post_process() + config_inst = ctx.obj["config"] + if ctx.obj["extend_sid"] is not None: + config_inst.extend_sid = ctx.obj["extend_sid"] + if ctx.obj["auto_increasing_sid"] is not None: + config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"] + namespace = ctx.obj["namespace"] + yes = ctx.obj["yes"] + data_namespace_mangling = ctx.obj["namespace_mangling"] + if clean_data is None: + clean_data = config_inst.clean_data + + def get_perf_template(conf: Config, job_type): + perf_dir = os.path.join(os.path.abspath(conf.perf_template_dir) + '/' + job_type + '/' + "*testsuite.json") + return glob.glob(perf_dir) + + if not include: + include = get_perf_template(config_inst, job_type) + # prepare output dir and json hooks + _add_replace_hook(replace) + + echo.welcome() + echo.echo(f"testsuite namespace: {namespace}", fg='red') + echo.echo("loading testsuites:") + suites = _load_testsuites(includes=include, excludes=tuple(), glob=None, provider=provider) + for 
def _submit_job(clients: Clients, suite: Testsuite, namespace: str, config: Config, timeout, update_job_parameters,
                storage_tag, history_tag, update_component_parameters, max_iter, max_depth, num_trees, task_cores):
    """Submit every DSL job of *suite* in order, resolve inter-job dependencies,
    record per-job wall-clock time, and return the times as "<seconds>s" strings.

    NOTE(review): reconstructed from a line-mangled patch; indentation of the
    dependency-resolution section follows upstream fate_test — confirm against
    the original file.
    """
    # submit jobs
    with click.progressbar(length=len(suite.jobs),
                           label="jobs",
                           show_eta=False,
                           show_pos=True,
                           width=24) as bar:
        time_list = []
        for job in suite.jobs_iter():
            start = time.time()
            job_progress = JobProgress(job.job_name)

            def _raise():
                # tag the failure with a unique id so logs can be correlated
                exception_id = str(uuid.uuid1())
                job_progress.exception(exception_id)
                suite.update_status(job_name=job.job_name, exception_id=exception_id)
                echo.file(f"exception({exception_id})")
                LOGGER.exception(f"exception id: {exception_id}")

            # apply CLI overrides (max_iter/max_depth/num_trees/task_cores) to the job conf
            # noinspection PyBroadException
            try:
                if max_iter is not None:
                    job.job_conf.update_component_parameters('max_iter', max_iter)
                if max_depth is not None:
                    job.job_conf.update_component_parameters('max_depth', max_depth)
                if num_trees is not None:
                    job.job_conf.update_component_parameters('num_trees', num_trees)
                if task_cores is not None:
                    job.job_conf.update_job_common_parameters(task_cores=task_cores)
                job.job_conf.update(config.parties, timeout, update_job_parameters, update_component_parameters)
            except Exception:
                _raise()
                continue

            def update_bar(n_step):
                # refresh the progress line with the latest job status text
                bar.item_show_func = lambda x: job_progress.show()
                time.sleep(0.1)
                bar.update(n_step)

            update_bar(1)

            def _call_back(resp: SubmitJobResponse):
                # invoked by the client while the job runs; updates progress state
                if isinstance(resp, SubmitJobResponse):
                    job_progress.submitted(resp.job_id)
                    echo.file(f"[jobs] {resp.job_id} ", nl=False)
                    suite.update_status(job_name=job.job_name, job_id=resp.job_id)

                if isinstance(resp, QueryJobResponse):
                    job_progress.running(resp.status, resp.progress)

                update_bar(0)

            # noinspection PyBroadException
            try:
                response = clients["guest_0"].submit_job(job=job, callback=_call_back)

                # noinspection PyBroadException
                try:
                    # add notes (best-effort; failures here are deliberately ignored)
                    notes = f"{job.job_name}@{suite.path}@{namespace}"
                    for role, party_id_list in job.job_conf.role.items():
                        for i, party_id in enumerate(party_id_list):
                            clients[f"{role}_{i}"].add_notes(job_id=response.job_id, role=role, party_id=party_id,
                                                             notes=notes)
                except Exception:
                    pass
            except Exception:
                _raise()
            else:
                job_progress.final(response.status)
                suite.update_status(job_name=job.job_name, status=response.status.status)
                if response.status.is_success():
                    # feed this job's outputs (data/model/cache/model-loader) into jobs that depend on it
                    if suite.model_in_dep(job.job_name):
                        dependent_jobs = suite.get_dependent_jobs(job.job_name)
                        for predict_job in dependent_jobs:
                            model_info, table_info, cache_info, model_loader_info = None, None, None, None
                            for i in _config.deps_alter[predict_job.job_name]:
                                if isinstance(i, dict):
                                    name = i.get('name')
                                    data_pre = i.get('data')

                            if 'data_deps' in _config.deps_alter[predict_job.job_name]:
                                # map upstream component output tables into the predict job's inputs
                                roles = list(data_pre.keys())
                                table_info, hierarchy = [], []
                                for role_ in roles:
                                    role, index = role_.split("_")
                                    input_ = data_pre[role_]
                                    for data_input, cpn in input_.items():
                                        try:
                                            table_name = clients["guest_0"].output_data_table(
                                                job_id=response.job_id,
                                                role=role,
                                                party_id=config.role[role][int(index)],
                                                component_name=cpn)
                                        except Exception:
                                            _raise()
                                        if predict_job.job_conf.dsl_version == 2:
                                            hierarchy.append([role, index, data_input])
                                            table_info.append({'table': table_name})
                                        else:
                                            hierarchy.append([role, 'args', 'data'])
                                            table_info.append({data_input: [table_name]})
                                table_info = {'hierarchy': hierarchy, 'table_info': table_info}
                            if 'model_deps' in _config.deps_alter[predict_job.job_name]:
                                if predict_job.job_conf.dsl_version == 2:
                                    # noinspection PyBroadException
                                    try:
                                        model_info = clients["guest_0"].deploy_model(
                                            model_id=response.model_info["model_id"],
                                            model_version=response.model_info["model_version"],
                                            dsl=predict_job.job_dsl.as_dict())
                                    except Exception:
                                        _raise()
                                else:
                                    model_info = response.model_info
                            if 'cache_deps' in _config.deps_alter[predict_job.job_name]:
                                # reference this job's id for every CacheLoader component
                                cache_dsl = predict_job.job_dsl.as_dict()
                                cache_info = []
                                for cpn in cache_dsl.get("components").keys():
                                    if "CacheLoader" in cache_dsl.get("components").get(cpn).get("module"):
                                        cache_info.append({cpn: {'job_id': response.job_id}})
                                cache_info = {'hierarchy': [""], 'cache_info': cache_info}
                            if 'model_loader_deps' in _config.deps_alter[predict_job.job_name]:
                                # pass this job's model info to every ModelLoader component
                                model_loader_dsl = predict_job.job_dsl.as_dict()
                                model_loader_info = []
                                for cpn in model_loader_dsl.get("components").keys():
                                    if "ModelLoader" in model_loader_dsl.get("components").get(cpn).get("module"):
                                        model_loader_info.append({cpn: response.model_info})
                                model_loader_info = {'hierarchy': [""], 'model_loader_info': model_loader_info}

                            suite.feed_dep_info(predict_job, name, model_info=model_info, table_info=table_info,
                                                cache_info=cache_info, model_loader_info=model_loader_info)
                        suite.remove_dependency(job.job_name)
            update_bar(0)
            time_consuming = time.time() - start
            performance_dir = "/".join(
                [os.path.join(os.path.abspath(config.cache_directory), 'benchmark_history', "performance.json")])
            fate_version = clients["guest_0"].get_version()
            if history_tag:
                # compare against historical timings recorded under these tags
                history_tag = ["_".join([i, job.job_name]) for i in history_tag]
                comparison_quality(job.job_name, history_tag, performance_dir, time_consuming)
            if storage_tag:
                # persist this run's timing for future comparisons
                storage_tag = "_".join(['FATE', fate_version, storage_tag, job.job_name])
                save_quality(storage_tag, performance_dir, time_consuming)
            echo.stdout_newline()
            time_list.append(time_consuming)
    return [str(int(i)) + "s" for i in time_list]
def save_quality(storage_tag, save_dir, time_consuming):
    """Record *time_consuming* under *storage_tag* in the JSON history file at *save_dir*.

    Creates the parent directory on demand, merges with any existing history,
    and reports success or failure on stdout.
    """
    os.makedirs(os.path.dirname(save_dir), exist_ok=True)
    # load the existing history, if any, so earlier tags are preserved
    history = {}
    if os.path.exists(save_dir):
        with open(save_dir, 'r') as history_file:
            history = json.load(history_file, object_hook=dict)
    history[storage_tag] = time_consuming
    try:
        with open(save_dir, 'w') as history_file:
            json.dump(history, history_file, indent=2)
        print("\n" + "Storage successful, please check: ", save_dir)
    except Exception:
        print("\n" + "Storage failed, please check: ", save_dir)
def run_test(includes, conf: Config, error_log_file):
    """Discover and run federatedml unit-test scripts, appending failures to *error_log_file*.

    Parameters
    ----------
    includes: paths to search for test files; when empty, the whole
        ``python/federatedml`` tree under the configured FATE base is scanned
    conf: Config, supplies ``fate_base`` used to locate the code tree
    error_log_file: path of the log file where failing tests' output is appended
    """

    def error_log(stdout):
        # called with None, acts as a getter returning the log's absolute path;
        # otherwise appends the given text to the log file
        if stdout is None:
            return os.path.abspath(error_log_file)
        with open(error_log_file, "a") as f:
            f.write(stdout)

    def run_test(file):
        # NOTE: intentionally shadows the enclosing function's name; only this
        # inner helper is called below
        global failed_count
        echo.echo("start to run test {}".format(file))
        try:
            subp = subprocess.Popen(["python", file],
                                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            stdout, stderr = subp.communicate()
            stdout = stdout.decode("utf-8")
            echo.echo(stdout)
            # unittest prints "FAILED" when any case fails
            if "FAILED" in stdout:
                failed_count += 1
                error_log(stdout=f"error sequence {failed_count}: {file}")
                error_log(stdout=stdout)
        except Exception:
            return

    def traverse_folder(file_fullname):
        # depth-first walk: run any *_test.py found under a test/tests directory,
        # skipping ftl tests
        if os.path.isfile(file_fullname):
            if "_test.py" in file_fullname and "ftl" not in file_fullname:
                run_test(file_fullname)
        else:
            for file in os.listdir(file_fullname):
                file_fullname_new = os.path.join(file_fullname, file)
                if os.path.isdir(file_fullname_new):
                    traverse_folder(file_fullname_new)
                if "_test.py" in file and ("/test" in file_fullname or "tests" in file_fullname):
                    if "ftl" in file_fullname_new:
                        continue
                    else:
                        run_test(file_fullname_new)

    global failed_count
    failed_count = 0
    fate_base = conf.fate_base
    ml_dir = os.path.join(fate_base, "python/federatedml")
    # make the FATE python packages importable by the spawned test processes
    PYTHONPATH = os.environ.get('PYTHONPATH') + ":" + os.path.join(fate_base, "python")
    os.environ['PYTHONPATH'] = PYTHONPATH
    if len(includes) == 0:
        traverse_folder(ml_dir)
    else:
        ml_dir = includes
        for v in ml_dir:
            traverse_folder(os.path.abspath(v))

    echo.echo(f"there are {failed_count} failed test")
    if failed_count > 0:
        print('Please check the error content: {}'.format(error_log(None)))
+# +import time +import uuid +from datetime import timedelta + +import click +from fate_test._client import Clients +from fate_test._config import Config +from fate_test._io import LOGGER, echo +from fate_test._parser import Testsuite, non_success_summary +from fate_test.scripts._options import SharedOptions +from fate_test.scripts._utils import _load_testsuites, _upload_data, _delete_data, _load_module_from_script + +from fate_test import _config + +""" +@click.option('-uj', '--update-job-parameters', default="{}", type=JSON_STRING, + help="a json string represents mapping for replacing fields in conf.job_parameters") +@click.option('-uc', '--update-component-parameters', default="{}", type=JSON_STRING, + help="a json string represents mapping for replacing fields in conf.component_parameters") +@click.option('-m', '--timeout', type=int, default=3600, help="maximun running time of job") +@click.option('-p', '--task-cores', type=int, help="processors per node") +""" + + +@click.command("suite") +@click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, metavar="", + help="include *testsuite.json under these paths") +@click.option('-e', '--exclude', type=click.Path(exists=True), multiple=True, + help="exclude *testsuite.json under these paths") +@click.option("-g", '--glob', type=str, + help="glob string to filter sub-directory of path specified by ") +@click.option("--skip-jobs", is_flag=True, default=False, + help="skip pipeline jobs defined in testsuite") +@click.option("--skip-data", is_flag=True, default=False, + help="skip uploading data specified in testsuite") +@click.option("--data-only", is_flag=True, default=False, + help="upload data only") +@click.option("--provider", type=str, + help="Select the fate version, for example: fate@2.0-beta") +@click.option("--disable-clean-data", "clean_data", flag_value=False, default=None) +@click.option("--enable-clean-data", "clean_data", flag_value=True, default=None) 
+@SharedOptions.get_shared_options(hidden=True) +@click.pass_context +def run_suite(ctx, include, exclude, glob, + skip_jobs, skip_data, data_only, clean_data, provider, **kwargs): + """ + process testsuite + """ + ctx.obj.update(**kwargs) + ctx.obj.post_process() + config_inst = ctx.obj["config"] + """if ctx.obj["extend_sid"] is not None: + config_inst.extend_sid = ctx.obj["extend_sid"] + if ctx.obj["auto_increasing_sid"] is not None: + config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" + if clean_data is None: + clean_data = config_inst.clean_data + namespace = ctx.obj["namespace"] + yes = ctx.obj["yes"] + data_namespace_mangling = ctx.obj["namespace_mangling"] + # prepare output dir and json hooks + # _add_replace_hook(replace) + echo.welcome() + echo.echo(f"testsuite namespace: {namespace}", fg='red') + echo.echo("loading testsuites:") + suites = _load_testsuites(includes=include, excludes=exclude, glob=glob, provider=provider) + for suite in suites: + _config.jobs_num += len(suite.pipeline_jobs) + echo.echo(f"\tdataset({len(suite.dataset)}) " + f"pipeline jobs ({len(suite.pipeline_jobs)}) {suite.path}") + if not yes and not click.confirm("running?"): + return + + echo.stdout_newline() + with Clients(config_inst) as client: + for i, suite in enumerate(suites): + # noinspection PyBroadException + try: + start = time.time() + echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') + if not skip_data and config_inst.work_mode: + try: + _upload_data(client, suite, config_inst) + except Exception as e: + raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e + if data_only: + continue + + if not skip_jobs: + try: + time_consuming = _run_pipeline_jobs(config_inst, suite, namespace, data_namespace_mangling) + except Exception as e: + raise RuntimeError(f"exception occur while running pipeline jobs for {suite.path}") from e + + if not skip_data and clean_data and 
def _run_pipeline_jobs(config: Config, suite: Testsuite, namespace: str, data_namespace_mangling: bool):
    """Execute each pipeline job in *suite* and return per-job elapsed times as "<seconds>s" strings.

    Jobs that raise are recorded via the suite's status tracking and skipped;
    only successful jobs contribute an entry to the returned list.
    """
    total = len(suite.pipeline_jobs)
    elapsed = []
    for idx, pipeline_job in enumerate(suite.pipeline_jobs):
        echo.echo(f"Running [{idx + 1}/{total}] job: {pipeline_job.job_name}")

        job_name = pipeline_job.job_name
        script_path = pipeline_job.script_path

        def _raise(err_msg, status="failed"):
            # record the failure under a unique id so it can be located in logs
            exception_id = str(uuid.uuid1())
            suite.update_status(job_name=job_name, exception_id=exception_id, status=status)
            echo.file(f"exception({exception_id}), error message:\n{err_msg}")

        mod = _load_module_from_script(script_path)
        start = time.time()
        try:
            # the namespace suffix is only passed when namespace mangling is enabled
            main_kwargs = {"config": config}
            if data_namespace_mangling:
                main_kwargs["namespace"] = f"_{namespace}"
            try:
                mod.main(**main_kwargs)
                suite.update_status(job_name=job_name, status="success")
                elapsed.append(time.time() - start)
            except Exception as e:
                _raise(e)
                continue
        except Exception as e:
            _raise(e, status="not submitted")
            continue

    return [str(int(t)) + "s" for t in elapsed]
def _get_common_metrics(**results):
    """Return the metric names present in every result dict, excluding the script-metrics key.

    Parameters
    ----------
    results: mapping of model name -> dict of metric name -> value

    Returns
    -------
    list of metric names shared by all result dicts ([] when *results* is empty)
    """
    common_metrics = None
    for result in results.values():
        if common_metrics is None:
            common_metrics = set(result.keys())
        else:
            common_metrics &= result.keys()
    # no results at all: nothing in common (the original raised
    # "TypeError: argument of type 'NoneType' is not iterable" here)
    if common_metrics is None:
        return []
    # script-level metrics are compared separately, so drop the aggregate key
    common_metrics.discard(SCRIPT_METRICS)
    return list(common_metrics)
def evaluate_almost_equal(metrics, results, abs_tol=None, rel_tol=None):
    """
    Evaluate for each given metric whether values across models are almost equal.

    Parameters
    ----------
    metrics: List[str], metric names; values in *results* are aligned positionally
    results: dict, model name -> list of metric values
    abs_tol: float, absolute error tolerance (forwarded to math.isclose)
    rel_tol: float, relative difference tolerance (forwarded to math.isclose)

    Returns
    -------
    (dict, bool): per-metric match flags, and True iff every metric matched
        (the original docstring claimed a bare bool, but callers tuple-unpack)
    """
    # empty metric list: return an unpackable (summary, all_match) pair —
    # the original returned bare False here, which broke tuple-unpacking callers
    if len(metrics) == 0:
        return {}, False
    eval_summary = {}
    for i, metric in enumerate(metrics):
        v_eval = [res[i] for res in results.values()]
        first_v = v_eval[0]
        if metric == SCRIPT_METRICS:
            # script-level metrics are compared elsewhere
            continue
        if abs_tol is not None and rel_tol is not None:
            eval_summary[metric] = all(math.isclose(v, first_v, abs_tol=abs_tol, rel_tol=rel_tol) for v in v_eval)
        elif abs_tol is not None:
            eval_summary[metric] = all(math.isclose(v, first_v, abs_tol=abs_tol) for v in v_eval)
        elif rel_tol is not None:
            eval_summary[metric] = all(math.isclose(v, first_v, rel_tol=rel_tol) for v in v_eval)
        else:
            eval_summary[metric] = all(math.isclose(v, first_v) for v in v_eval)
    all_match = all(eval_summary.values())
    return eval_summary, all_match
def match_script_metrics(abs_tol, rel_tol, match_details, **results):
    """Print a per-script summary table of script-level metrics and run the match evaluation."""
    script_groups = _filter_results([SCRIPT_METRICS], **results)
    for script, results_pair in script_groups.items():
        metric_results = results_pair[0]
        shared_metrics = _get_common_metrics(**metric_results)
        comparable = _filter_results(shared_metrics, **metric_results)
        summary = PrettyTable()
        summary.set_style(ORGMODE)
        summary.field_names = ["Script Model Name"] + shared_metrics
        for model_name, values in comparable.items():
            styled_values = [f"{TxtStyle.FIELD_VAL}{v}{TxtStyle.END}" for v in values]
            summary.add_row([f"{model_name}-{script}"] + styled_values)
        echo.echo(summary.get_string(title=f"{TxtStyle.TITLE}{script} Script Metrics Summary{TxtStyle.END}"))
        _all_match(shared_metrics, comparable, abs_tol, rel_tol, script, match_details=match_details)
def _match_error(metrics, results):
    """Compute styled max absolute and max relative error strings, one per metric.

    Parameters
    ----------
    metrics: list of metric names (used for positional alignment only)
    results: dict, model name -> list of metric values aligned with *metrics*

    Returns
    -------
    (list, list): max-relative-error strings and max-absolute-error strings
    """
    relative_error_list = []
    absolute_error_list = []
    # empty metric list: return an unpackable pair of empty lists — the original
    # returned bare False here, which broke callers doing `rel, abs_ = _match_error(...)`
    if len(metrics) == 0:
        return relative_error_list, absolute_error_list
    for i, _metric in enumerate(metrics):
        v_eval = [res[i] for res in results.values()]
        spread = abs(max(v_eval) - min(v_eval))
        absolute_error_list.append(f"{TxtStyle.FIELD_VAL}{spread}{TxtStyle.END}")
        # guard the original ZeroDivisionError when the largest value is exactly 0;
        # the relative error is reported as 0.0 in that case — TODO confirm policy
        denominator = max(v_eval)
        relative = abs(spread / denominator) if denominator else 0.0
        relative_error_list.append(f"{TxtStyle.FIELD_VAL}{relative}{TxtStyle.END}")
    return relative_error_list, absolute_error_list
os.path.exists(history_info_dir), f"Please check the {history_info_dir} Is it deleted" + with open(history_info_dir, 'r') as f: + benchmark_quality = json.load(f, object_hook=dict) + regression_metric = {} + regression_quality = {} + class_quality = {} + for history_tag in history_tags: + for tag in benchmark_quality: + if '_'.join(tag.split("_")[2:]) == history_tag and SCRIPT_METRICS in results["FATE"]: + regression_metric[tag] = regression_group(benchmark_quality[tag]['FATE']) + for key, value in _filter_results([SCRIPT_METRICS], **benchmark_quality[tag])['FATE'][0].items(): + regression_quality["_".join([tag, key])] = value + elif '_'.join(tag.split("_")[2:]) == history_tag and DISTRIBUTION_METRICS in results["FATE"]: + class_quality[tag] = class_group(benchmark_quality[tag]['FATE']) + + if SCRIPT_METRICS in results["FATE"] and regression_metric: + regression_metric[group_name] = regression_group(results['FATE']) + metric_compare(abs_tol, rel_tol, match_details, **regression_metric) + for key, value in _filter_results([SCRIPT_METRICS], **results)['FATE'][0].items(): + regression_quality["_".join([group_name, key])] = value + metric_compare(abs_tol, rel_tol, match_details, **regression_quality) + echo.echo("\n" + "#" * 60) + elif DISTRIBUTION_METRICS in results["FATE"] and class_quality: + + class_quality[group_name] = class_group(results['FATE']) + metric_compare(abs_tol, rel_tol, match_details, **class_quality) + echo.echo("\n" + "#" * 60) + + +def metric_compare(abs_tol, rel_tol, match_details, **metric_results): + common_metrics = _get_common_metrics(**metric_results) + filtered_results = _filter_results(common_metrics, **metric_results) + table = PrettyTable() + table.set_style(ORGMODE) + script_model_names = list(filtered_results.keys()) + table.field_names = ["Script Model Name"] + common_metrics + for script_model_name in script_model_names: + table.add_row([f"{script_model_name}"] + + [f"{TxtStyle.FIELD_VAL}{v}{TxtStyle.END}" for v in 
filtered_results[script_model_name]]) + print( + table.get_string(title=f"{TxtStyle.TITLE}Comparison results of all metrics of Script Model FATE{TxtStyle.END}")) + _all_match(common_metrics, filtered_results, abs_tol, rel_tol, match_details=match_details) + + +def _save_quality(storage_tag, cache_directory, **results): + save_dir = "/".join([os.path.join(os.path.abspath(cache_directory), 'benchmark_history', "benchmark_quality.json")]) + os.makedirs(os.path.dirname(save_dir), exist_ok=True) + if os.path.exists(save_dir): + with open(save_dir, 'r') as f: + benchmark_quality = json.load(f, object_hook=dict) + else: + benchmark_quality = {} + if storage_tag in benchmark_quality: + print("This tag already exists in the history and will be updated to the record information.") + benchmark_quality.update({storage_tag: results}) + try: + with open(save_dir, 'w') as fp: + json.dump(benchmark_quality, fp, indent=2) + print("Storage success, please check: ", save_dir) + except Exception: + print("Storage failed, please check: ", save_dir) + + +def parse_summary_result(rs_dict): + for model_key in rs_dict: + rs_content = rs_dict[model_key] + if 'validate' in rs_content: + return rs_content['validate'] + else: + return rs_content['train'] + + +def extract_data(df, col_name, convert_float=True, keep_id=False): + """ + component output data to numpy array + Parameters + ---------- + df: dataframe + col_name: column to extract + convert_float: whether to convert extracted value to float value + keep_id: whether to keep id + Returns + ------- + array of extracted data, optionally with id + """ + if keep_id: + if convert_float: + df[col_name] = df[col_name].to_numpy().astype(np.float64) + + return df[[df.columns[0], col_name]].to_numpy() + else: + return df[col_name].to_numpy().astype(np.float64) diff --git a/python/fate_test/pyproject.toml b/python/fate_test/pyproject.toml new file mode 100644 index 0000000000..2f4dbe9f7f --- /dev/null +++ b/python/fate_test/pyproject.toml @@ -0,0 
+1,44 @@ +[tool.poetry] +name = "fate_test" +version = "2.0.0-beta" +description = "test tools for FATE" +authors = ["FederatedAI "] +license = "Apache-2.0" + +homepage = "https://fate.fedai.org/" +repository = "https://github.com/FederatedAI/FATE" +documentation = "https://fate.readthedocs.io/en/latest/?badge=latest" +keywords = ["FATE", "Federated Learning", "Testsuite"] + +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Topic :: Software Development :: Testing", + "Intended Audience :: Developers", + "Intended Audience :: Education" +] + +packages = [ + { include = "fate_test" } +] + +[tool.poetry.dependencies] +python = "^3.8" +requests_toolbelt = "^0.9.1" +requests = "^2.24.0" +click = "^7.1.2" +"ruamel.yaml" = "^0.16.10" +loguru = ">=0.6.0" +prettytable = "^1.0.0" +sshtunnel = "^0.1.5" +pandas = ">=1.1.5" +colorama = "^0.4.4" + +[tool.poetry.dev-dependencies] + +[tool.poetry.scripts] +fate_test = "fate_test.scripts.cli:cli" + +[build-system] +requires = ["poetry>=0.12", "setuptools>=50.0,<51.0"] +build-backend = "poetry.masonry.api" \ No newline at end of file diff --git a/python/fate_test/setup.py b/python/fate_test/setup.py new file mode 100644 index 0000000000..98898dbb41 --- /dev/null +++ b/python/fate_test/setup.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +from setuptools import setup + +packages = ["fate_test", "fate_test.scripts"] + +package_data = {"": ["*"]} + +install_requires = [ + "click>=7.1.2,<8.0.0", + "loguru>=0.6.0", + "pandas>=1.1.5", + "poetry>=0.12", + "prettytable>=1.0.0,<2.0.0", + # "requests>=2.24.0,<3.0.0", + # "requests_toolbelt>=0.9.1,<0.10.0", + "ruamel.yaml>=0.16.10,<0.17.0", + # "sshtunnel>=0.1.5,<0.2.0", + 'colorama>=0.4.4' +] + +entry_points = {"console_scripts": ["fate_test = fate_test.scripts.cli:cli"]} + +setup_kwargs = { + "name": "fate-test", + "version": "2.0.0-beta", + "description": "test tools for FATE", + "long_description": 'FATE Test\n=========\n\nA collection of 
useful tools to running FATE\'s test.\n\n.. image:: images/tutorial.gif\n :align: center\n :alt: tutorial\n\nquick start\n-----------\n\n1. (optional) create virtual env\n\n .. code-block:: bash\n\n python -m venv venv\n source venv/bin/activate\n pip install -U pip\n\n\n2. install fate_test\n\n .. code-block:: bash\n\n pip install fate_test\n fate_test --help\n\n\n3. edit default fate_test_config.yaml\n\n .. code-block:: bash\n\n # edit priority config file with system default editor\n # filling some field according to comments\n fate_test config edit\n\n4. configure FATE-Pipeline and FATE-Flow Commandline server setting\n\n.. code-block:: bash\n\n # configure FATE-Pipeline server setting\n pipeline init --port 9380 --ip 127.0.0.1\n # configure FATE-Flow Commandline server setting\n flow init --port 9380 --ip 127.0.0.1\n\n5. run some fate_test suite\n\n .. code-block:: bash\n\n fate_test suite -i \n\n\n6. run some fate_test benchmark\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i \n\n7. useful logs or exception will be saved to logs dir with namespace shown in last step\n\ndevelop install\n---------------\nIt is more convenient to use the editable mode during development: replace step 2 with flowing steps\n\n.. code-block:: bash\n\n pip install -e ${FATE}/python/fate_client && pip install -e ${FATE}/python/fate_test\n\n\n\ncommand types\n-------------\n\n- suite: used for running testsuites, collection of FATE jobs\n\n .. code-block:: bash\n\n fate_test suite -i \n\n\n- benchmark-quality used for comparing modeling quality between FATE and other machine learning systems\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i \n\n\n\nconfiguration by examples\n--------------------------\n\n1. no need ssh tunnel:\n\n - 9999, service: service_a\n - 10000, service: service_b\n\n and both service_a, service_b can be requested directly:\n\n .. 
code-block:: yaml\n\n work_mode: 1 # 0 for standalone, 1 for cluster\n data_base_dir: \n parties:\n guest: [10000]\n host: [9999, 10000]\n arbiter: [9999]\n services:\n - flow_services:\n - {address: service_a, parties: [9999]}\n - {address: service_b, parties: [10000]}\n\n2. need ssh tunnel:\n\n - 9999, service: service_a\n - 10000, service: service_b\n\n service_a, can be requested directly while service_b don\'t,\n but you can request service_b in other node, say B:\n\n .. code-block:: yaml\n\n work_mode: 0 # 0 for standalone, 1 for cluster\n data_base_dir: \n parties:\n guest: [10000]\n host: [9999, 10000]\n arbiter: [9999]\n services:\n - flow_services:\n - {address: service_a, parties: [9999]}\n - flow_services:\n - {address: service_b, parties: [10000]}\n ssh_tunnel: # optional\n enable: true\n ssh_address: :\n ssh_username: \n ssh_password: # optional\n ssh_priv_key: "~/.ssh/id_rsa"\n\n\nTestsuite\n---------\n\nTestsuite is used for running a collection of jobs in sequence. Data used for jobs could be uploaded before jobs are\nsubmitted, and are cleaned when jobs finished. This tool is useful for FATE\'s release test.\n\ncommand options\n~~~~~~~~~~~~~~~\n\n.. code-block:: bash\n\n fate_test suite --help\n\n1. include:\n\n .. code-block:: bash\n\n fate_test suite -i \n\n will run testsuites in *path1*\n\n2. exclude:\n\n .. code-block:: bash\n\n fate_test suite -i -e -e ...\n\n will run testsuites in *path1* but not in *path2* and *path3*\n\n3. glob:\n\n .. code-block:: bash\n\n fate_test suite -i -g "hetero*"\n\n will run testsuites in sub directory start with *hetero* of *path1*\n\n4. replace:\n\n .. code-block:: bash\n\n fate_test suite -i -r \'{"maxIter": 5}\'\n\n will find all key-value pair with key "maxIter" in `data conf` or `conf` or `dsl` and replace the value with 5\n\n\n5. skip-data:\n\n .. code-block:: bash\n\n fate_test suite -i --skip-data\n\n will run testsuites in *path1* without uploading data specified in *benchmark.json*.\n\n\n6. 
yes:\n\n .. code-block:: bash\n\n fate_test suite -i --yes\n\n will run testsuites in *path1* directly, skipping double check\n\n7. skip-dsl-jobs:\n\n .. code-block:: bash\n\n fate_test suite -i --skip-dsl-jobs\n\n will run testsuites in *path1* but skip all *tasks* in testsuites. It\'s would be useful when only pipeline tasks needed.\n\n8. skip-pipeline-jobs:\n\n .. code-block:: bash\n\n fate_test suite -i --skip-pipeline-jobs\n\n will run testsuites in *path1* but skip all *pipeline tasks* in testsuites. It\'s would be useful when only dsl tasks needed.\n\n\nBenchmark Quality\n------------------\n\nBenchmark-quality is used for comparing modeling quality between FATE\nand other machine learning systems. Benchmark produces a metrics comparison\nsummary for each benchmark job group.\n\n.. code-block:: bash\n\n fate_test benchmark-quality -i examples/benchmark_quality/hetero_linear_regression\n\n.. code-block:: bash\n\n +-------+--------------------------------------------------------------+\n | Data | Name |\n +-------+--------------------------------------------------------------+\n | train | {\'guest\': \'motor_hetero_guest\', \'host\': \'motor_hetero_host\'} |\n | test | {\'guest\': \'motor_hetero_guest\', \'host\': \'motor_hetero_host\'} |\n +-------+--------------------------------------------------------------+\n +------------------------------------+--------------------+--------------------+-------------------------+---------------------+\n | Model Name | explained_variance | r2_score | root_mean_squared_error | mean_squared_error |\n +------------------------------------+--------------------+--------------------+-------------------------+---------------------+\n | local-linear_regression-regression | 0.9035168452250094 | 0.9035070863155368 | 0.31340413289880553 | 0.09822215051805216 |\n | FATE-linear_regression-regression | 0.903146386539082 | 0.9031411831961411 | 0.3139977881119483 | 0.09859461093919596 |\n 
+------------------------------------+--------------------+--------------------+-------------------------+---------------------+\n +-------------------------+-----------+\n | Metric | All Match |\n +-------------------------+-----------+\n | explained_variance | True |\n | r2_score | True |\n | root_mean_squared_error | True |\n | mean_squared_error | True |\n +-------------------------+-----------+\n\ncommand options\n~~~~~~~~~~~~~~~\n\nuse the following command to show help message\n\n.. code-block:: bash\n\n fate_test benchmark-quality --help\n\n1. include:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i \n\n will run benchmark testsuites in *path1*\n\n2. exclude:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i -e -e ...\n\n will run benchmark testsuites in *path1* but not in *path2* and *path3*\n\n3. glob:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i -g "hetero*"\n\n will run benchmark testsuites in sub directory start with *hetero* of *path1*\n\n4. tol:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i -t 1e-3\n\n will run benchmark testsuites in *path1* with absolute tolerance of difference between metrics set to 0.001.\n If absolute difference between metrics is smaller than *tol*, then metrics are considered\n almost equal. Check benchmark testsuite `writing guide <#benchmark-testsuite>`_ on setting alternative tolerance.\n\n5. skip-data:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i --skip-data\n\n will run benchmark testsuites in *path1* without uploading data specified in *benchmark.json*.\n\n\n6. yes:\n\n .. code-block:: bash\n\n fate_test benchmark-quality -i --yes\n\n will run benchmark testsuites in *path1* directly, skipping double check\n\n\nbenchmark testsuite\n~~~~~~~~~~~~~~~~~~~\n\nConfiguration of jobs should be specified in a benchmark testsuite whose file name ends\nwith "\\*benchmark.json". 
For benchmark testsuite example,\nplease refer `here <../../examples/benchmark_quality>`_.\n\nA benchmark testsuite includes the following elements:\n\n- data: list of local data to be uploaded before running FATE jobs\n\n - file: path to original data file to be uploaded, should be relative to testsuite or FATE installation path\n - head: whether file includes header\n - partition: number of partition for data storage\n - table_name: table name in storage\n - namespace: table namespace in storage\n - role: which role to upload the data, as specified in fate_test.config;\n naming format is: "{role_type}_{role_index}", index starts at 0\n\n .. code-block:: json\n\n "data": [\n {\n "file": "examples/data/motor_hetero_host.csv",\n "head": 1,\n "partition": 8,\n "table_name": "motor_hetero_host",\n "namespace": "experiment",\n "role": "host_0"\n }\n ]\n\n- job group: each group includes arbitrary number of jobs with paths to corresponding script and configuration\n\n - job: name of job to be run, must be unique within each group list\n\n - script: path to `testing script <#testing-script>`_, should be relative to testsuite\n - conf: path to job configuration file for script, should be relative to testsuite\n\n .. code-block:: json\n\n "local": {\n "script": "./local-linr.py",\n "conf": "./linr_config.yaml"\n }\n\n - compare_setting: additional setting for quality metrics comparison, currently only takes ``relative_tol``\n\n If metrics *a* and *b* satisfy *abs(a-b) <= max(relative_tol \\* max(abs(a), abs(b)), absolute_tol)*\n (from `math module `_),\n they are considered almost equal. In the below example, metrics from "local" and "FATE" jobs are\n considered almost equal if their relative difference is smaller than\n *0.05 \\* max(abs(local_metric), abs(pipeline_metric)*.\n\n .. 
code-block:: json\n\n "linear_regression-regression": {\n "local": {\n "script": "./local-linr.py",\n "conf": "./linr_config.yaml"\n },\n "FATE": {\n "script": "./fate-linr.py",\n "conf": "./linr_config.yaml"\n },\n "compare_setting": {\n "relative_tol": 0.01\n }\n }\n\n\ntesting script\n~~~~~~~~~~~~~~\n\nAll job scripts need to have ``Main`` function as an entry point for executing jobs; scripts should\nreturn two dictionaries: first with data information key-value pairs: {data_type}: {data_name_dictionary};\nthe second contains {metric_name}: {metric_value} key-value pairs for metric comparison.\n\nBy default, the final data summary shows the output from the job named "FATE"; if no such job exists,\ndata information returned by the first job is shown. For clear presentation, we suggest that user follow\nthis general `guideline <../../examples/data/README.md#data-set-naming-rule>`_ for data set naming. In the case of multi-host\ntask, consider numbering host as such:\n\n::\n\n {\'guest\': \'default_credit_homo_guest\',\n \'host_1\': \'default_credit_homo_host_1\',\n \'host_2\': \'default_credit_homo_host_2\'}\n\nReturned quality metrics of the same key are to be compared.\nNote that only **real-value** metrics can be compared.\n\n- FATE script: ``Main`` always has three inputs:\n\n - config: job configuration, `JobConfig <../fate_client/pipeline/utils/tools.py#L64>`_ object loaded from "fate_test_config.yaml"\n - param: job parameter setting, dictionary loaded from "conf" file specified in benchmark testsuite\n - namespace: namespace suffix, user-given *namespace* or generated timestamp string when using *namespace-mangling*\n\n- non-FATE script: ``Main`` always has one input:\n\n - param: job parameter setting, dictionary loaded from "conf" file specified in benchmark testsuite\n\n\ndata\n----\n\n`Data` sub-command is used for upload or delete dataset in suite\'s.\n\ncommand options\n~~~~~~~~~~~~~~~\n\n.. code-block:: bash\n\n fate_test data --help\n\n1. 
include:\n\n .. code-block:: bash\n\n fate_test data [upload|delete] -i \n\n will upload/delete dataset in testsuites in *path1*\n\n2. exclude:\n\n .. code-block:: bash\n\n fate_test data [upload|delete] -i -e -e ...\n\n will upload/delete dataset in testsuites in *path1* but not in *path2* and *path3*\n\n3. glob:\n\n .. code-block:: bash\n\n fate_test data [upload|delete] -i -g "hetero*"\n\n will upload/delete dataset in testsuites in sub directory start with *hetero* of *path1*\n\n\nfull command options\n---------------------\n\n.. click:: fate_test.scripts.cli:cli\n :prog: fate_test\n :show-nested:\n', + "author": "FederatedAI", + "author_email": "contact@FedAI.org", + "maintainer": None, + "maintainer_email": None, + "url": "https://fate.fedai.org/", + "packages": packages, + "package_data": package_data, + "install_requires": install_requires, + "entry_points": entry_points, + "python_requires": ">=3.6,<4.0", +} + +setup(**setup_kwargs) From bf5d579f13883622e5cd6f4d3f5eaefa9471439f Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 27 Jul 2023 10:21:16 +0800 Subject: [PATCH 02/30] edit fate_test(#5008) Signed-off-by: Yu Wu --- python/fate_test/fate_test/_client.py | 48 ++++++------------- python/fate_test/fate_test/_config.py | 22 ++++----- python/fate_test/fate_test/_flow_client.py | 34 ++++++++----- python/fate_test/fate_test/_parser.py | 36 +++++++------- .../fate_test/scripts/performance_cli.py | 16 ++----- 5 files changed, 70 insertions(+), 86 deletions(-) diff --git a/python/fate_test/fate_test/_client.py b/python/fate_test/fate_test/_client.py index 84d623c4c3..d0abbb318e 100644 --- a/python/fate_test/fate_test/_client.py +++ b/python/fate_test/fate_test/_client.py @@ -14,61 +14,41 @@ # limitations under the License. 
# -import sshtunnel - from fate_test._flow_client import FLOWClient -from fate_test._io import LOGGER from fate_test._parser import Config class Clients(object): def __init__(self, config: Config): self._flow_clients = {} - self._tunnel_id_to_flow_clients = {} + # self._tunnel_id_to_flow_clients = {} self._role_str_to_service_id = {} - self._tunnel_id_to_tunnel = config.tunnel_id_to_tunnel + self._service_id_to_role_str = {} + self._service_id_to_party = {} + # self._tunnel_id_to_tunnel = config.tunnel_id_to_tunnel + for party, service_id in config.party_to_service_id.items(): + for role_str in config.parties.party_to_role_string(party): + self._role_str_to_service_id[role_str] = service_id + self._service_id_to_role_str[service_id] = role_str + self._service_id_to_party[service_id] = party for service_id, service in config.service_id_to_service.items(): if isinstance(service, Config.service): + role = self._service_id_to_role_str[service_id].split("_")[0] + party = self._service_id_to_party[service_id] self._flow_clients[service_id] = FLOWClient( - service.address, config.data_base_dir, config.cache_directory) + service.address, config.data_base_dir, config.cache_directory, role, party) - elif isinstance(service, Config.tunnel_service): + """elif isinstance(service, Config.tunnel_service): self._flow_clients[service_id] = FLOWClient(None, config.data_base_dir, config.cache_directory) self._tunnel_id_to_flow_clients.setdefault(service.tunnel_id, []).append( - (service.index, self._flow_clients[service_id])) - - for party, service_id in config.party_to_service_id.items(): - for role_str in config.parties.party_to_role_string(party): - self._role_str_to_service_id[role_str] = service_id + (service.index, self._flow_clients[service_id]))""" def __getitem__(self, role_str: str) -> 'FLOWClient': if role_str not in self._role_str_to_service_id: raise RuntimeError(f"no flow client found binding to {role_str}") return 
self._flow_clients[self._role_str_to_service_id[role_str]] - def __enter__(self): - # open ssh tunnels and create flow clients for remote - self._tunnels = [] - for tunnel_id, tunnel_conf in self._tunnel_id_to_tunnel.items(): - tunnel = sshtunnel.SSHTunnelForwarder(ssh_address_or_host=tunnel_conf.ssh_address, - ssh_username=tunnel_conf.ssh_username, - ssh_password=tunnel_conf.ssh_password, - ssh_pkey=tunnel_conf.ssh_priv_key, - remote_bind_addresses=tunnel_conf.services_address) - tunnel.start() - self._tunnels.append(tunnel) - for index, flow_client in self._tunnel_id_to_flow_clients[tunnel_id]: - flow_client.set_address(f"127.0.0.1:{tunnel.local_bind_ports[index]}") - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - for tunnel in self._tunnels: - try: - tunnel.stop() - except Exception as e: - LOGGER.exception(e) - def contains(self, role_str): return role_str in self._role_str_to_service_id diff --git a/python/fate_test/fate_test/_config.py b/python/fate_test/fate_test/_config.py index 7b26b69c3c..1d8cf8db56 100644 --- a/python/fate_test/fate_test/_config.py +++ b/python/fate_test/fate_test/_config.py @@ -49,9 +49,6 @@ # whether to delete data in suites after all jobs done clean_data: true -# work mode: 0 for standalone, 1 for cluster -work_mode: 0 - # participating parties' id and correponding flow service ip & port information parties: guest: [9999] @@ -184,14 +181,15 @@ def __init__(self, config): self.tunnel_id_to_tunnel = {} self.extend_sid = None self.auto_increasing_sid = None - self.work_mode = config.get("work_mode", 0) + # self.work_mode = config.get("work_mode", 0) tunnel_id = 0 service_id = 0 os.makedirs(os.path.dirname(self.cache_directory), exist_ok=True) for service_config in config["services"]: flow_services = service_config["flow_services"] - if service_config.get("ssh_tunnel", {}).get("enable", False): + # @todo: rm ssh tunnel; add host flow services + """if service_config.get("ssh_tunnel", {}).get("enable", False): tunnel_id += 
1 services_address = [] for index, flow_service in enumerate(flow_services): @@ -209,13 +207,13 @@ def __init__(self, config): tunnel_config["ssh_password"], tunnel_config["ssh_priv_key"], services_address) - else: - for flow_service in flow_services: - service_id += 1 - address = flow_service["address"] - self.service_id_to_service[service_id] = self.service(address) - for party in flow_service["parties"]: - self.party_to_service_id[party] = service_id + else:""" + for flow_service in flow_services: + service_id += 1 + address = flow_service["address"] + self.service_id_to_service[service_id] = self.service(address) + for party in flow_service["parties"]: + self.party_to_service_id[party] = service_id @staticmethod def load(path: typing.Union[str, Path], **kwargs): diff --git a/python/fate_test/fate_test/_flow_client.py b/python/fate_test/fate_test/_flow_client.py index 0cfafb9d8e..280aac5323 100644 --- a/python/fate_test/fate_test/_flow_client.py +++ b/python/fate_test/fate_test/_flow_client.py @@ -71,12 +71,12 @@ def delete_data(self, data: Data): except Exception as e: raise RuntimeError(f"delete data failed") from e - def output_data_table(self, job_id, role, party_id, component_name): + """def output_data_table(self, job_id, role, party_id, component_name): result = self._output_data_table(job_id=job_id, role=role, party_id=party_id, component_name=component_name) - return result + return result""" - def table_info(self, table_name, namespace): - result = self._table_info(table_name=table_name, namespace=namespace) + def table_query(self, table_name, namespace): + result = self._table_query(table_name=table_name, namespace=namespace) return result def add_notes(self, job_id, role, party_id, notes): @@ -126,7 +126,7 @@ def _upload_data(self, data, output_path=None, verbose=0, destroy=1): partitions=data.partitions) return response - def _table_info(self, table_name, namespace): + """def _table_info(self, table_name, namespace): param = { 'table_name': 
table_name, 'namespace': namespace @@ -140,17 +140,25 @@ def _delete_data(self, table_name, namespace): 'namespace': namespace } response = self.flow_client(request='table/delete', param=param) + return response""" + + def _table_query(self, table_name, namespace): + response = self._client.table.query(namespace=namespace, table_name=table_name) + return response + + def _delete_data(self, table_name, namespace): + response = self._client.table.delete(namespace=namespace, table_name=table_name) return response - def _submit_job(self, conf, dsl): + """def _submit_job(self, conf, dsl): param = { 'job_dsl': self._save_json(dsl, 'submit_dsl.json'), 'job_runtime_conf': self._save_json(conf, 'submit_conf.json') } response = SubmitJobResponse(self.flow_client(request='job/submit', param=param)) - return response + return response""" - def _deploy_model(self, model_id, model_version, dsl=None): + """def _deploy_model(self, model_id, model_version, dsl=None): post_data = {'model_id': model_id, 'model_version': model_version, 'predict_dsl': dsl} @@ -166,9 +174,9 @@ def _deploy_model(self, model_id, model_version, dsl=None): except Exception as e: raise RuntimeError(f"deploy model error: {response}") from e - return result + return result""" - def _output_data_table(self, job_id, role, party_id, component_name): + """def _output_data_table(self, job_id, role, party_id, component_name): post_data = {'job_id': job_id, 'role': role, 'party_id': party_id, @@ -201,7 +209,7 @@ def _get_summary(self, job_id, role, party_id, component_name): result["summary_dir"] = retmsg # 获取summary文件位置 except Exception as e: raise RuntimeError(f"output data table error: {response}") from e - return result + return result""" """def _query_job(self, job_id, role): param = { @@ -269,7 +277,7 @@ def __repr__(self): return self.__str__() -"""class QueryJobResponse(object): +class QueryJobResponse(object): def __init__(self, response: dict): try: status = Status(response.get('data')[0]["f_status"]) @@ 
-277,7 +285,7 @@ def __init__(self, response: dict): except Exception as e: raise RuntimeError(f"query job error, response: {response}") from e self.status = status - self.progress = progress""" + self.progress = progress class UploadDataResponse(object): diff --git a/python/fate_test/fate_test/_parser.py b/python/fate_test/fate_test/_parser.py index 3ab001da29..fc1d832778 100644 --- a/python/fate_test/fate_test/_parser.py +++ b/python/fate_test/fate_test/_parser.py @@ -14,16 +14,16 @@ # limitations under the License. # -import json import typing from collections import deque from pathlib import Path -import click import prettytable from fate_test._config import Parties, Config from fate_test._io import echo from fate_test.utils import TxtStyle +# import json +from ruamel import yaml from fate_test import _config @@ -113,7 +113,8 @@ def as_dict(self): @staticmethod def load(path: Path): with path.open("r") as f: - kwargs = json.load(f, object_hook=CONF_JSON_HOOK.hook) + # kwargs = json.load(f, object_hook=CONF_JSON_HOOK.hook) + kwargs = yaml.safe_load(f) return JobConf(**kwargs) @property @@ -215,7 +216,8 @@ def __init__(self, components: dict, provider=None): @staticmethod def load(path: Path, provider): with path.open("r") as f: - kwargs = json.load(f, object_hook=DSL_JSON_HOOK.hook) + # kwargs = json.load(f, object_hook=DSL_JSON_HOOK.hook) + kwargs = yaml.safe_load(f) if provider is not None: kwargs["provider"] = provider return JobDSL(**kwargs) @@ -275,16 +277,16 @@ def load(cls, job_name, job_configs, base: Path, provider): job_name=job_name, job_conf=job_conf, job_dsl=job_dsl, pre_works=pre_works ) - @property + """@property def submit_params(self): return dict( conf=self.job_conf.as_dict(), dsl=self.job_dsl.as_dict() if self.job_dsl else None, - ) + )""" - def set_pre_work(self, name, **kwargs): + """def set_pre_work(self, name, **kwargs): self.job_conf.update_job_common_parameters(**kwargs) - self.job_conf.update_job_type("predict") + 
self.job_conf.update_job_type("predict")""" def set_input_data(self, hierarchys, table_info): for table_name, hierarchy in zip(table_info, hierarchys): @@ -337,7 +339,8 @@ def __init__( @staticmethod def load(path: Path, provider): with path.open("r") as f: - testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) + # testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) + testsuite_config = yaml.safe_load(f) dataset = [] for d in testsuite_config.get("data"): @@ -403,13 +406,13 @@ def pretty_final_summary(self, time_consuming, suite_file=None): def model_in_dep(self, name): return name in self._dependency - def get_dependent_jobs(self, name): - return self._dependency[name] + """def get_dependent_jobs(self, name): + return self._dependency[name]""" def remove_dependency(self, name): del self._dependency[name] - def feed_dep_info(self, job, name, model_info=None, table_info=None, cache_info=None, model_loader_info=None): + """def feed_dep_info(self, job, name, model_info=None, table_info=None, cache_info=None, model_loader_info=None): if model_info is not None: job.set_pre_work(name, **model_info) if table_info is not None: @@ -421,7 +424,7 @@ def feed_dep_info(self, job, name, model_info=None, table_info=None, cache_info= if name in job.pre_works: job.pre_works.remove(name) if job.is_submit_ready(): - self._ready_jobs.appendleft(job) + self._ready_jobs.appendleft(job)""" def reflash_configs(self, config: Config): failed = [] @@ -492,7 +495,8 @@ def __init__( @staticmethod def load(path: Path): with path.open("r") as f: - testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) + # testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) + testsuite_config = yaml.safe_load(f) dataset = [] for d in testsuite_config.get("data"): @@ -574,7 +578,7 @@ def _hook(d): return _hook -class JsonParamType(click.ParamType): +"""class JsonParamType(click.ParamType): name = "json_string" def convert(self, value, param, ctx): @@ -584,4 +588,4 @@ 
def convert(self, value, param, ctx): self.fail(f"{value} is not a valid json string", param, ctx) -JSON_STRING = JsonParamType() +JSON_STRING = JsonParamType()""" diff --git a/python/fate_test/fate_test/scripts/performance_cli.py b/python/fate_test/fate_test/scripts/performance_cli.py index 7fe0ca5627..338f66c868 100644 --- a/python/fate_test/fate_test/scripts/performance_cli.py +++ b/python/fate_test/fate_test/scripts/performance_cli.py @@ -23,9 +23,9 @@ import click from fate_test._client import Clients from fate_test._config import Config -from fate_test._flow_client import JobProgress, SubmitJobResponse, QueryJobResponse +from fate_test._flow_client import JobProgress, QueryJobResponse from fate_test._io import LOGGER, echo -from fate_test._parser import JSON_STRING, Testsuite +from fate_test._parser import Testsuite from fate_test.scripts._options import SharedOptions from fate_test.scripts._utils import _load_testsuites, _upload_data, _delete_data, _load_module_from_script, \ _add_replace_hook @@ -40,8 +40,6 @@ help="Select the job type, you can also set through include") @click.option('-i', '--include', type=click.Path(exists=True), multiple=True, metavar="", help="include *testsuite.json under these paths") -@click.option('-r', '--replace', default="{}", type=JSON_STRING, - help="a json string represents mapping for replacing fields in data/conf/dsl") @click.option('-m', '--timeout', type=int, default=3600, help="maximun running time of job") @click.option('-e', '--max-iter', type=int, help="When the algorithm model is LR, the number of iterations is set") @@ -49,10 +47,6 @@ help="When the algorithm model is SecureBoost, set the number of model layers") @click.option('-nt', '--num-trees', type=int, help="When the algorithm model is SecureBoost, set the number of trees") @click.option('-p', '--task-cores', type=int, help="processors per node") -@click.option('-uj', '--update-job-parameters', default="{}", type=JSON_STRING, - help="a json string represents 
mapping for replacing fields in conf.job_parameters") -@click.option('-uc', '--update-component-parameters', default="{}", type=JSON_STRING, - help="a json string represents mapping for replacing fields in conf.component_parameters") @click.option('-s', '--storage-tag', type=str, help="tag for storing performance time consuming, for future comparison") @click.option('-v', '--history-tag', type=str, multiple=True, @@ -187,11 +181,11 @@ def update_bar(n_step): update_bar(1) - def _call_back(resp: SubmitJobResponse): - if isinstance(resp, SubmitJobResponse): + def _call_back(resp): + """if isinstance(resp, SubmitJobResponse): job_progress.submitted(resp.job_id) echo.file(f"[jobs] {resp.job_id} ", nl=False) - suite.update_status(job_name=job.job_name, job_id=resp.job_id) + suite.update_status(job_name=job.job_name, job_id=resp.job_id)""" if isinstance(resp, QueryJobResponse): job_progress.running(resp.status, resp.progress) From 76f3b86e7c093769db57fbfd48b2705077ba031a Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 27 Jul 2023 19:17:06 +0800 Subject: [PATCH 03/30] fix fate-test testsuite(#5008) Signed-off-by: Yu Wu --- .../coordinated_lr_testsuite.yaml | 2 +- .../pipeline/coordinated_lr/test_lr_sid.py | 16 +- .../pipeline/coordinated_lr/test_lr_sid_cv.py | 8 +- .../coordinated_lr/test_lr_sid_warm_start.py | 8 +- python/fate_test/fate_test/_client.py | 12 +- python/fate_test/fate_test/_config.py | 21 -- python/fate_test/fate_test/_flow_client.py | 76 ++++-- python/fate_test/fate_test/_parser.py | 241 ++---------------- python/fate_test/fate_test/scripts/_utils.py | 19 +- .../fate_test/scripts/benchmark_cli.py | 48 ++-- .../fate_test/scripts/performance_cli.py | 64 ++--- .../fate_test/scripts/testsuite_cli.py | 66 ++--- 12 files changed, 189 insertions(+), 392 deletions(-) diff --git a/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml b/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml index 2de8a25b4f..029d8c6dfc 100644 --- 
a/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml +++ b/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml @@ -30,7 +30,7 @@ data: partitions: 4 head: true extend_sid: true - table_name: breast_hetero_host_sid + table_name: breast_hetero_host namespace: experiment role: host_0 tasks: diff --git a/examples/pipeline/coordinated_lr/test_lr_sid.py b/examples/pipeline/coordinated_lr/test_lr_sid.py index 9c7b31fb62..5fb0905ff1 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid.py +++ b/examples/pipeline/coordinated_lr/test_lr_sid.py @@ -33,10 +33,10 @@ def main(config="./config.yaml", namespace=""): pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", - namespace=f"{namespace}experiment")) - intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace=f"{namespace}experiment")) + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) lr_0 = CoordinatedLR("lr_0", epochs=4, batch_size=None, @@ -65,11 +65,11 @@ def main(config="./config.yaml", namespace=""): deployed_pipeline = pipeline.get_deployed_pipeline() deployed_pipeline.intersect_0.guest.component_setting( - input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", - namespace=f"{namespace}experiment")) + input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) deployed_pipeline.intersect_0.hosts[0].component_setting( - input_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace=f"{namespace}experiment")) + input_data=DataWarehouseChannel(name="breast_hetero_host", + 
namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/coordinated_lr/test_lr_sid_cv.py b/examples/pipeline/coordinated_lr/test_lr_sid_cv.py index badfed7a39..16ce51d4a7 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid_cv.py +++ b/examples/pipeline/coordinated_lr/test_lr_sid_cv.py @@ -31,10 +31,10 @@ def main(config="./config.yaml", namespace=""): pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", - namespace=f"{namespace}experiment")) - intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace=f"{namespace}experiment")) + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) lr_0 = CoordinatedLR("lr_0", epochs=2, batch_size=100, diff --git a/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py b/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py index b9bf8401ef..fb8090064d 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py +++ b/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py @@ -32,10 +32,10 @@ def main(config="./config.yaml", namespace=""): pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", - namespace=f"{namespace}experiment")) - intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace=f"{namespace}experiment")) + 
intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) lr_0 = CoordinatedLR("lr_0", epochs=4, batch_size=None, diff --git a/python/fate_test/fate_test/_client.py b/python/fate_test/fate_test/_client.py index d0abbb318e..b10b2cf363 100644 --- a/python/fate_test/fate_test/_client.py +++ b/python/fate_test/fate_test/_client.py @@ -23,21 +23,19 @@ def __init__(self, config: Config): self._flow_clients = {} # self._tunnel_id_to_flow_clients = {} self._role_str_to_service_id = {} - self._service_id_to_role_str = {} - self._service_id_to_party = {} + # self._service_id_to_role_str = {} + # self._service_id_to_party = {} # self._tunnel_id_to_tunnel = config.tunnel_id_to_tunnel for party, service_id in config.party_to_service_id.items(): for role_str in config.parties.party_to_role_string(party): self._role_str_to_service_id[role_str] = service_id - self._service_id_to_role_str[service_id] = role_str - self._service_id_to_party[service_id] = party + # self._service_id_to_role_str[service_id] = role_str + # self._service_id_to_party[service_id] = party for service_id, service in config.service_id_to_service.items(): if isinstance(service, Config.service): - role = self._service_id_to_role_str[service_id].split("_")[0] - party = self._service_id_to_party[service_id] self._flow_clients[service_id] = FLOWClient( - service.address, config.data_base_dir, config.cache_directory, role, party) + service.address, config.data_base_dir, config.cache_directory) """elif isinstance(service, Config.tunnel_service): self._flow_clients[service_id] = FLOWClient(None, config.data_base_dir, config.cache_directory) diff --git a/python/fate_test/fate_test/_config.py b/python/fate_test/fate_test/_config.py index 1d8cf8db56..b81b25e59e 100644 --- 
a/python/fate_test/fate_test/_config.py +++ b/python/fate_test/fate_test/_config.py @@ -183,31 +183,10 @@ def __init__(self, config): self.auto_increasing_sid = None # self.work_mode = config.get("work_mode", 0) - tunnel_id = 0 service_id = 0 os.makedirs(os.path.dirname(self.cache_directory), exist_ok=True) for service_config in config["services"]: flow_services = service_config["flow_services"] - # @todo: rm ssh tunnel; add host flow services - """if service_config.get("ssh_tunnel", {}).get("enable", False): - tunnel_id += 1 - services_address = [] - for index, flow_service in enumerate(flow_services): - service_id += 1 - address_host, address_port = flow_service["address"].split(":") - address_port = int(address_port) - services_address.append((address_host, address_port)) - self.service_id_to_service[service_id] = self.tunnel_service(tunnel_id, index) - for party in flow_service["parties"]: - self.party_to_service_id[party] = service_id - tunnel_config = service_config["ssh_tunnel"] - ssh_address_host, ssh_address_port = tunnel_config["ssh_address"].split(":") - self.tunnel_id_to_tunnel[tunnel_id] = self.tunnel((ssh_address_host, int(ssh_address_port)), - tunnel_config["ssh_username"], - tunnel_config["ssh_password"], - tunnel_config["ssh_priv_key"], - services_address) - else:""" for flow_service in flow_services: service_id += 1 address = flow_service["address"] diff --git a/python/fate_test/fate_test/_flow_client.py b/python/fate_test/fate_test/_flow_client.py index 280aac5323..2d0d3f8d98 100644 --- a/python/fate_test/fate_test/_flow_client.py +++ b/python/fate_test/fate_test/_flow_client.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import json import os import time import typing @@ -30,24 +31,27 @@ class FLOWClient(object): def __init__(self, address: typing.Optional[str], data_base_dir: typing.Optional[Path], - cache_directory: typing.Optional[Path], - role: str, - party_id: int): + cache_directory: typing.Optional[Path]): self.address = address - self.version = "2.0.0-beta" + self.version = "v2" self._client = FlowClient(self.address.split(':')[0], self.address.split(':')[1], self.version) self._data_base_dir = data_base_dir self._cache_directory = cache_directory self.data_size = 0 - self.role = role - self.party_id = party_id def set_address(self, address): self.address = address + def transform_local_file_to_dataframe(self, data: Data, callback=None, output_path=None): + data_warehouse = self.upload_data(data, callback, output_path) + status = self.transform_to_dataframe(data.namespace, data.table_name, data_warehouse, callback) + return status + def upload_data(self, data: Data, callback=None, output_path=None): - response = self._upload_data(data, output_path=output_path) + response, file_path = self._upload_data(data, output_path=output_path) try: + if callback is not None: + callback(response) code = response["code"] if code != 0: raise ValueError(f"Return code {code}!=0") @@ -57,24 +61,45 @@ def upload_data(self, data: Data, callback=None, output_path=None): job_id = response["job_id"] except BaseException: raise ValueError(f"Upload data fails, response={response}") - # self.monitor_status(job_id, role=self.role, party_id=self.party_id) - self._awaiting(job_id, self.role, self.party_id, ) + self._awaiting(job_id, "local", 0) + return dict(namespace=namespace, name=name) + def transform_to_dataframe(self, namespace, table_name, data_warehouse, callback=None): + response = self._client.data.dataframe_transformer(namespace=namespace, + name=table_name, + data_warehouse=data_warehouse) + + """try: + code = response["code"] + if code != 0: + raise ValueError(f"Return code {code}!=0") 
+ job_id = response["job_id"] + except BaseException: + raise ValueError(f"Transform data fails, response={response}")""" + try: + if callback is not None: + callback(response) + status = self._awaiting(response["job_id"], "local", 0) + status = str(status).lower() + else: + status = response["retmsg"] + + except Exception as e: + raise RuntimeError(f"upload data failed") from e + job_id = response["job_id"] + self._awaiting(job_id, "local", 0) + return status + def delete_data(self, data: Data): - # @todo: use client.table.delete(table=, namespace=) try: table_name = data.config['table_name'] if data.config.get( 'table_name', None) is not None else data.config.get('name') - self._delete_data(table_name=table_name, namespace=data.config['namespace']) + self._client.table.delete(table_name=table_name, namespace=data.config['namespace']) except Exception as e: raise RuntimeError(f"delete data failed") from e - """def output_data_table(self, job_id, role, party_id, component_name): - result = self._output_data_table(job_id=job_id, role=role, party_id=party_id, component_name=component_name) - return result""" - def table_query(self, table_name, namespace): result = self._table_query(table_name=table_name, namespace=namespace) return result @@ -106,7 +131,7 @@ def _awaiting(self, job_id, role, party_id, callback=None): time.sleep(1) def _upload_data(self, data, output_path=None, verbose=0, destroy=1): - conf = data.conf + conf = data.config # if conf.get("engine", {}) != "PATH": if output_path is not None: conf['file'] = os.path.join(os.path.abspath(output_path), os.path.basename(conf.get('file'))) @@ -119,12 +144,12 @@ def _upload_data(self, data, output_path=None, verbose=0, destroy=1): if not path.exists(): raise Exception('The file is obtained from the fate flow client machine, but it does not exist, ' f'please check the path: {path}') - response = self._client.data.upload(file=data.file, + response = self._client.data.upload(file=str(path), head=data.head, 
meta=data.meta, extend_sid=data.extend_sid, partitions=data.partitions) - return response + return response, conf["file"] """def _table_info(self, table_name, namespace): param = { @@ -221,7 +246,7 @@ def _get_summary(self, job_id, role, party_id, component_name): def _query_job(self, job_id, role, party_id): response = self._client.job.query(job_id, role, party_id) - try: + """try: code = response["code"] if code != 0: raise ValueError(f"Return code {code}!=0") @@ -229,9 +254,10 @@ def _query_job(self, job_id, role, party_id): data = response["data"][0] return data except BaseException: - raise ValueError(f"query job is failed, response={response}") + raise ValueError(f"query job is failed, response={response}")""" + return QueryJobResponse(response) - def get_version(self): + """def get_version(self): response = self._post(url='version/get', json={"module": "FATE"}) try: retcode = response['retcode'] @@ -241,7 +267,7 @@ def get_version(self): fate_version = response["data"]["FATE"] except Exception as e: raise RuntimeError(f"get version error: {response}") from e - return fate_version + return fate_version""" def _add_notes(self, job_id, role, party_id, notes): data = dict(job_id=job_id, role=role, party_id=party_id, notes=notes) @@ -280,10 +306,10 @@ def __repr__(self): class QueryJobResponse(object): def __init__(self, response: dict): try: - status = Status(response.get('data')[0]["f_status"]) - progress = response.get('data')[0]['f_progress'] + status = Status(response.get('data')[0]["status"]) + progress = response.get('data')[0]['progress'] except Exception as e: - raise RuntimeError(f"query job error, response: {response}") from e + raise RuntimeError(f"query job error, response: {json.dumps(response, indent=4)}") from e self.status = status self.progress = progress diff --git a/python/fate_test/fate_test/_parser.py b/python/fate_test/fate_test/_parser.py index fc1d832778..9f411c948e 100644 --- a/python/fate_test/fate_test/_parser.py +++ 
b/python/fate_test/fate_test/_parser.py @@ -15,11 +15,10 @@ # import typing -from collections import deque from pathlib import Path import prettytable -from fate_test._config import Parties, Config +from fate_test._config import Config from fate_test._io import echo from fate_test.utils import TxtStyle # import json @@ -91,215 +90,8 @@ def load(config, path: Path): def update(self, config: Config): if config.extend_sid is not None: self.extend_sid = config.extend_sid - if config.meta is not None: - self.meta.update(config.meta) - - -class JobConf(object): - def __init__(self, initiator: dict, role: dict, job_parameters=None, **kwargs): - self.initiator = initiator - self.role = role - self.job_parameters = job_parameters if job_parameters else {} - self.others_kwargs = kwargs - - def as_dict(self): - return dict( - initiator=self.initiator, - role=self.role, - job_parameters=self.job_parameters, - **self.others_kwargs, - ) - - @staticmethod - def load(path: Path): - with path.open("r") as f: - # kwargs = json.load(f, object_hook=CONF_JSON_HOOK.hook) - kwargs = yaml.safe_load(f) - return JobConf(**kwargs) - - @property - def dsl_version(self): - return self.others_kwargs.get("dsl_version", 1) - - def update( - self, - parties: Parties, - timeout, - job_parameters, - component_parameters, - ): - self.initiator = parties.extract_initiator_role(self.initiator["role"]) - self.role = parties.extract_role( - {role: len(parties) for role, parties in self.role.items()} - ) - if timeout > 0: - self.update_job_common_parameters(timeout=timeout) - - if timeout > 0: - self.update_job_common_parameters(timeout=timeout) - - for key, value in job_parameters.items(): - self.update_parameters(parameters=self.job_parameters, key=key, value=value) - for key, value in component_parameters.items(): - if self.dsl_version == 1: - self.update_parameters( - parameters=self.others_kwargs.get("algorithm_parameters"), - key=key, - value=value, - ) - else: - self.update_parameters( - 
parameters=self.others_kwargs.get("component_parameters"), - key=key, - value=value, - ) - - def update_parameters(self, parameters, key, value): - if isinstance(parameters, dict): - for keys in parameters: - if keys == key: - parameters.get(key).update(value), - elif isinstance(parameters[keys], dict): - self.update_parameters(parameters[keys], key, value) - - def update_job_common_parameters(self, **kwargs): - if self.dsl_version == 1: - self.job_parameters.update(**kwargs) - else: - self.job_parameters.setdefault("common", {}).update(**kwargs) - - def update_job_type(self, job_type="predict"): - if self.dsl_version == 1: - if self.job_parameters.get("job_type", None) is None: - self.job_parameters.update({"job_type": job_type}) - else: - if self.job_parameters.setdefault("common", {}).get("job_type", None) is None: - self.job_parameters.setdefault("common", {}).update({"job_type": job_type}) - - def update_component_parameters(self, key, value, parameters=None): - if parameters is None: - if self.dsl_version == 1: - parameters = self.others_kwargs.get("algorithm_parameters") - else: - parameters = self.others_kwargs.get("component_parameters") - if isinstance(parameters, dict): - for keys in parameters: - if keys == key: - if isinstance(value, dict): - parameters[keys].update(value) - else: - parameters.update({key: value}) - elif ( - isinstance(parameters[keys], dict) and parameters[keys] is not None - ): - self.update_component_parameters(key, value, parameters[keys]) - - def get_component_parameters(self, keys): - if len(keys) == 0: - return self.others_kwargs.get("component_parameters") if self.dsl_version == 2 else self.others_kwargs.get( - "role_parameters") - if self.dsl_version == 1: - parameters = self.others_kwargs.get("role_parameters") - else: - parameters = self.others_kwargs.get("component_parameters").get("role") - - for key in keys: - parameters = parameters[key] - return parameters - - -class JobDSL(object): - def __init__(self, components: 
dict, provider=None): - self.components = components - self.provider = provider - - @staticmethod - def load(path: Path, provider): - with path.open("r") as f: - # kwargs = json.load(f, object_hook=DSL_JSON_HOOK.hook) - kwargs = yaml.safe_load(f) - if provider is not None: - kwargs["provider"] = provider - return JobDSL(**kwargs) - - def as_dict(self): - if self.provider is None: - return dict(components=self.components) - else: - return dict(components=self.components, provider=self.provider) - - -class Job(object): - def __init__( - self, - job_name: str, - job_conf: JobConf, - job_dsl: typing.Optional[JobDSL], - pre_works: list, - ): - self.job_name = job_name - self.job_conf = job_conf - self.job_dsl = job_dsl - self.pre_works = pre_works - - @classmethod - def load(cls, job_name, job_configs, base: Path, provider): - job_conf = JobConf.load(base.joinpath(job_configs.get("conf")).resolve()) - job_dsl = job_configs.get("dsl", None) - if job_dsl is not None: - job_dsl = JobDSL.load(base.joinpath(job_dsl).resolve(), provider) - - pre_works = [] - pre_works_value = {} - deps_dict = {} - - if job_configs.get("model_deps", None): - pre_works.append(job_configs["model_deps"]) - deps_dict["model_deps"] = {'name': job_configs["model_deps"]} - elif job_configs.get("deps", None): - pre_works.append(job_configs["deps"]) - deps_dict["model_deps"] = {'name': job_configs["deps"]} - if job_configs.get("data_deps", None): - deps_dict["data_deps"] = {'data': job_configs["data_deps"]} - pre_works.append(list(job_configs["data_deps"].keys())[0]) - deps_dict["data_deps"].update({'name': list(job_configs["data_deps"].keys())}) - if job_configs.get("cache_deps", None): - pre_works.append(job_configs["cache_deps"]) - deps_dict["cache_deps"] = {'name': job_configs["cache_deps"]} - if job_configs.get("model_loader_deps", None): - pre_works.append(job_configs["model_loader_deps"]) - deps_dict["model_loader_deps"] = {'name': job_configs["model_loader_deps"]} - - 
pre_works_value.update(deps_dict) - _config.deps_alter[job_name] = pre_works_value - - return Job( - job_name=job_name, job_conf=job_conf, job_dsl=job_dsl, pre_works=pre_works - ) - - """@property - def submit_params(self): - return dict( - conf=self.job_conf.as_dict(), - dsl=self.job_dsl.as_dict() if self.job_dsl else None, - )""" - - """def set_pre_work(self, name, **kwargs): - self.job_conf.update_job_common_parameters(**kwargs) - self.job_conf.update_job_type("predict")""" - - def set_input_data(self, hierarchys, table_info): - for table_name, hierarchy in zip(table_info, hierarchys): - key = list(table_name.keys())[0] - value = table_name[key] - self.job_conf.update_component_parameters( - key=key, - value=value, - parameters=self.job_conf.get_component_parameters(hierarchy), - ) - - def is_submit_ready(self): - return len(self.pre_works) == 0 + """if config.meta is not None: + self.meta.update(config.meta)""" class PipelineJob(object): @@ -321,11 +113,11 @@ def __init__( self.pipeline_jobs = pipeline_jobs self.path = path self.suite_name = Path(self.path).stem - - self._dependency: typing.MutableMapping[str, typing.List[Job]] = {} self._final_status: typing.MutableMapping[str, FinalStatus] = {} + """ + self._dependency: typing.MutableMapping[str, typing.List[Job]] = {} self._ready_jobs = deque() - """for job in self.jobs: + for job in self.jobs: for name in job.pre_works: self._dependency.setdefault(name, []).append(job) @@ -341,17 +133,14 @@ def load(path: Path, provider): with path.open("r") as f: # testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) testsuite_config = yaml.safe_load(f) + # testsuite_config = DATA_JSON_HOOK.hook(testsuite_config) dataset = [] for d in testsuite_config.get("data"): - if "use_local_data" not in d: - d.update({"use_local_data": _config.use_local_data}) + d = DATA_JSON_HOOK.hook(d) + """if "use_local_data" not in d: + d.update({"use_local_data": _config.use_local_data})""" dataset.append(Data.load(d, path)) - 
"""jobs = [] - for job_name, job_configs in testsuite_config.get("tasks", {}).items(): - jobs.append( - Job.load(job_name=job_name, job_configs=job_configs, base=path.parent, provider=provider) - )""" pipeline_jobs = [] if testsuite_config.get("tasks", None) is not None and provider is not None: @@ -363,9 +152,9 @@ def load(path: Path, provider): testsuite = Testsuite(dataset, pipeline_jobs, path) return testsuite - def jobs_iter(self) -> typing.Generator[Job, None, None]: + """def jobs_iter(self) -> typing.Generator[Job, None, None]: while self._ready_jobs: - yield self._ready_jobs.pop() + yield self._ready_jobs.pop()""" @staticmethod def style_table(txt): @@ -445,9 +234,9 @@ def update_status( setattr(self._final_status[job_name], k, v) def get_final_status(self): - for name, jobs in self._dependency.items(): + """for name, jobs in self._dependency.items(): for job in jobs: - self._final_status[job.job_name].rest_dependency.append(name) + self._final_status[job.job_name].rest_dependency.append(name)""" return self._final_status @@ -497,9 +286,11 @@ def load(path: Path): with path.open("r") as f: # testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) testsuite_config = yaml.safe_load(f) + # testsuite_config = DATA_JSON_HOOK.hook(testsuite_config) dataset = [] for d in testsuite_config.get("data"): + d = DATA_JSON_HOOK.hook(d) dataset.append(Data.load(d, path)) pairs = [] diff --git a/python/fate_test/fate_test/scripts/_utils.py b/python/fate_test/fate_test/scripts/_utils.py index c087300515..cd3a04e5a0 100644 --- a/python/fate_test/fate_test/scripts/_utils.py +++ b/python/fate_test/fate_test/scripts/_utils.py @@ -12,8 +12,6 @@ from fate_test._io import echo, LOGGER, set_logger from fate_test._parser import Testsuite, BenchmarkSuite, DATA_JSON_HOOK, CONF_JSON_HOOK, DSL_JSON_HOOK -from fate_test import _config - def _big_data_task(includes, guest_data_size, host_data_size, guest_feature_num, host_feature_num, host_data_type, config_inst, encryption_type, 
match_rate, sparsity, force, split_host, output_path, parallelize): @@ -45,7 +43,7 @@ def _find_testsuite_files(path): match_rate, sparsity, force, split_host, output_path, parallelize) -def _load_testsuites(includes, excludes, glob, provider=None, suffix="testsuite.json", suite_type="testsuite"): +def _load_testsuites(includes, excludes, glob, provider=None, suffix="testsuite.yaml", suite_type="testsuite"): def _find_testsuite_files(path): if isinstance(path, str): path = Path(path) @@ -102,9 +100,7 @@ def _upload_data(clients: Clients, suite, config: Config, output_path=None): width=24) as bar: for i, data in enumerate(suite.dataset): data.update(config) - table_name = data.config['table_name'] if data.config.get( - 'table_name', None) is not None else data.config.get('name') - data_progress = DataProgress(f"{data.role_str}<-{data.config['namespace']}.{table_name}") + data_progress = DataProgress(f"{data.role_str}<-{data.namespace}.{data.table_name}") def update_bar(n_step): bar.item_show_func = lambda x: data_progress.show() @@ -121,16 +117,21 @@ def _call_back(resp): try: echo.stdout_newline() - status, data_path = clients[data.role_str].upload_data(data, _call_back, output_path) + # role, idx = data.role_str.lower().split("_") + # party_id = config.role[role][int(idx)] + status = clients[data.role_str].transform_local_file_to_dataframe(data, + _call_back, + output_path) time.sleep(1) data_progress.update() if status != 'success': raise RuntimeError(f"uploading {i + 1}th data for {suite.path} {status}") bar.update(1) - if _config.data_switch: + + """if _config.data_switch: from fate_test.scripts import generate_mock_data - generate_mock_data.remove_file(data_path) + generate_mock_data.remove_file(data_path)""" except Exception: exception_id = str(uuid.uuid1()) echo.file(f"exception({exception_id})") diff --git a/python/fate_test/fate_test/scripts/benchmark_cli.py b/python/fate_test/fate_test/scripts/benchmark_cli.py index 9030ed9818..d9f82d4139 100644 --- 
a/python/fate_test/fate_test/scripts/benchmark_cli.py +++ b/python/fate_test/fate_test/scripts/benchmark_cli.py @@ -66,34 +66,34 @@ def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, stora echo.echo(f"\tdataset({len(suite.dataset)}) benchmark groups({len(suite.pairs)}) {suite.path}") if not yes and not click.confirm("running?"): return - with Clients(config_inst) as client: - fate_version = client["guest_0"].get_version() - for i, suite in enumerate(suites): - # noinspection PyBroadException - try: - start = time.time() - echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') - if not skip_data: - try: - _upload_data(client, suite, config_inst) - except Exception as e: - raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e + client = Clients(config_inst) + fate_version = client["guest_0"].get_version() + for i, suite in enumerate(suites): + # noinspection PyBroadException + try: + start = time.time() + echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') + if not skip_data: try: - _run_benchmark_pairs(config_inst, suite, tol, namespace, data_namespace_mangling, storage_tag, - history_tag, fate_version, match_details) + _upload_data(client, suite, config_inst) except Exception as e: - raise RuntimeError(f"exception occur while running benchmark jobs for {suite.path}") from e + raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e + try: + _run_benchmark_pairs(config_inst, suite, tol, namespace, data_namespace_mangling, storage_tag, + history_tag, fate_version, match_details) + except Exception as e: + raise RuntimeError(f"exception occur while running benchmark jobs for {suite.path}") from e - if not skip_data and clean_data: - _delete_data(client, suite) - echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') + if not skip_data and clean_data: + 
_delete_data(client, suite) + echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') - except Exception: - exception_id = uuid.uuid1() - echo.echo(f"exception in {suite.path}, exception_id={exception_id}", err=True, fg='red') - LOGGER.exception(f"exception id: {exception_id}") - finally: - echo.stdout_newline() + except Exception: + exception_id = uuid.uuid1() + echo.echo(f"exception in {suite.path}, exception_id={exception_id}", err=True, fg='red') + LOGGER.exception(f"exception id: {exception_id}") + finally: + echo.stdout_newline() echo.farewell() echo.echo(f"testsuite namespace: {namespace}", fg='red') diff --git a/python/fate_test/fate_test/scripts/performance_cli.py b/python/fate_test/fate_test/scripts/performance_cli.py index 338f66c868..e07791cc9a 100644 --- a/python/fate_test/fate_test/scripts/performance_cli.py +++ b/python/fate_test/fate_test/scripts/performance_cli.py @@ -96,44 +96,44 @@ def get_perf_template(conf: Config, job_type): return echo.stdout_newline() - with Clients(config_inst) as client: + client = Clients(config_inst) - for i, suite in enumerate(suites): - # noinspection PyBroadException - try: - start = time.time() - echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') - - if not skip_data: - try: - _upload_data(client, suite, config_inst) - except Exception as e: - raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e - - echo.stdout_newline() - try: - time_consuming = _submit_job(client, suite, namespace, config_inst, timeout, update_job_parameters, - storage_tag, history_tag, update_component_parameters, max_iter, - max_depth, num_trees, task_cores) - except Exception as e: - raise RuntimeError(f"exception occur while submit job for {suite.path}") from e + for i, suite in enumerate(suites): + # noinspection PyBroadException + try: + start = time.time() + echo.echo(f"[{i + 1}/{len(suites)}]start at 
{time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') + if not skip_data: try: - _run_pipeline_jobs(config_inst, suite, namespace, data_namespace_mangling) + _upload_data(client, suite, config_inst) except Exception as e: - raise RuntimeError(f"exception occur while running pipeline jobs for {suite.path}") from e + raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e - echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') - if not skip_data and clean_data: - _delete_data(client, suite) - echo.echo(suite.pretty_final_summary(time_consuming), fg='red') + echo.stdout_newline() + try: + time_consuming = _submit_job(client, suite, namespace, config_inst, timeout, update_job_parameters, + storage_tag, history_tag, update_component_parameters, max_iter, + max_depth, num_trees, task_cores) + except Exception as e: + raise RuntimeError(f"exception occur while submit job for {suite.path}") from e - except Exception: - exception_id = uuid.uuid1() - echo.echo(f"exception in {suite.path}, exception_id={exception_id}") - LOGGER.exception(f"exception id: {exception_id}") - finally: - echo.stdout_newline() + try: + _run_pipeline_jobs(config_inst, suite, namespace, data_namespace_mangling) + except Exception as e: + raise RuntimeError(f"exception occur while running pipeline jobs for {suite.path}") from e + + echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') + if not skip_data and clean_data: + _delete_data(client, suite) + echo.echo(suite.pretty_final_summary(time_consuming), fg='red') + + except Exception: + exception_id = uuid.uuid1() + echo.echo(f"exception in {suite.path}, exception_id={exception_id}") + LOGGER.exception(f"exception id: {exception_id}") + finally: + echo.stdout_newline() echo.farewell() echo.echo(f"testsuite namespace: {namespace}", fg='red') diff --git a/python/fate_test/fate_test/scripts/testsuite_cli.py 
b/python/fate_test/fate_test/scripts/testsuite_cli.py index 864ac17e53..f308ad1674 100644 --- a/python/fate_test/fate_test/scripts/testsuite_cli.py +++ b/python/fate_test/fate_test/scripts/testsuite_cli.py @@ -87,39 +87,41 @@ def run_suite(ctx, include, exclude, glob, return echo.stdout_newline() - with Clients(config_inst) as client: - for i, suite in enumerate(suites): - # noinspection PyBroadException - try: - start = time.time() - echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') - if not skip_data and config_inst.work_mode: - try: - _upload_data(client, suite, config_inst) - except Exception as e: - raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e - if data_only: - continue + # with Clients(config_inst) as client: + client = Clients(config_inst) + + for i, suite in enumerate(suites): + # noinspection PyBroadException + try: + start = time.time() + echo.echo(f"[{i + 1}/{len(suites)}]start at {time.strftime('%Y-%m-%d %X')} {suite.path}", fg='red') + if not skip_data: + try: + _upload_data(client, suite, config_inst) + except Exception as e: + raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e + if data_only: + continue + + if not skip_jobs: + try: + time_consuming = _run_pipeline_jobs(config_inst, suite, namespace, data_namespace_mangling) + except Exception as e: + raise RuntimeError(f"exception occur while running pipeline jobs for {suite.path}") from e + + if not skip_data and clean_data: + _delete_data(client, suite) + echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') + if not skip_jobs: + suite_file = str(suite.path).split("/")[-1] + echo.echo(suite.pretty_final_summary(time_consuming, suite_file)) - if not skip_jobs: - try: - time_consuming = _run_pipeline_jobs(config_inst, suite, namespace, data_namespace_mangling) - except Exception as e: - raise RuntimeError(f"exception occur while 
running pipeline jobs for {suite.path}") from e - - if not skip_data and clean_data and config_inst.work_mode: - _delete_data(client, suite) - echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') - if not skip_jobs: - suite_file = str(suite.path).split("/")[-1] - echo.echo(suite.pretty_final_summary(time_consuming, suite_file)) - - except Exception: - exception_id = uuid.uuid1() - echo.echo(f"exception in {suite.path}, exception_id={exception_id}") - LOGGER.exception(f"exception id: {exception_id}") - finally: - echo.stdout_newline() + except Exception: + exception_id = uuid.uuid1() + echo.echo(f"exception in {suite.path}, exception_id={exception_id}") + LOGGER.exception(f"exception id: {exception_id}") + finally: + echo.stdout_newline() non_success_summary() echo.farewell() echo.echo(f"testsuite namespace: {namespace}", fg='red') From 0f48049f5b82c581df178db22e68d258d0c01d3e Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 28 Jul 2023 15:12:06 +0800 Subject: [PATCH 04/30] add benchmark example(#5008) Signed-off-by: Yu Wu --- examples/benchmark_quality/__init__.py | 0 examples/benchmark_quality/breast_config.yaml | 16 ++ .../default_credit_config.yaml | 16 ++ .../benchmark_quality/epsilon_5k_config.yaml | 16 ++ .../benchmark_quality/give_credit_config.yaml | 16 ++ examples/benchmark_quality/lr_benchmark.yaml | 162 ++++++++++++++++++ .../benchmark_quality/pipeline-lr-binary.py | 134 +++++++++++++++ .../benchmark_quality/pipeline-lr-multi.py | 116 +++++++++++++ .../benchmark_quality/sklearn-lr-binary.py | 90 ++++++++++ .../benchmark_quality/sklearn-lr-multi.py | 79 +++++++++ .../benchmark_quality/vehicle_config.yaml | 12 ++ .../vehicle_lr_sklearn_config.yaml | 12 ++ python/fate_test/fate_test/_parser.py | 27 +-- python/fate_test/fate_test/scripts/_utils.py | 12 +- 14 files changed, 682 insertions(+), 26 deletions(-) create mode 100644 examples/benchmark_quality/__init__.py create mode 100644 
examples/benchmark_quality/breast_config.yaml create mode 100644 examples/benchmark_quality/default_credit_config.yaml create mode 100644 examples/benchmark_quality/epsilon_5k_config.yaml create mode 100644 examples/benchmark_quality/give_credit_config.yaml create mode 100644 examples/benchmark_quality/lr_benchmark.yaml create mode 100644 examples/benchmark_quality/pipeline-lr-binary.py create mode 100644 examples/benchmark_quality/pipeline-lr-multi.py create mode 100644 examples/benchmark_quality/sklearn-lr-binary.py create mode 100644 examples/benchmark_quality/sklearn-lr-multi.py create mode 100644 examples/benchmark_quality/vehicle_config.yaml create mode 100644 examples/benchmark_quality/vehicle_lr_sklearn_config.yaml diff --git a/examples/benchmark_quality/__init__.py b/examples/benchmark_quality/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/benchmark_quality/breast_config.yaml b/examples/benchmark_quality/breast_config.yaml new file mode 100644 index 0000000000..00090b4c16 --- /dev/null +++ b/examples/benchmark_quality/breast_config.yaml @@ -0,0 +1,16 @@ +data_guest: "examples/data/breast_hetero_guest.csv" +data_host: "examples/data/breast_hetero_host.csv" +idx: "id" +label_name: "y" +penalty: "L2" +epochs: 30 +learning_rate_scheduler: + method: "constant" + scheduler_params: + lr: 0.15 + factor: 1.0 + total_iters: 100 +optimizer: + method: "rmsprop" +batch_size: 5000 +early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/default_credit_config.yaml b/examples/benchmark_quality/default_credit_config.yaml new file mode 100644 index 0000000000..f86a48e834 --- /dev/null +++ b/examples/benchmark_quality/default_credit_config.yaml @@ -0,0 +1,16 @@ +data_guest: "examples/data/default_credit_hetero_guest.csv" +data_host: "examples/data/default_credit_hetero_host.csv" +idx: "id" +label_name: "y" +penalty: "L2" +epochs: 30 +learning_rate_scheduler: + method: "constant" + scheduler_params: + lr: 
0.15 + factor: 1.0 + total_iters: 100 +optimizer: + method: "zeros" +batch_size: 500 +early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/epsilon_5k_config.yaml b/examples/benchmark_quality/epsilon_5k_config.yaml new file mode 100644 index 0000000000..be63b9b414 --- /dev/null +++ b/examples/benchmark_quality/epsilon_5k_config.yaml @@ -0,0 +1,16 @@ +data_guest: "examples/data/epsilon_5k_hetero_guest.csv" +data_host: "examples/data/epsilon_5k_hetero_host.csv" +idx: "id" +label_name: "y" +penalty: "L2" +epochs: 30 +learning_rate_scheduler: + method: "constant" + scheduler_params: + lr: 0.15 + factor: 1.0 + total_iters: 800 +optimizer: + method: "rmsprop" +batch_size: 5000 +early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/give_credit_config.yaml b/examples/benchmark_quality/give_credit_config.yaml new file mode 100644 index 0000000000..bc2b6a683f --- /dev/null +++ b/examples/benchmark_quality/give_credit_config.yaml @@ -0,0 +1,16 @@ +data_guest: "examples/data/give_credit_hetero_guest.csv" +data_host: "examples/data/give_credit_hetero_host.csv" +idx: "id" +label_name: "y" +penalty: "L2" +epochs: 6 +learning_rate_scheduler: + method: "constant" + scheduler_params: + lr: 0.15 + factor: 1.0 + total_iters: 100 +optimizer: + method: "adam" +batch_size: 550 +early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr_benchmark.yaml b/examples/benchmark_quality/lr_benchmark.yaml new file mode 100644 index 0000000000..dad81264ed --- /dev/null +++ b/examples/benchmark_quality/lr_benchmark.yaml @@ -0,0 +1,162 @@ +data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: 
experiment + role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_0 + - file: "../../data/default_credit_hetero_guest.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + label_type: int64 + label_name: y + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: default_credit_hetero_guest + namespace: experiment + role: guest_0 + - file: "../../data/default_credit_hetero_host.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: default_credit_hetero_host + namespace: experiment + role: host_0 + - file: "../../data/give_credit_hetero_guest.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + label_type: int64 + label_name: y + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: give_credit_hetero_guest + namespace: experiment + role: guest_0 + - file: "../../data/give_credit_hetero_host.csv" + head: 1 + partition: 16 + table_name: give_credit_hetero_host + namespace: experiment + role: host_0 + - file: "../../data/epsilon_5k_hetero_guest.csv" + head: 1 + partition: 16 + table_name: epsilon_5k_hetero_guest + namespace: experiment + role: guest_0 + - file: "../../data/epsilon_5k_hetero_host.csv" + head: 1 + partition: 16 + table_name: epsilon_5k_hetero_host + namespace: experiment + role: host_0 + - file: 
"../../data/vehicle_scale_hetero_guest.csv" + head: 1 + partition: 16 + table_name: vehicle_scale_hetero_guest + namespace: experiment + role: guest_0 + - file: "../../data/vehicle_scale_hetero_host.csv" + head: 1 + partition: 16 + table_name: vehicle_scale_hetero_host + namespace: experiment + role: host_0 +hetero_lr-binary-0: + local: + script: "./sklearn-lr-binary.py" + conf: "./breast_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./breast_config.yaml" + compare_setting: + relative_tol: 0.01 +hetero_lr-binary-1: + local: + script: "./sklearn-lr-binary.py" + conf: "./default_credit_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./default_credit_config.yaml" + compare_setting: + relative_tol: 0.01 +hetero_lr-binary-2: + local: + script: "./sklearn-lr-binary.py" + conf: "./epsilon_5k_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./epsilon_5k_config.yaml" + compare_setting: + relative_tol: 0.01 +hetero_lr-binary-3: + local: + script: "./sklearn-lr-binary.py" + conf: "./give_credit_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./give_credit_config.yaml" + compare_setting: + relative_tol: 0.01 +multi: + local: + script: "./sklearn-lr-multi.py" + conf: "./vehicle_lr_sklearn_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-multi.py" + conf: "./vehicle_config.yaml" + compare_setting: + relative_tol: 0.01 diff --git a/examples/benchmark_quality/pipeline-lr-binary.py b/examples/benchmark_quality/pipeline-lr-binary.py new file mode 100644 index 0000000000..7fa1f786ba --- /dev/null +++ b/examples/benchmark_quality/pipeline-lr-binary.py @@ -0,0 +1,134 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils +from fate_test.utils import extract_data, parse_summary_result +from federatedml.evaluation.metrics import classification_metric + + +def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): + # obtain config + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + if isinstance(param, str): + param = test_utils.JobConfig.load_from_file(param) + + assert isinstance(param, dict) + + data_set = param.get("data_guest").split('/')[-1] + if data_set == "default_credit_hetero_guest.csv": + guest_data_table = 'default_credit_hetero_guest' + host_data_table = 'default_credit_hetero_host' + elif data_set == 'breast_hetero_guest.csv': + guest_data_table = 'breast_hetero_guest' + host_data_table = 'breast_hetero_host' + elif data_set == 'give_credit_hetero_guest.csv': + guest_data_table = 'give_credit_hetero_guest' + host_data_table = 'give_credit_hetero_host' + elif data_set == 'epsilon_5k_hetero_guest.csv': + guest_data_table = 'epsilon_5k_hetero_guest' + host_data_table = 'epsilon_5k_hetero_host' + else: + raise ValueError(f"Cannot recognized data_set: {data_set}") + + guest_train_data = 
{"name": guest_data_table, "namespace": f"experiment{namespace}"} + host_train_data = {"name": host_data_table, "namespace": f"experiment{namespace}"} + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + + intersect_0 = Intersection("intersect_0", method="raw") + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], + namespace=guest_train_data["namespace"])) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], + namespace=host_train_data["namespace"])) + + lr_param = { + } + + config_param = { + "penalty": param["penalty"], + "epochs": param["epochs"], + "learning_rate_scheduler": param["learning_rate_scheduler"], + "optimizer": param["optimizer"], + "batch_size": param["batch_size"], + "early_stop": "diff", + "tol": 1e-5, + "init_param": param.get("init_method", {"method": "zeros"}) + } + lr_param.update(config_param) + lr_0 = CoordinatedLR("lr_0", + train_data=intersect_0.outputs["output_data"], + **config_param) + lr_1 = CoordinatedLR("lr_1", + test_data=intersect_0.outputs["output_data"], + input_model=lr_0.outputs["train_output_model"]) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="y", + runtime_roles=["guest"], + default_eval_setting="binary", + input_data=lr_0.outputs["train_output_data"]) + + pipeline.add_task(intersect_0) + pipeline.add_task(lr_0) + pipeline.add_task(lr_1) + pipeline.add_task(evaluation_0) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + lr_0_data = pipeline.get_component("lr_0").get_output_data() + lr_1_data = pipeline.get_component("lr_1").get_output_data() + lr_0_score = extract_data(lr_0_data, "predict_result") + lr_0_label = extract_data(lr_0_data, "label") + lr_1_score = extract_data(lr_1_data, "predict_result") + lr_1_label = extract_data(lr_1_data, "label") + lr_0_score_label = extract_data(lr_0_data, "predict_result", keep_id=True) + lr_1_score_label = 
extract_data(lr_1_data, "predict_result", keep_id=True) + result_summary = parse_summary_result(pipeline.get_component("evaluation_0").get_summary()) + metric_lr = { + "score_diversity_ratio": classification_metric.Distribution.compute(lr_0_score_label, lr_1_score_label), + "ks_2samp": classification_metric.KSTest.compute(lr_0_score, lr_1_score), + "mAP_D_value": classification_metric.AveragePrecisionScore().compute(lr_0_score, lr_1_score, lr_0_label, + lr_1_label)} + result_summary["distribution_metrics"] = {"hetero_lr": metric_lr} + + data_summary = {"train": {"guest": guest_train_data["name"], "host": host_train_data["name"]}, + "test": {"guest": guest_train_data["name"], "host": host_train_data["name"]} + } + + return data_summary, result_summary + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("BENCHMARK-QUALITY PIPELINE JOB") + parser.add_argument("-c", "--config", type=str, + help="config file", default="../../config.yaml") + parser.add_argument("-p", "--param", type=str, + help="config file for params", default="./breast_config.yaml") + args = parser.parse_args() + main(args.config, args.param) diff --git a/examples/benchmark_quality/pipeline-lr-multi.py b/examples/benchmark_quality/pipeline-lr-multi.py new file mode 100644 index 0000000000..f774515dc1 --- /dev/null +++ b/examples/benchmark_quality/pipeline-lr-multi.py @@ -0,0 +1,116 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils +from fate_test.utils import extract_data, parse_summary_result +from federatedml.evaluation.metrics import classification_metric + + +def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): + # obtain config + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + if isinstance(param, str): + param = test_utils.JobConfig.load_from_file(param) + + assert isinstance(param, dict) + data_set = param.get("data_guest").split('/')[-1] + if data_set == "vehicle_scale_hetero_guest.csv": + guest_data_table = 'vehicle_scale_hetero_guest' + host_data_table = 'vehicle_scale_hetero_host' + else: + raise ValueError(f"Cannot recognized data_set: {data_set}") + + guest_train_data = {"name": guest_data_table, "namespace": f"experiment{namespace}"} + host_train_data = {"name": host_data_table, "namespace": f"experiment{namespace}"} + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + + intersect_0 = Intersection("intersect_0", method="raw") + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], + namespace=guest_train_data["namespace"])) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], + namespace=host_train_data["namespace"])) + + lr_param = { + } + + config_param = { + "penalty": param["penalty"], + "epochs": param["epochs"], + "learning_rate_scheduler": param["learning_rate_scheduler"], + "optimizer": param["optimizer"], + "batch_size": param["batch_size"], + "early_stop": 
"diff", + "tol": 1e-5, + "init_param": param.get("init_method", {"method": "zeros"}) + } + lr_param.update(config_param) + lr_0 = CoordinatedLR("lr_0", + train_data=intersect_0.outputs["output_data"], + **config_param) + lr_1 = CoordinatedLR("lr_1", + test_data=intersect_0.outputs["output_data"], + input_model=lr_0.outputs["train_output_model"]) + + evaluation_0 = Evaluation('evaluation_0', default_eval_setting="multi") + pipeline.add_task(intersect_0) + pipeline.add_task(lr_0) + pipeline.add_task(lr_1) + pipeline.add_task(evaluation_0) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + lr_0_data = pipeline.get_component("lr_0").get_output_data() + lr_1_data = pipeline.get_component("lr_1").get_output_data() + + result_summary = parse_summary_result(pipeline.get_component("evaluation_0").get_summary()) + lr_0_score_label = extract_data(lr_0_data, "predict_result", keep_id=True) + lr_1_score_label = extract_data(lr_1_data, "predict_result", keep_id=True) + metric_lr = { + "score_diversity_ratio": classification_metric.Distribution.compute(lr_0_score_label, lr_1_score_label)} + result_summary["distribution_metrics"] = {"hetero_lr": metric_lr} + + data_summary = {"train": {"guest": guest_train_data["name"], "host": host_train_data["name"]}, + "test": {"guest": guest_train_data["name"], "host": host_train_data["name"]} + } + return data_summary, result_summary + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("BENCHMARK-QUALITY PIPELINE JOB") + parser.add_argument("-c", "--config", type=str, + help="config file", default="../../config.yaml") + parser.add_argument("-p", "--param", type=str, + help="config file for params", default="./vehicle_config.yaml") + + args = parser.parse_args() + if args.config is not None: + main(args.config, args.param) + else: + main() diff --git a/examples/benchmark_quality/sklearn-lr-binary.py b/examples/benchmark_quality/sklearn-lr-binary.py new file mode 100644 index 0000000000..5b17692621 --- 
/dev/null +++ b/examples/benchmark_quality/sklearn-lr-binary.py @@ -0,0 +1,90 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import os + +import pandas +from pipeline.utils.tools import JobConfig +from sklearn.linear_model import SGDClassifier +from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, recall_score, roc_curve + + +def main(config="../../config.yaml", param="./vechile_config.yaml"): + # obtain config + if isinstance(param, str): + param = JobConfig.load_from_file(param) + assert isinstance(param, dict) + data_guest = param["data_guest"] + data_host = param["data_host"] + idx = param["idx"] + label_name = param["label_name"] + + if isinstance(config, str): + config = JobConfig.load_from_file(config) + print(f"config: {config}") + data_base_dir = config["data_base_dir"] + else: + data_base_dir = config.data_base_dir + + config_param = { + "penalty": param["penalty"], + "max_iter": 100, + "alpha": param["alpha"], + "learning_rate": "optimal", + "eta0": param["learning_rate"], + "random_state": 105 + } + + # prepare data + df_guest = pandas.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) + df_host = pandas.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) + df = df_guest.join(df_host, rsuffix="host") + y = df[label_name] + X = df.drop(label_name, axis=1) + + # x_train, x_test, y_train, y_test = train_test_split(X, y, 
test_size=0.25, random_state=0) + x_train, x_test, y_train, y_test = X, X, y, y + + # lm = LogisticRegression(max_iter=20) + lm = SGDClassifier(loss="log", **config_param) + lm_fit = lm.fit(x_train, y_train) + y_pred = lm_fit.predict(x_test) + y_prob = lm_fit.predict_proba(x_test)[:, 1] + try: + auc_score = roc_auc_score(y_test, y_prob) + except BaseException: + print(f"no auc score available") + return + recall = recall_score(y_test, y_pred, average="macro") + pr = precision_score(y_test, y_pred, average="macro") + acc = accuracy_score(y_test, y_pred) + # y_predict_proba = est.predict_proba(X_test)[:, 1] + fpr, tpr, thresholds = roc_curve(y_test, y_prob) + + ks = max(tpr - fpr) + result = {"auc": auc_score, "recall": recall, "precision": pr, "accuracy": acc} + print(result) + print(f"coef_: {lm_fit.coef_}, intercept_: {lm_fit.intercept_}, n_iter: {lm_fit.n_iter_}") + return {}, result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("BENCHMARK-QUALITY SKLEARN JOB") + parser.add_argument("-p", "--param", type=str, default="./breast_config.yaml", + help="config file for params") + args = parser.parse_args() + main(param=args.param) diff --git a/examples/benchmark_quality/sklearn-lr-multi.py b/examples/benchmark_quality/sklearn-lr-multi.py new file mode 100644 index 0000000000..0b33e57c8f --- /dev/null +++ b/examples/benchmark_quality/sklearn-lr-multi.py @@ -0,0 +1,79 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import os + +import pandas +from pipeline.utils.tools import JobConfig +from sklearn.linear_model import SGDClassifier +from sklearn.metrics import precision_score, accuracy_score, recall_score + + +def main(config="../../config.yaml", param="./vehicle_config.yaml"): + # obtain config + if isinstance(param, str): + param = JobConfig.load_from_file(param) + assert isinstance(param, dict) + data_guest = param["data_guest"] + data_host = param["data_host"] + + idx = param["idx"] + label_name = param["label_name"] + + if isinstance(config, str): + config = JobConfig.load_from_file(config) + data_base_dir = config["data_base_dir"] + else: + data_base_dir = config.data_base_dir + + config_param = { + "penalty": param["penalty"], + "max_iter": param["max_iter"], + "alpha": param["alpha"], + "learning_rate": "optimal", + "eta0": param["learning_rate"], + "random_state": 105 + } + + # prepare data + df_guest = pandas.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) + df_host = pandas.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) + + df = df_guest.join(df_host, rsuffix="host") + y = df[label_name] + X = df.drop(label_name, axis=1) + # lm = LogisticRegression(max_iter=20) + lm = SGDClassifier(loss="log", **config_param, shuffle=False) + lm_fit = lm.fit(X, y) + y_pred = lm_fit.predict(X) + + recall = recall_score(y, y_pred, average="macro") + pr = precision_score(y, y_pred, average="macro") + acc = accuracy_score(y, y_pred) + + result = {"accuracy": acc} + print(result) + return {}, result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("BENCHMARK-QUALITY SKLEARN JOB") + parser.add_argument("-param", type=str, + help="config file for params") + args = parser.parse_args() + if args.param is not None: + main(param=args.param) diff --git a/examples/benchmark_quality/vehicle_config.yaml 
b/examples/benchmark_quality/vehicle_config.yaml new file mode 100644 index 0000000000..29dceb4345 --- /dev/null +++ b/examples/benchmark_quality/vehicle_config.yaml @@ -0,0 +1,12 @@ +data_guest: "examples/data/vehicle_scale_hetero_guest.csv" +data_host: "examples/data/vehicle_scale_hetero_host.csv" +idx: "id" +label_name: "y" +penalty: "L2" +max_iter: 20 +alpha: 0.00001 +learning_rate: 0.3 +optimizer: "adam" +batch_size: 16 +early_stop: "diff" +init_method: "random_uniform" \ No newline at end of file diff --git a/examples/benchmark_quality/vehicle_lr_sklearn_config.yaml b/examples/benchmark_quality/vehicle_lr_sklearn_config.yaml new file mode 100644 index 0000000000..f70fdb409a --- /dev/null +++ b/examples/benchmark_quality/vehicle_lr_sklearn_config.yaml @@ -0,0 +1,12 @@ +data_guest: "examples/data/vehicle_scale_hetero_guest.csv" +data_host: "examples/data/vehicle_scale_hetero_host.csv" +idx: "id" +label_name: "y" +penalty: "L2" +max_iter: 30 +alpha: 0.001 +learning_rate: 0.15 +optimizer: "rmsprop" +batch_size: -1 +early_stop: "diff" +init_method: "zeros" \ No newline at end of file diff --git a/python/fate_test/fate_test/_parser.py b/python/fate_test/fate_test/_parser.py index 9f411c948e..7d2e898382 100644 --- a/python/fate_test/fate_test/_parser.py +++ b/python/fate_test/fate_test/_parser.py @@ -55,9 +55,9 @@ def _chain_hooks(hook_funcs, d): return d -DATA_JSON_HOOK = chain_hook() -CONF_JSON_HOOK = chain_hook() -DSL_JSON_HOOK = chain_hook() +DATA_LOAD_HOOK = chain_hook() +CONF_LOAD_HOOK = chain_hook() +DSL_LOAD_HOOK = chain_hook() class Data(object): @@ -131,13 +131,13 @@ def __init__( @staticmethod def load(path: Path, provider): with path.open("r") as f: - # testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) + # testsuite_config = json.load(f, object_hook=DATA_LOAD_HOOK.hook) testsuite_config = yaml.safe_load(f) - # testsuite_config = DATA_JSON_HOOK.hook(testsuite_config) + # testsuite_config = DATA_LOAD_HOOK.hook(testsuite_config) dataset = [] 
for d in testsuite_config.get("data"): - d = DATA_JSON_HOOK.hook(d) + d = DATA_LOAD_HOOK.hook(d) """if "use_local_data" not in d: d.update({"use_local_data": _config.use_local_data})""" dataset.append(Data.load(d, path)) @@ -290,7 +290,7 @@ def load(path: Path): dataset = [] for d in testsuite_config.get("data"): - d = DATA_JSON_HOOK.hook(d) + d = DATA_LOAD_HOOK.hook(d) dataset.append(Data.load(d, path)) pairs = [] @@ -367,16 +367,3 @@ def _hook(d): return d return _hook - - -"""class JsonParamType(click.ParamType): - name = "json_string" - - def convert(self, value, param, ctx): - try: - return json.loads(value) - except ValueError: - self.fail(f"{value} is not a valid json string", param, ctx) - - -JSON_STRING = JsonParamType()""" diff --git a/python/fate_test/fate_test/scripts/_utils.py b/python/fate_test/fate_test/scripts/_utils.py index cd3a04e5a0..8445e55bc1 100644 --- a/python/fate_test/fate_test/scripts/_utils.py +++ b/python/fate_test/fate_test/scripts/_utils.py @@ -10,7 +10,7 @@ from fate_test._config import Config from fate_test._flow_client import DataProgress, UploadDataResponse, QueryJobResponse from fate_test._io import echo, LOGGER, set_logger -from fate_test._parser import Testsuite, BenchmarkSuite, DATA_JSON_HOOK, CONF_JSON_HOOK, DSL_JSON_HOOK +from fate_test._parser import Testsuite, BenchmarkSuite, DATA_LOAD_HOOK, CONF_LOAD_HOOK, DSL_LOAD_HOOK def _big_data_task(includes, guest_data_size, host_data_size, guest_feature_num, host_feature_num, host_data_type, @@ -179,11 +179,11 @@ def _set_namespace(data_namespace_mangling, namespace): if data_namespace_mangling: echo.echo(f"add data_namespace_mangling: _{namespace}") - DATA_JSON_HOOK.add_extend_namespace_hook(namespace) - CONF_JSON_HOOK.add_extend_namespace_hook(namespace) + DATA_LOAD_HOOK.add_extend_namespace_hook(namespace) + CONF_LOAD_HOOK.add_extend_namespace_hook(namespace) def _add_replace_hook(replace): - DATA_JSON_HOOK.add_replace_hook(replace) - CONF_JSON_HOOK.add_replace_hook(replace) - 
DSL_JSON_HOOK.add_replace_hook(replace) + DATA_LOAD_HOOK.add_replace_hook(replace) + CONF_LOAD_HOOK.add_replace_hook(replace) + DSL_LOAD_HOOK.add_replace_hook(replace) From 36ff38861a35a00d2fff7bcd2eb181a386704385 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 28 Jul 2023 15:14:01 +0800 Subject: [PATCH 05/30] replace json with yaml(#5008) Signed-off-by: Yu Wu --- python/fate_test/fate_test/utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python/fate_test/fate_test/utils.py b/python/fate_test/fate_test/utils.py index f33d7af74c..443644b223 100644 --- a/python/fate_test/fate_test/utils.py +++ b/python/fate_test/fate_test/utils.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import json + import math import os @@ -21,6 +21,7 @@ from colorama import init, deinit, Fore, Style from fate_test._io import echo from prettytable import PrettyTable, ORGMODE +from ruamel import yaml SCRIPT_METRICS = "script_metrics" DISTRIBUTION_METRICS = "distribution_metrics" @@ -253,10 +254,10 @@ def class_group(class_dict): return metric history_info_dir = "/".join([os.path.join(os.path.abspath(cache_directory), 'benchmark_history', - "benchmark_quality.json")]) + "benchmark_quality.yaml")]) assert os.path.exists(history_info_dir), f"Please check the {history_info_dir} Is it deleted" with open(history_info_dir, 'r') as f: - benchmark_quality = json.load(f, object_hook=dict) + benchmark_quality = yaml.safe_load(f) regression_metric = {} regression_quality = {} class_quality = {} @@ -299,11 +300,11 @@ def metric_compare(abs_tol, rel_tol, match_details, **metric_results): def _save_quality(storage_tag, cache_directory, **results): - save_dir = "/".join([os.path.join(os.path.abspath(cache_directory), 'benchmark_history', "benchmark_quality.json")]) + save_dir = "/".join([os.path.join(os.path.abspath(cache_directory), 'benchmark_history', "benchmark_quality.yaml")]) 
os.makedirs(os.path.dirname(save_dir), exist_ok=True) if os.path.exists(save_dir): with open(save_dir, 'r') as f: - benchmark_quality = json.load(f, object_hook=dict) + benchmark_quality = yaml.safe_load(f, object_hook=dict) else: benchmark_quality = {} if storage_tag in benchmark_quality: @@ -311,7 +312,7 @@ def _save_quality(storage_tag, cache_directory, **results): benchmark_quality.update({storage_tag: results}) try: with open(save_dir, 'w') as fp: - json.dump(benchmark_quality, fp, indent=2) + yaml.dump(benchmark_quality, fp) print("Storage success, please check: ", save_dir) except Exception: print("Storage failed, please check: ", save_dir) From 0f61d905c6bd9cf206e53b7776e5088c084db32b Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 31 Jul 2023 15:42:47 +0800 Subject: [PATCH 06/30] edit examples Signed-off-by: Yu Wu --- examples/benchmark_quality/pipeline-lr-binary.py | 9 +-------- examples/benchmark_quality/pipeline-lr-multi.py | 6 +----- python/fate_test/fate_test/utils.py | 2 +- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/examples/benchmark_quality/pipeline-lr-binary.py b/examples/benchmark_quality/pipeline-lr-binary.py index 7fa1f786ba..8f8e9c897a 100644 --- a/examples/benchmark_quality/pipeline-lr-binary.py +++ b/examples/benchmark_quality/pipeline-lr-binary.py @@ -22,7 +22,6 @@ from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils from fate_test.utils import extract_data, parse_summary_result -from federatedml.evaluation.metrics import classification_metric def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): @@ -109,13 +108,7 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): lr_1_label = extract_data(lr_1_data, "label") lr_0_score_label = extract_data(lr_0_data, "predict_result", keep_id=True) lr_1_score_label = extract_data(lr_1_data, "predict_result", keep_id=True) - result_summary = 
parse_summary_result(pipeline.get_component("evaluation_0").get_summary()) - metric_lr = { - "score_diversity_ratio": classification_metric.Distribution.compute(lr_0_score_label, lr_1_score_label), - "ks_2samp": classification_metric.KSTest.compute(lr_0_score, lr_1_score), - "mAP_D_value": classification_metric.AveragePrecisionScore().compute(lr_0_score, lr_1_score, lr_0_label, - lr_1_label)} - result_summary["distribution_metrics"] = {"hetero_lr": metric_lr} + result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_metric()) data_summary = {"train": {"guest": guest_train_data["name"], "host": host_train_data["name"]}, "test": {"guest": guest_train_data["name"], "host": host_train_data["name"]} diff --git a/examples/benchmark_quality/pipeline-lr-multi.py b/examples/benchmark_quality/pipeline-lr-multi.py index f774515dc1..384e03bd7c 100644 --- a/examples/benchmark_quality/pipeline-lr-multi.py +++ b/examples/benchmark_quality/pipeline-lr-multi.py @@ -22,7 +22,6 @@ from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils from fate_test.utils import extract_data, parse_summary_result -from federatedml.evaluation.metrics import classification_metric def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): @@ -89,12 +88,9 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): lr_0_data = pipeline.get_component("lr_0").get_output_data() lr_1_data = pipeline.get_component("lr_1").get_output_data() - result_summary = parse_summary_result(pipeline.get_component("evaluation_0").get_summary()) + result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_metric()) lr_0_score_label = extract_data(lr_0_data, "predict_result", keep_id=True) lr_1_score_label = extract_data(lr_1_data, "predict_result", keep_id=True) - metric_lr = { - "score_diversity_ratio": classification_metric.Distribution.compute(lr_0_score_label, 
lr_1_score_label)} - result_summary["distribution_metrics"] = {"hetero_lr": metric_lr} data_summary = {"train": {"guest": guest_train_data["name"], "host": host_train_data["name"]}, "test": {"guest": guest_train_data["name"], "host": host_train_data["name"]} diff --git a/python/fate_test/fate_test/utils.py b/python/fate_test/fate_test/utils.py index 443644b223..d31b07bfb3 100644 --- a/python/fate_test/fate_test/utils.py +++ b/python/fate_test/fate_test/utils.py @@ -304,7 +304,7 @@ def _save_quality(storage_tag, cache_directory, **results): os.makedirs(os.path.dirname(save_dir), exist_ok=True) if os.path.exists(save_dir): with open(save_dir, 'r') as f: - benchmark_quality = yaml.safe_load(f, object_hook=dict) + benchmark_quality = yaml.safe_load(f) else: benchmark_quality = {} if storage_tag in benchmark_quality: From a685faaa9975ae4929af4b8025286f0df49461e3 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 31 Jul 2023 17:55:03 +0800 Subject: [PATCH 07/30] edit fate-test bq & examples(#5008) Signed-off-by: Yu Wu --- .../{ => lr}/breast_config.yaml | 5 ++++- .../{ => lr}/default_credit_config.yaml | 5 ++++- .../{ => lr}/epsilon_5k_config.yaml | 5 ++++- .../{ => lr}/give_credit_config.yaml | 5 ++++- .../{ => lr}/lr_benchmark.yaml | 0 .../{ => lr}/pipeline-lr-binary.py | 10 +++++----- .../{ => lr}/pipeline-lr-multi.py | 7 +++---- .../{ => lr}/sklearn-lr-binary.py | 0 .../{ => lr}/sklearn-lr-multi.py | 0 .../benchmark_quality/lr/vehicle_config.yaml | 19 +++++++++++++++++++ .../lr/vehicle_lr_sklearn_config.yaml | 19 +++++++++++++++++++ .../benchmark_quality/vehicle_config.yaml | 12 ------------ .../vehicle_lr_sklearn_config.yaml | 12 ------------ examples/config.yaml | 2 -- examples/pipeline/test_upload_sid.py | 16 ++++++++-------- .../fate_test/fate_test/scripts/_options.py | 6 +++--- .../fate_test/scripts/benchmark_cli.py | 11 +++++++---- .../fate_test/scripts/testsuite_cli.py | 4 ++-- 18 files changed, 82 insertions(+), 56 deletions(-) rename 
examples/benchmark_quality/{ => lr}/breast_config.yaml (82%) rename examples/benchmark_quality/{ => lr}/default_credit_config.yaml (83%) rename examples/benchmark_quality/{ => lr}/epsilon_5k_config.yaml (82%) rename examples/benchmark_quality/{ => lr}/give_credit_config.yaml (82%) rename examples/benchmark_quality/{ => lr}/lr_benchmark.yaml (100%) rename examples/benchmark_quality/{ => lr}/pipeline-lr-binary.py (95%) rename examples/benchmark_quality/{ => lr}/pipeline-lr-multi.py (95%) rename examples/benchmark_quality/{ => lr}/sklearn-lr-binary.py (100%) rename examples/benchmark_quality/{ => lr}/sklearn-lr-multi.py (100%) create mode 100644 examples/benchmark_quality/lr/vehicle_config.yaml create mode 100644 examples/benchmark_quality/lr/vehicle_lr_sklearn_config.yaml delete mode 100644 examples/benchmark_quality/vehicle_config.yaml delete mode 100644 examples/benchmark_quality/vehicle_lr_sklearn_config.yaml diff --git a/examples/benchmark_quality/breast_config.yaml b/examples/benchmark_quality/lr/breast_config.yaml similarity index 82% rename from examples/benchmark_quality/breast_config.yaml rename to examples/benchmark_quality/lr/breast_config.yaml index 00090b4c16..46a52cd575 100644 --- a/examples/benchmark_quality/breast_config.yaml +++ b/examples/benchmark_quality/lr/breast_config.yaml @@ -2,8 +2,10 @@ data_guest: "examples/data/breast_hetero_guest.csv" data_host: "examples/data/breast_hetero_host.csv" idx: "id" label_name: "y" -penalty: "L2" epochs: 30 +init_param: + fit_intercept: True + method: "zeros" learning_rate_scheduler: method: "constant" scheduler_params: @@ -12,5 +14,6 @@ learning_rate_scheduler: total_iters: 100 optimizer: method: "rmsprop" + penalty: "L2" batch_size: 5000 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/default_credit_config.yaml b/examples/benchmark_quality/lr/default_credit_config.yaml similarity index 83% rename from examples/benchmark_quality/default_credit_config.yaml rename to 
examples/benchmark_quality/lr/default_credit_config.yaml index f86a48e834..a6c833bc1d 100644 --- a/examples/benchmark_quality/default_credit_config.yaml +++ b/examples/benchmark_quality/lr/default_credit_config.yaml @@ -2,8 +2,10 @@ data_guest: "examples/data/default_credit_hetero_guest.csv" data_host: "examples/data/default_credit_hetero_host.csv" idx: "id" label_name: "y" -penalty: "L2" epochs: 30 +init_param: + fit_intercept: True + method: "zeros" learning_rate_scheduler: method: "constant" scheduler_params: @@ -12,5 +14,6 @@ learning_rate_scheduler: total_iters: 100 optimizer: method: "zeros" + penalty: "L2" batch_size: 500 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/epsilon_5k_config.yaml b/examples/benchmark_quality/lr/epsilon_5k_config.yaml similarity index 82% rename from examples/benchmark_quality/epsilon_5k_config.yaml rename to examples/benchmark_quality/lr/epsilon_5k_config.yaml index be63b9b414..fdc50ec717 100644 --- a/examples/benchmark_quality/epsilon_5k_config.yaml +++ b/examples/benchmark_quality/lr/epsilon_5k_config.yaml @@ -2,8 +2,10 @@ data_guest: "examples/data/epsilon_5k_hetero_guest.csv" data_host: "examples/data/epsilon_5k_hetero_host.csv" idx: "id" label_name: "y" -penalty: "L2" epochs: 30 +init_param: + fit_intercept: True + method: "zeros" learning_rate_scheduler: method: "constant" scheduler_params: @@ -12,5 +14,6 @@ learning_rate_scheduler: total_iters: 800 optimizer: method: "rmsprop" + penalty: "L2" batch_size: 5000 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/give_credit_config.yaml b/examples/benchmark_quality/lr/give_credit_config.yaml similarity index 82% rename from examples/benchmark_quality/give_credit_config.yaml rename to examples/benchmark_quality/lr/give_credit_config.yaml index bc2b6a683f..d1ba9f48f7 100644 --- a/examples/benchmark_quality/give_credit_config.yaml +++ b/examples/benchmark_quality/lr/give_credit_config.yaml @@ -2,8 +2,10 @@ 
data_guest: "examples/data/give_credit_hetero_guest.csv" data_host: "examples/data/give_credit_hetero_host.csv" idx: "id" label_name: "y" -penalty: "L2" epochs: 6 +init_param: + fit_intercept: True + method: "zeros" learning_rate_scheduler: method: "constant" scheduler_params: @@ -12,5 +14,6 @@ learning_rate_scheduler: total_iters: 100 optimizer: method: "adam" + penalty: "L2" batch_size: 550 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr_benchmark.yaml b/examples/benchmark_quality/lr/lr_benchmark.yaml similarity index 100% rename from examples/benchmark_quality/lr_benchmark.yaml rename to examples/benchmark_quality/lr/lr_benchmark.yaml diff --git a/examples/benchmark_quality/pipeline-lr-binary.py b/examples/benchmark_quality/lr/pipeline-lr-binary.py similarity index 95% rename from examples/benchmark_quality/pipeline-lr-binary.py rename to examples/benchmark_quality/lr/pipeline-lr-binary.py index 8f8e9c897a..4b6c003060 100644 --- a/examples/benchmark_quality/pipeline-lr-binary.py +++ b/examples/benchmark_quality/lr/pipeline-lr-binary.py @@ -68,14 +68,13 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): } config_param = { - "penalty": param["penalty"], "epochs": param["epochs"], "learning_rate_scheduler": param["learning_rate_scheduler"], "optimizer": param["optimizer"], "batch_size": param["batch_size"], - "early_stop": "diff", - "tol": 1e-5, - "init_param": param.get("init_method", {"method": "zeros"}) + "early_stop": param["early_stop"], + "init_param": param["init_param"], + "tol": 1e-5 } lr_param.update(config_param) lr_0 = CoordinatedLR("lr_0", @@ -83,7 +82,7 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): **config_param) lr_1 = CoordinatedLR("lr_1", test_data=intersect_0.outputs["output_data"], - input_model=lr_0.outputs["train_output_model"]) + input_model=lr_0.outputs["output_model"]) evaluation_0 = Evaluation("evaluation_0", label_column_name="y", @@ 
-109,6 +108,7 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): lr_0_score_label = extract_data(lr_0_data, "predict_result", keep_id=True) lr_1_score_label = extract_data(lr_1_data, "predict_result", keep_id=True) result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_metric()) + print(f"result_summary") data_summary = {"train": {"guest": guest_train_data["name"], "host": host_train_data["name"]}, "test": {"guest": guest_train_data["name"], "host": host_train_data["name"]} diff --git a/examples/benchmark_quality/pipeline-lr-multi.py b/examples/benchmark_quality/lr/pipeline-lr-multi.py similarity index 95% rename from examples/benchmark_quality/pipeline-lr-multi.py rename to examples/benchmark_quality/lr/pipeline-lr-multi.py index 384e03bd7c..5d504c19c3 100644 --- a/examples/benchmark_quality/pipeline-lr-multi.py +++ b/examples/benchmark_quality/lr/pipeline-lr-multi.py @@ -58,14 +58,13 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): } config_param = { - "penalty": param["penalty"], "epochs": param["epochs"], "learning_rate_scheduler": param["learning_rate_scheduler"], "optimizer": param["optimizer"], "batch_size": param["batch_size"], - "early_stop": "diff", + "early_stop": param["early_stop"], + "init_param": param["init_param"], "tol": 1e-5, - "init_param": param.get("init_method", {"method": "zeros"}) } lr_param.update(config_param) lr_0 = CoordinatedLR("lr_0", @@ -73,7 +72,7 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): **config_param) lr_1 = CoordinatedLR("lr_1", test_data=intersect_0.outputs["output_data"], - input_model=lr_0.outputs["train_output_model"]) + input_model=lr_0.outputs["output_model"]) evaluation_0 = Evaluation('evaluation_0', default_eval_setting="multi") pipeline.add_task(intersect_0) diff --git a/examples/benchmark_quality/sklearn-lr-binary.py b/examples/benchmark_quality/lr/sklearn-lr-binary.py similarity index 100% 
rename from examples/benchmark_quality/sklearn-lr-binary.py rename to examples/benchmark_quality/lr/sklearn-lr-binary.py diff --git a/examples/benchmark_quality/sklearn-lr-multi.py b/examples/benchmark_quality/lr/sklearn-lr-multi.py similarity index 100% rename from examples/benchmark_quality/sklearn-lr-multi.py rename to examples/benchmark_quality/lr/sklearn-lr-multi.py diff --git a/examples/benchmark_quality/lr/vehicle_config.yaml b/examples/benchmark_quality/lr/vehicle_config.yaml new file mode 100644 index 0000000000..9312bf7a07 --- /dev/null +++ b/examples/benchmark_quality/lr/vehicle_config.yaml @@ -0,0 +1,19 @@ +data_guest: "examples/data/vehicle_scale_hetero_guest.csv" +data_host: "examples/data/vehicle_scale_hetero_host.csv" +idx: "id" +label_name: "y" +epochs: 20 +init_param: + fit_intercept: True + method: "zeros" +learning_rate_scheduler: + method: "constant" + scheduler_params: + lr: 0.3 + factor: 1.0 + total_iters: 800 +optimizer: + method: "adam" + penalty: "L2" +batch_size: 16 +early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/vehicle_lr_sklearn_config.yaml b/examples/benchmark_quality/lr/vehicle_lr_sklearn_config.yaml new file mode 100644 index 0000000000..0d9bda1717 --- /dev/null +++ b/examples/benchmark_quality/lr/vehicle_lr_sklearn_config.yaml @@ -0,0 +1,19 @@ +data_guest: "examples/data/vehicle_scale_hetero_guest.csv" +data_host: "examples/data/vehicle_scale_hetero_host.csv" +idx: "id" +label_name: "y" +epochs: 30 +init_param: + fit_intercept: True + method: "zeros" +learning_rate_scheduler: + method: "constant" + scheduler_params: + lr: 0.15 + factor: 1.0 + total_iters: 800' +optimizer: + method: "rmsprop" + penalty: "L2" +batch_size: None +early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/vehicle_config.yaml b/examples/benchmark_quality/vehicle_config.yaml deleted file mode 100644 index 29dceb4345..0000000000 --- a/examples/benchmark_quality/vehicle_config.yaml +++ 
/dev/null @@ -1,12 +0,0 @@ -data_guest: "examples/data/vehicle_scale_hetero_guest.csv" -data_host: "examples/data/vehicle_scale_hetero_host.csv" -idx: "id" -label_name: "y" -penalty: "L2" -max_iter: 20 -alpha: 0.00001 -learning_rate: 0.3 -optimizer: "adam" -batch_size: 16 -early_stop: "diff" -init_method: "random_uniform" \ No newline at end of file diff --git a/examples/benchmark_quality/vehicle_lr_sklearn_config.yaml b/examples/benchmark_quality/vehicle_lr_sklearn_config.yaml deleted file mode 100644 index f70fdb409a..0000000000 --- a/examples/benchmark_quality/vehicle_lr_sklearn_config.yaml +++ /dev/null @@ -1,12 +0,0 @@ -data_guest: "examples/data/vehicle_scale_hetero_guest.csv" -data_host: "examples/data/vehicle_scale_hetero_host.csv" -idx: "id" -label_name: "y" -penalty: "L2" -max_iter: 30 -alpha: 0.001 -learning_rate: 0.15 -optimizer: "rmsprop" -batch_size: -1 -early_stop: "diff" -init_method: "zeros" \ No newline at end of file diff --git a/examples/config.yaml b/examples/config.yaml index 08d3da7420..2905ff3ec8 100644 --- a/examples/config.yaml +++ b/examples/config.yaml @@ -7,6 +7,4 @@ parties: # parties default id arbiter: - 10000 -work_mode: 0 # 0 for standalone, or 1 for cluster - data_base_dir: "/data/projects/fate" # path to project base where data is located \ No newline at end of file diff --git a/examples/pipeline/test_upload_sid.py b/examples/pipeline/test_upload_sid.py index dfc82d1b18..d65e696fc3 100644 --- a/examples/pipeline/test_upload_sid.py +++ b/examples/pipeline/test_upload_sid.py @@ -31,10 +31,10 @@ 'tag_with_value': False, 'weight_type': 'float64'} -pipeline.transform_local_file_to_dataframe("/Users/yuwu/PycharmProjects/FATE/examples/data/breast_hetero_guest_sid.csv", - meta=meta, head=True, extend_sid=False, - namespace="experiment_sid", - name="breast_hetero_guest") +pipeline.transform_local_file_to_dataframe( # file="${abs_path_of_data_guest}", + meta=meta, head=True, extend_sid=False, + namespace="experiment_sid", + 
name="breast_hetero_guest") meta = {'delimiter': ',', 'dtype': 'float64', @@ -47,7 +47,7 @@ 'tag_with_value': False, 'weight_type': 'float64'} -pipeline.transform_local_file_to_dataframe("/Users/yuwu/PycharmProjects/FATE/examples/data/breast_hetero_host_sid.csv", - meta=meta, head=True, extend_sid=False, - namespace="experiment_sid", - name="breast_hetero_host") +pipeline.transform_local_file_to_dataframe( # file="${abs_path_of_data_host}", + meta=meta, head=True, extend_sid=False, + namespace="experiment_sid", + name="breast_hetero_host") diff --git a/python/fate_test/fate_test/scripts/_options.py b/python/fate_test/fate_test/scripts/_options.py index ae30f748a0..364c43a264 100644 --- a/python/fate_test/fate_test/scripts/_options.py +++ b/python/fate_test/fate_test/scripts/_options.py @@ -18,9 +18,9 @@ class SharedOptions(object): False), "yes": (('-y', '--yes',), dict(type=bool, is_flag=True, help="Skip double check", default=None), False), - # "extend_sid": (('--extend_sid',), - # dict(type=bool, is_flag=True, help="whether to append uuid as sid when uploading data", - # default=None), None), + "extend_sid": (('--extend_sid',), + dict(type=bool, is_flag=True, help="whether to append uuid as sid when uploading data", + default=None), None), # "auto_increasing_sid": (('--auto_increasing_sid',), # dict(type=bool, is_flag=True, help="whether to generate sid value starting at 0", # default=None), None), diff --git a/python/fate_test/fate_test/scripts/benchmark_cli.py b/python/fate_test/fate_test/scripts/benchmark_cli.py index d9f82d4139..d66a61058d 100644 --- a/python/fate_test/fate_test/scripts/benchmark_cli.py +++ b/python/fate_test/fate_test/scripts/benchmark_cli.py @@ -50,8 +50,9 @@ def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, stora config_inst = ctx.obj["config"] if ctx.obj["extend_sid"] is not None: config_inst.extend_sid = ctx.obj["extend_sid"] - if ctx.obj["auto_increasing_sid"] is not None: - config_inst.auto_increasing_sid = 
ctx.obj["auto_increasing_sid"] + + """if ctx.obj["auto_increasing_sid"] is not None: + config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" if clean_data is None: clean_data = config_inst.clean_data data_namespace_mangling = ctx.obj["namespace_mangling"] @@ -61,13 +62,15 @@ def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, stora echo.echo(f"testsuite namespace: {namespace}", fg='red') echo.echo("loading testsuites:") suites = _load_testsuites(includes=include, excludes=exclude, glob=glob, - suffix="benchmark.json", suite_type="benchmark") + suffix="benchmark.yaml", suite_type="benchmark") for suite in suites: echo.echo(f"\tdataset({len(suite.dataset)}) benchmark groups({len(suite.pairs)}) {suite.path}") if not yes and not click.confirm("running?"): return client = Clients(config_inst) - fate_version = client["guest_0"].get_version() + # @todo: get version + # fate_version = client["guest_0"].get_version() + fate_version = "2.0.0-beta" for i, suite in enumerate(suites): # noinspection PyBroadException try: diff --git a/python/fate_test/fate_test/scripts/testsuite_cli.py b/python/fate_test/fate_test/scripts/testsuite_cli.py index f308ad1674..3bc295bfe1 100644 --- a/python/fate_test/fate_test/scripts/testsuite_cli.py +++ b/python/fate_test/fate_test/scripts/testsuite_cli.py @@ -64,9 +64,9 @@ def run_suite(ctx, include, exclude, glob, ctx.obj.update(**kwargs) ctx.obj.post_process() config_inst = ctx.obj["config"] - """if ctx.obj["extend_sid"] is not None: + if ctx.obj["extend_sid"] is not None: config_inst.extend_sid = ctx.obj["extend_sid"] - if ctx.obj["auto_increasing_sid"] is not None: + """if ctx.obj["auto_increasing_sid"] is not None: config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" if clean_data is None: clean_data = config_inst.clean_data From e9a2a7468da3692c248024b720e4fa9432902414 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 1 Aug 2023 16:21:52 +0800 Subject: [PATCH 08/30] edit fate-test bq & 
examples(#5008) Signed-off-by: Yu Wu --- examples/benchmark_quality/lr/breast_config.yaml | 4 +++- .../lr/breast_lr_sklearn_config.yaml | 12 ++++++++++++ .../benchmark_quality/lr/default_credit_config.yaml | 3 ++- .../benchmark_quality/lr/epsilon_5k_config.yaml | 3 ++- .../benchmark_quality/lr/give_credit_config.yaml | 3 ++- examples/benchmark_quality/lr/lr_benchmark.yaml | 2 +- examples/benchmark_quality/lr/pipeline-lr-binary.py | 13 ++++++++----- examples/benchmark_quality/lr/pipeline-lr-multi.py | 6 +++--- examples/benchmark_quality/lr/sklearn-lr-binary.py | 8 ++++---- examples/benchmark_quality/lr/sklearn-lr-multi.py | 4 ++-- examples/config.yaml | 2 +- python/fate_test/fate_test/_flow_client.py | 12 ++++++++++++ 12 files changed, 52 insertions(+), 20 deletions(-) create mode 100644 examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml diff --git a/examples/benchmark_quality/lr/breast_config.yaml b/examples/benchmark_quality/lr/breast_config.yaml index 46a52cd575..80bc467254 100644 --- a/examples/benchmark_quality/lr/breast_config.yaml +++ b/examples/benchmark_quality/lr/breast_config.yaml @@ -9,11 +9,13 @@ init_param: learning_rate_scheduler: method: "constant" scheduler_params: - lr: 0.15 factor: 1.0 total_iters: 100 optimizer: method: "rmsprop" penalty: "L2" + optimizer_params: + lr: 0.1 + alpha: 0.5 batch_size: 5000 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml b/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml new file mode 100644 index 0000000000..02483f1f61 --- /dev/null +++ b/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml @@ -0,0 +1,12 @@ +data_guest: "examples/data/breast_hetero_guest.csv" +data_host: "examples/data/breast_hetero_host.csv" +idx: "id" +label_name: "y" +epochs: 30 +fit_intercept: True +method: "rmsprop" +penalty: "L2" +eta0: 0.1 +alpha: 0.5 +batch_size: 5000 +early_stop: "diff" \ No newline at end of file diff --git 
a/examples/benchmark_quality/lr/default_credit_config.yaml b/examples/benchmark_quality/lr/default_credit_config.yaml index a6c833bc1d..b143418832 100644 --- a/examples/benchmark_quality/lr/default_credit_config.yaml +++ b/examples/benchmark_quality/lr/default_credit_config.yaml @@ -9,11 +9,12 @@ init_param: learning_rate_scheduler: method: "constant" scheduler_params: - lr: 0.15 factor: 1.0 total_iters: 100 optimizer: method: "zeros" penalty: "L2" + optimizer_params: + lr: 0.15 batch_size: 500 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/epsilon_5k_config.yaml b/examples/benchmark_quality/lr/epsilon_5k_config.yaml index fdc50ec717..232b830d6c 100644 --- a/examples/benchmark_quality/lr/epsilon_5k_config.yaml +++ b/examples/benchmark_quality/lr/epsilon_5k_config.yaml @@ -9,11 +9,12 @@ init_param: learning_rate_scheduler: method: "constant" scheduler_params: - lr: 0.15 factor: 1.0 total_iters: 800 optimizer: method: "rmsprop" penalty: "L2" + optimizer_params: + lr: 0.15 batch_size: 5000 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/give_credit_config.yaml b/examples/benchmark_quality/lr/give_credit_config.yaml index d1ba9f48f7..f6971ec107 100644 --- a/examples/benchmark_quality/lr/give_credit_config.yaml +++ b/examples/benchmark_quality/lr/give_credit_config.yaml @@ -9,11 +9,12 @@ init_param: learning_rate_scheduler: method: "constant" scheduler_params: - lr: 0.15 factor: 1.0 total_iters: 100 optimizer: method: "adam" penalty: "L2" + optimizer_params: + lr: 0.15 batch_size: 550 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/lr_benchmark.yaml b/examples/benchmark_quality/lr/lr_benchmark.yaml index dad81264ed..692e74db2d 100644 --- a/examples/benchmark_quality/lr/lr_benchmark.yaml +++ b/examples/benchmark_quality/lr/lr_benchmark.yaml @@ -118,7 +118,7 @@ data: hetero_lr-binary-0: local: script: "./sklearn-lr-binary.py" - conf: 
"./breast_config.yaml" + conf: "./breast_lr_sklearn_config.yaml" FATE-hetero-lr: script: "./pipeline-lr-binary.py" conf: "./breast_config.yaml" diff --git a/examples/benchmark_quality/lr/pipeline-lr-binary.py b/examples/benchmark_quality/lr/pipeline-lr-binary.py index 4b6c003060..ee397d1e14 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-binary.py +++ b/examples/benchmark_quality/lr/pipeline-lr-binary.py @@ -99,15 +99,18 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): print(pipeline.get_dag()) pipeline.fit() - lr_0_data = pipeline.get_component("lr_0").get_output_data() - lr_1_data = pipeline.get_component("lr_1").get_output_data() + lr_0_data = pipeline.get_task_info("lr_0").get_output_data()["train_output_data"] + lr_1_data = pipeline.get_task_info("lr_1").get_output_data()["test_output_data"] lr_0_score = extract_data(lr_0_data, "predict_result") - lr_0_label = extract_data(lr_0_data, "label") + lr_0_label = extract_data(lr_0_data, "y") lr_1_score = extract_data(lr_1_data, "predict_result") - lr_1_label = extract_data(lr_1_data, "label") + lr_1_label = extract_data(lr_1_data, "y") lr_0_score_label = extract_data(lr_0_data, "predict_result", keep_id=True) lr_1_score_label = extract_data(lr_1_data, "predict_result", keep_id=True) - result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_metric()) + """print(f"evaluation result: {pipeline.get_task_info('evaluation_0').get_output_metric()};" + f"result type: {type(pipeline.get_task_info('evaluation_0').get_output_metric())}") + """ + result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_output_metric()) print(f"result_summary") data_summary = {"train": {"guest": guest_train_data["name"], "host": host_train_data["name"]}, diff --git a/examples/benchmark_quality/lr/pipeline-lr-multi.py b/examples/benchmark_quality/lr/pipeline-lr-multi.py index 5d504c19c3..cc5e9602de 100644 --- 
a/examples/benchmark_quality/lr/pipeline-lr-multi.py +++ b/examples/benchmark_quality/lr/pipeline-lr-multi.py @@ -84,10 +84,10 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): print(pipeline.get_dag()) pipeline.fit() - lr_0_data = pipeline.get_component("lr_0").get_output_data() - lr_1_data = pipeline.get_component("lr_1").get_output_data() + lr_0_data = pipeline.get_component("lr_0").get_output_data()["train_output_data"] + lr_1_data = pipeline.get_component("lr_1").get_output_data()["test_output_data"] - result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_metric()) + result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_output_metric()) lr_0_score_label = extract_data(lr_0_data, "predict_result", keep_id=True) lr_1_score_label = extract_data(lr_1_data, "predict_result", keep_id=True) diff --git a/examples/benchmark_quality/lr/sklearn-lr-binary.py b/examples/benchmark_quality/lr/sklearn-lr-binary.py index 5b17692621..ffdaf2b945 100644 --- a/examples/benchmark_quality/lr/sklearn-lr-binary.py +++ b/examples/benchmark_quality/lr/sklearn-lr-binary.py @@ -18,12 +18,12 @@ import os import pandas -from pipeline.utils.tools import JobConfig +from fate_client.pipeline.utils.test_utils import JobConfig from sklearn.linear_model import SGDClassifier from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, recall_score, roc_curve -def main(config="../../config.yaml", param="./vechile_config.yaml"): +def main(config="../../config.yaml", param="./breast_lr_sklearn_config.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) @@ -45,7 +45,7 @@ def main(config="../../config.yaml", param="./vechile_config.yaml"): "max_iter": 100, "alpha": param["alpha"], "learning_rate": "optimal", - "eta0": param["learning_rate"], + "eta0": param["eta0"], "random_state": 105 } @@ -84,7 +84,7 @@ def main(config="../../config.yaml", 
param="./vechile_config.yaml"): if __name__ == "__main__": parser = argparse.ArgumentParser("BENCHMARK-QUALITY SKLEARN JOB") - parser.add_argument("-p", "--param", type=str, default="./breast_config.yaml", + parser.add_argument("-p", "--param", type=str, default="./breast_lr_sklearn_config.yaml", help="config file for params") args = parser.parse_args() main(param=args.param) diff --git a/examples/benchmark_quality/lr/sklearn-lr-multi.py b/examples/benchmark_quality/lr/sklearn-lr-multi.py index 0b33e57c8f..fb8da4827f 100644 --- a/examples/benchmark_quality/lr/sklearn-lr-multi.py +++ b/examples/benchmark_quality/lr/sklearn-lr-multi.py @@ -18,12 +18,12 @@ import os import pandas -from pipeline.utils.tools import JobConfig +from fate_client.pipeline.utils.test_utils import JobConfig from sklearn.linear_model import SGDClassifier from sklearn.metrics import precision_score, accuracy_score, recall_score -def main(config="../../config.yaml", param="./vechile_config.yaml"): +def main(config="../../config.yaml", param="./vehicle_config.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) diff --git a/examples/config.yaml b/examples/config.yaml index 2905ff3ec8..2eec4548d5 100644 --- a/examples/config.yaml +++ b/examples/config.yaml @@ -7,4 +7,4 @@ parties: # parties default id arbiter: - 10000 -data_base_dir: "/data/projects/fate" # path to project base where data is located \ No newline at end of file +data_base_dir: "/Users/yuwu/PycharmProjects/FATE/" # path to project base where data is located \ No newline at end of file diff --git a/python/fate_test/fate_test/_flow_client.py b/python/fate_test/fate_test/_flow_client.py index 2d0d3f8d98..e27c2098e0 100644 --- a/python/fate_test/fate_test/_flow_client.py +++ b/python/fate_test/fate_test/_flow_client.py @@ -269,6 +269,18 @@ def _query_job(self, job_id, role, party_id): raise RuntimeError(f"get version error: {response}") from e return fate_version""" + def get_version(self): + 
response = self._client.provider.query(name="fate") + try: + retcode = response['code'] + retmsg = response['message'] + if retcode != 0 or retmsg != 'success': + raise RuntimeError(f"get version error: {response}") + fate_version = response["data"]["provider_name"] + except Exception as e: + raise RuntimeError(f"get version error: {response}") from e + return fate_version + def _add_notes(self, job_id, role, party_id, notes): data = dict(job_id=job_id, role=role, party_id=party_id, notes=notes) response = AddNotesResponse(self._post(url='job/update', json=data)) From 444a3f56be23edbb645ad9a52d0e022925e96d84 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 1 Aug 2023 16:23:01 +0800 Subject: [PATCH 09/30] edit fate-test bq & examples(#5008) Signed-off-by: Yu Wu --- python/fate_test/fate_test/scripts/benchmark_cli.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/fate_test/fate_test/scripts/benchmark_cli.py b/python/fate_test/fate_test/scripts/benchmark_cli.py index d66a61058d..79899a8ff8 100644 --- a/python/fate_test/fate_test/scripts/benchmark_cli.py +++ b/python/fate_test/fate_test/scripts/benchmark_cli.py @@ -68,9 +68,7 @@ def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, stora if not yes and not click.confirm("running?"): return client = Clients(config_inst) - # @todo: get version - # fate_version = client["guest_0"].get_version() - fate_version = "2.0.0-beta" + fate_version = client["guest_0"].get_version() for i, suite in enumerate(suites): # noinspection PyBroadException try: From 26da6fbe83ce55877f6037b0946c4aed7bba5f85 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 1 Aug 2023 16:24:01 +0800 Subject: [PATCH 10/30] edit fate-test bq examples(#5008) Signed-off-by: Yu Wu --- examples/benchmark_quality/lr/vehicle_config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/benchmark_quality/lr/vehicle_config.yaml b/examples/benchmark_quality/lr/vehicle_config.yaml index 
9312bf7a07..2cff7f33b5 100644 --- a/examples/benchmark_quality/lr/vehicle_config.yaml +++ b/examples/benchmark_quality/lr/vehicle_config.yaml @@ -9,11 +9,12 @@ init_param: learning_rate_scheduler: method: "constant" scheduler_params: - lr: 0.3 factor: 1.0 total_iters: 800 optimizer: method: "adam" penalty: "L2" + optimizer_params: + lr: 0.3 batch_size: 16 early_stop: "diff" \ No newline at end of file From 1f40c89655be0e3a39f286f175d85c91a7e3559f Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Tue, 1 Aug 2023 19:51:30 +0800 Subject: [PATCH 11/30] edit fate-test cli & examples(#5008) Signed-off-by: Yu Wu --- .../lr/breast_lr_sklearn_config.yaml | 3 +- .../lr/default_credit_lr_config.yaml | 11 ++++++++ .../benchmark_quality/lr/lr_benchmark.yaml | 2 +- .../lr/pipeline-lr-binary.py | 2 +- .../fate_test/scripts/benchmark_cli.py | 4 ++- python/fate_test/fate_test/utils.py | 28 +++++++++++++++++-- 6 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 examples/benchmark_quality/lr/default_credit_lr_config.yaml diff --git a/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml b/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml index 02483f1f61..2993795c78 100644 --- a/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml +++ b/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml @@ -8,5 +8,4 @@ method: "rmsprop" penalty: "L2" eta0: 0.1 alpha: 0.5 -batch_size: 5000 -early_stop: "diff" \ No newline at end of file +batch_size: 5000 \ No newline at end of file diff --git a/examples/benchmark_quality/lr/default_credit_lr_config.yaml b/examples/benchmark_quality/lr/default_credit_lr_config.yaml new file mode 100644 index 0000000000..e1dd4f6932 --- /dev/null +++ b/examples/benchmark_quality/lr/default_credit_lr_config.yaml @@ -0,0 +1,11 @@ +data_guest: "examples/data/default_credit_hetero_guest.csv" +data_host: "examples/data/default_credit_hetero_host.csv" +idx: "id" +label_name: "y" +epochs: 30 +fit_intercept: True +method: "rmsprop" 
+penalty: "L2" +eta0: 0.1 +alpha: 0.5 +batch_size: 5000 \ No newline at end of file diff --git a/examples/benchmark_quality/lr/lr_benchmark.yaml b/examples/benchmark_quality/lr/lr_benchmark.yaml index 692e74db2d..18cf77b34f 100644 --- a/examples/benchmark_quality/lr/lr_benchmark.yaml +++ b/examples/benchmark_quality/lr/lr_benchmark.yaml @@ -130,7 +130,7 @@ hetero_lr-binary-1: conf: "./default_credit_config.yaml" FATE-hetero-lr: script: "./pipeline-lr-binary.py" - conf: "./default_credit_config.yaml" + conf: "./default_credit_sklearn_config.yaml" compare_setting: relative_tol: 0.01 hetero_lr-binary-2: diff --git a/examples/benchmark_quality/lr/pipeline-lr-binary.py b/examples/benchmark_quality/lr/pipeline-lr-binary.py index ee397d1e14..ed53c4091a 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-binary.py +++ b/examples/benchmark_quality/lr/pipeline-lr-binary.py @@ -110,7 +110,7 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): """print(f"evaluation result: {pipeline.get_task_info('evaluation_0').get_output_metric()};" f"result type: {type(pipeline.get_task_info('evaluation_0').get_output_metric())}") """ - result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_output_metric()) + result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_output_metric()[0]["data"]) print(f"result_summary") data_summary = {"train": {"guest": guest_train_data["name"], "host": host_train_data["name"]}, diff --git a/python/fate_test/fate_test/scripts/benchmark_cli.py b/python/fate_test/fate_test/scripts/benchmark_cli.py index 79899a8ff8..a6ba20383f 100644 --- a/python/fate_test/fate_test/scripts/benchmark_cli.py +++ b/python/fate_test/fate_test/scripts/benchmark_cli.py @@ -68,7 +68,9 @@ def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, stora if not yes and not click.confirm("running?"): return client = Clients(config_inst) - fate_version = client["guest_0"].get_version() + # 
@todo: change to client query result + # fate_version = client["guest_0"].get_version() + fate_version = "2.0.0-beta" for i, suite in enumerate(suites): # noinspection PyBroadException try: diff --git a/python/fate_test/fate_test/utils.py b/python/fate_test/fate_test/utils.py index d31b07bfb3..74775354ee 100644 --- a/python/fate_test/fate_test/utils.py +++ b/python/fate_test/fate_test/utils.py @@ -321,10 +321,32 @@ def _save_quality(storage_tag, cache_directory, **results): def parse_summary_result(rs_dict): for model_key in rs_dict: rs_content = rs_dict[model_key] - if 'validate' in rs_content: - return rs_content['validate'] + if 'test_set' in rs_content: + metric_result = rs_content['test_set'] + elif 'validate_set' in rs_content: + metric_result = rs_content['validate_set'] else: - return rs_content['train'] + metric_result = rs_content['train_set'] + return extract_and_flatten_summary_metric(metric_result) + + +def extract_and_flatten_summary_metric(metric_dict_list): + flatten_metric_summary = {} + for metric_group in metric_dict_list: + if isinstance(metric_group, dict): + metric_name = metric_group['metric'] + metric_val = metric_group['val'] + if isinstance(metric_val, float) or isinstance(metric_val, int): + flatten_metric_summary[metric_name] = metric_val + elif isinstance(metric_group, list): + for metric_subset in metric_group: + metric_name = metric_subset['metric'] + metric_val = metric_subset['val'] + if isinstance(metric_val, float) or isinstance(metric_val, int): + flatten_metric_summary[metric_name] = metric_val + else: + raise ValueError(f"Invalid metric group: {metric_group}") + return flatten_metric_summary def extract_data(df, col_name, convert_float=True, keep_id=False): From 9737fec22abefedac7bb62eee10895a4ee302e71 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 2 Aug 2023 10:03:45 +0800 Subject: [PATCH 12/30] edit fate-test examples(#5008) Signed-off-by: Yu Wu --- examples/benchmark_quality/lr/pipeline-lr-binary.py | 2 +-
examples/benchmark_quality/lr/sklearn-lr-binary.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/benchmark_quality/lr/pipeline-lr-binary.py b/examples/benchmark_quality/lr/pipeline-lr-binary.py index ed53c4091a..bd845bc9d9 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-binary.py +++ b/examples/benchmark_quality/lr/pipeline-lr-binary.py @@ -87,7 +87,7 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): evaluation_0 = Evaluation("evaluation_0", label_column_name="y", runtime_roles=["guest"], - default_eval_setting="binary", + metrics=["auc", "binary_precision", "binary_accuracy", "binary_recall"], input_data=lr_0.outputs["train_output_data"]) pipeline.add_task(intersect_0) diff --git a/examples/benchmark_quality/lr/sklearn-lr-binary.py b/examples/benchmark_quality/lr/sklearn-lr-binary.py index ffdaf2b945..603c39d75f 100644 --- a/examples/benchmark_quality/lr/sklearn-lr-binary.py +++ b/examples/benchmark_quality/lr/sklearn-lr-binary.py @@ -76,7 +76,7 @@ def main(config="../../config.yaml", param="./breast_lr_sklearn_config.yaml"): fpr, tpr, thresholds = roc_curve(y_test, y_prob) ks = max(tpr - fpr) - result = {"auc": auc_score, "recall": recall, "precision": pr, "accuracy": acc} + result = {"auc": auc_score, "binary_recall": recall, "binary_precision": pr, "binary_accuracy": acc} print(result) print(f"coef_: {lm_fit.coef_}, intercept_: {lm_fit.intercept_}, n_iter: {lm_fit.n_iter_}") return {}, result From bb9f2b8ba67332ee14ff4bf4869639a7b59ba427 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 4 Aug 2023 19:17:59 +0800 Subject: [PATCH 13/30] add performance cli (#5008) add timeout & task-cores param to bq & testsuite cli(#5008) add examples(#5008) Signed-off-by: Yu Wu --- .../coordinated_lr/breast_config.yaml | 24 ++ .../coordinated_lr/config.yaml | 11 + .../coordinated_lr_performance.yaml | 39 +++ .../coordinated_lr/test_lr_sid.py | 116 +++++++ examples/benchmark_quality/linr/fate-linr.py | 115 
+++++++ .../linr/hetero_linr_benchmark.yaml | 47 +++ .../benchmark_quality/linr/linr_config.yaml | 22 ++ .../linr/linr_sklearn_config.yaml | 11 + .../benchmark_quality/lr/breast_config.yaml | 17 +- .../lr/default_credit_config.yaml | 12 +- ... => default_credit_lr_sklearn_config.yaml} | 0 .../lr/epsilon_5k_config.yaml | 16 +- .../lr/epsilon_5k_lr_sklearn_config.yaml | 11 + .../lr/give_credit_config.yaml | 4 +- .../lr/give_credit_lr_sklearn_config.yaml | 11 + .../benchmark_quality/lr/lr_benchmark.yaml | 184 ++++++---- .../lr/pipeline-lr-binary.py | 31 +- .../benchmark_quality/lr/pipeline-lr-multi.py | 17 +- .../benchmark_quality/lr/sklearn-lr-binary.py | 8 +- .../benchmark_quality/lr/sklearn-lr-multi.py | 15 +- .../benchmark_quality/lr/vehicle_config.yaml | 11 +- .../lr/vehicle_lr_sklearn_config.yaml | 20 +- .../pipeline/coordinated_lr/test_lr_sid.py | 4 + .../pipeline/coordinated_lr/test_lr_sid_cv.py | 4 + .../coordinated_lr/test_lr_sid_warm_start.py | 4 + examples/pipeline/test_linr_sid_warm_start.py | 64 ++-- python/fate_test/fate_test/_config.py | 6 + python/fate_test/fate_test/_flow_client.py | 4 + python/fate_test/fate_test/_parser.py | 42 ++- python/fate_test/fate_test/scripts/_utils.py | 4 +- .../fate_test/scripts/benchmark_cli.py | 17 +- .../fate_test/scripts/performance_cli.py | 315 ++++++------------ .../fate_test/scripts/testsuite_cli.py | 14 +- python/fate_test/fate_test/utils.py | 27 ++ 34 files changed, 857 insertions(+), 390 deletions(-) create mode 100644 examples/benchmark_performance/coordinated_lr/breast_config.yaml create mode 100644 examples/benchmark_performance/coordinated_lr/config.yaml create mode 100644 examples/benchmark_performance/coordinated_lr/coordinated_lr_performance.yaml create mode 100644 examples/benchmark_performance/coordinated_lr/test_lr_sid.py create mode 100644 examples/benchmark_quality/linr/fate-linr.py create mode 100644 examples/benchmark_quality/linr/hetero_linr_benchmark.yaml create mode 100644 
examples/benchmark_quality/linr/linr_config.yaml create mode 100644 examples/benchmark_quality/linr/linr_sklearn_config.yaml rename examples/benchmark_quality/lr/{default_credit_lr_config.yaml => default_credit_lr_sklearn_config.yaml} (100%) create mode 100644 examples/benchmark_quality/lr/epsilon_5k_lr_sklearn_config.yaml create mode 100644 examples/benchmark_quality/lr/give_credit_lr_sklearn_config.yaml diff --git a/examples/benchmark_performance/coordinated_lr/breast_config.yaml b/examples/benchmark_performance/coordinated_lr/breast_config.yaml new file mode 100644 index 0000000000..d827c47236 --- /dev/null +++ b/examples/benchmark_performance/coordinated_lr/breast_config.yaml @@ -0,0 +1,24 @@ +data_guest: "breast_hetero_guest" +data_host: "breast_hetero_host" +idx: "id" +label_name: "y" +epochs: 20 +init_param: + fit_intercept: True + method: "random_uniform" + random_state: 42 +learning_rate_scheduler: + method: "constant" + scheduler_params: + factor: 1.0 + total_iters: 100 +optimizer: + method: "rmsprop" + penalty: "L2" + optimizer_params: + lr: 0.05 + alpha: 0.1 +batch_size: null +early_stop: "diff" +task_cores: 4 +timeout: 3600 \ No newline at end of file diff --git a/examples/benchmark_performance/coordinated_lr/config.yaml b/examples/benchmark_performance/coordinated_lr/config.yaml new file mode 100644 index 0000000000..1c021a7223 --- /dev/null +++ b/examples/benchmark_performance/coordinated_lr/config.yaml @@ -0,0 +1,11 @@ +parties: # parties default id + guest: + - 9999 + host: + - 9998 + - 9999 + arbiter: + - 9998 + +data_base_dir: "" # path to project base where data is located +timeout: 3600 \ No newline at end of file diff --git a/examples/benchmark_performance/coordinated_lr/coordinated_lr_performance.yaml b/examples/benchmark_performance/coordinated_lr/coordinated_lr_performance.yaml new file mode 100644 index 0000000000..81afb73e56 --- /dev/null +++ b/examples/benchmark_performance/coordinated_lr/coordinated_lr_performance.yaml @@ -0,0 +1,39 @@ 
+data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_0 +tasks: + normal-lr: + script: test_lr_sid.py + conf: "./breast_config.yaml" diff --git a/examples/benchmark_performance/coordinated_lr/test_lr_sid.py b/examples/benchmark_performance/coordinated_lr/test_lr_sid.py new file mode 100644 index 0000000000..ebe2b289e0 --- /dev/null +++ b/examples/benchmark_performance/coordinated_lr/test_lr_sid.py @@ -0,0 +1,116 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): + # obtain config + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + if isinstance(param, str): + param = test_utils.JobConfig.load_from_file(param) + + assert isinstance(param, dict) + + data_set = param.get("data_guest").split('/')[-1] + if data_set == "default_credit_hetero_guest.csv": + guest_data_table = 'default_credit_hetero_guest' + host_data_table = 'default_credit_hetero_host' + elif data_set == 'breast_hetero_guest.csv': + guest_data_table = 'breast_hetero_guest' + host_data_table = 'breast_hetero_host' + elif data_set == 'give_credit_hetero_guest.csv': + guest_data_table = 'give_credit_hetero_guest' + host_data_table = 'give_credit_hetero_host' + elif data_set == 'epsilon_5k_hetero_guest.csv': + guest_data_table = 'epsilon_5k_hetero_guest' + host_data_table = 'epsilon_5k_hetero_host' + else: + raise ValueError(f"Cannot recognized data_set: {data_set}") + + guest_train_data = {"name": guest_data_table, "namespace": f"experiment{namespace}"} + host_train_data = {"name": host_data_table, "namespace": f"experiment{namespace}"} + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + intersect_0 = Intersection("intersect_0", method="raw") + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], + 
namespace=guest_train_data["namespace"])) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], + namespace=host_train_data["namespace"])) + + lr_param = { + } + + config_param = { + "epochs": param["epochs"], + "learning_rate_scheduler": param["learning_rate_scheduler"], + "optimizer": param["optimizer"], + "batch_size": param["batch_size"], + "early_stop": param["early_stop"], + "init_param": param["init_param"], + "tol": 1e-5 + } + lr_param.update(config_param) + lr_0 = CoordinatedLR("lr_0", + train_data=intersect_0.outputs["output_data"], + **lr_param) + lr_1 = CoordinatedLR("lr_1", + test_data=intersect_0.outputs["output_data"], + input_model=lr_0.outputs["output_model"]) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="y", + runtime_roles=["guest"], + metrics=["auc", "binary_precision", "binary_accuracy", "binary_recall"], + input_data=lr_0.outputs["train_output_data"]) + + pipeline.add_task(intersect_0) + pipeline.add_task(lr_0) + pipeline.add_task(lr_1) + pipeline.add_task(evaluation_0) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + job_id = pipeline.model_info.job_id + return job_id + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("BENCHMARK-QUALITY PIPELINE JOB") + parser.add_argument("-c", "--config", type=str, + help="config file", default="../../config.yaml") + parser.add_argument("-p", "--param", type=str, + help="config file for params", default="./breast_config.yaml") + args = parser.parse_args() + main(args.config, args.param) diff --git a/examples/benchmark_quality/linr/fate-linr.py b/examples/benchmark_quality/linr/fate-linr.py new file mode 100644 index 0000000000..cb7866ad8d --- /dev/null +++ b/examples/benchmark_quality/linr/fate-linr.py @@ -0,0 +1,115 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLinR, Intersection +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils +from fate_test.utils import parse_summary_result + + +def main(config="../../config.yaml", param="./linr_config.yaml", namespace=""): + # obtain config + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + if isinstance(param, str): + param = test_utils.JobConfig.load_from_file(param) + + assert isinstance(param, dict) + + guest_train_data = {"name": "motor_hetero_guest", "namespace": f"experiment{namespace}"} + host_train_data = {"name": "motor_hetero_host", "namespace": f"experiment{namespace}"} + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + + intersect_0 = Intersection("intersect_0", method="raw") + intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], + namespace=guest_train_data["namespace"])) + intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], + namespace=host_train_data["namespace"])) + + linr_param = { + } + + 
config_param = { + "epochs": param["epochs"], + "learning_rate_scheduler": param["learning_rate_scheduler"], + "optimizer": param["optimizer"], + "batch_size": param["batch_size"], + "early_stop": param["early_stop"], + "init_param": param["init_param"], + "tol": 1e-5 + } + linr_param.update(config_param) + linr_0 = CoordinatedLinR("linr_0", + train_data=intersect_0.outputs["output_data"], + **linr_param) + """linr_1 = CoordinatedLinR("linr_1", + test_data=intersect_0.outputs["output_data"], + input_model=linr_0.outputs["output_model"])""" + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="motor_speed", + runtime_roles=["guest"], + metrics=["r2_score", + "mse", + "rmse"], + input_data=linr_0.outputs["train_output_data"]) + + pipeline.add_task(intersect_0) + pipeline.add_task(linr_0) + # pipeline.add_task(linr_1) + pipeline.add_task(evaluation_0) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + """linr_0_data = pipeline.get_task_info("linr_0").get_output_data()["train_output_data"] + linr_1_data = pipeline.get_task_info("linr_1").get_output_data()["test_output_data"] + linr_0_score = extract_data(linr_0_data, "predict_result") + linr_0_label = extract_data(linr_0_data, "motor_speed") + linr_1_score = extract_data(linr_1_data, "predict_result") + linr_1_label = extract_data(linr_1_data, "motor_speed") + linr_0_score_label = extract_data(linr_0_data, "predict_result", keep_id=True) + linr_1_score_label = extract_data(linr_1_data, "predict_result", keep_id=True)""" + + result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_output_metric()[0]["data"]) + print(f"result_summary: {result_summary}") + + data_summary = {"train": {"guest": guest_train_data["name"], "host": host_train_data["name"]}, + "test": {"guest": guest_train_data["name"], "host": host_train_data["name"]} + } + + return data_summary, result_summary + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("BENCHMARK-QUALITY PIPELINE JOB") +
parser.add_argument("-c", "--config", type=str, + help="config file", default="../../config.yaml") + parser.add_argument("-p", "--param", type=str, + help="config file for params", default="./linr_config.yaml") + args = parser.parse_args() + main(args.config, args.param) diff --git a/examples/benchmark_quality/linr/hetero_linr_benchmark.yaml b/examples/benchmark_quality/linr/hetero_linr_benchmark.yaml new file mode 100644 index 0000000000..6d106aeb85 --- /dev/null +++ b/examples/benchmark_quality/linr/hetero_linr_benchmark.yaml @@ -0,0 +1,47 @@ +data: + - file: examples/data/motor_hetero_guest_sid.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: float64 + label_name: motor_speed + match_id_name: "idx" + match_id_range: 0 + sample_id_name: "sid" + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: false + table_name: motor_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/motor_hetero_host_sid.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: "idx" + match_id_range: 0 + sample_id_name: "sid" + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: false + table_name: motor_hetero_host + namespace: experiment + role: host_0 + +hetero_linr: + local: + script: "./local-linr.py" + conf: "./linr_sklearn_config.yaml" + FATE-hetero-linr: + script: "./fate-linr.py" + conf: "./linr_config.yaml" + compare_setting: + relative_tol: 0.01 diff --git a/examples/benchmark_quality/linr/linr_config.yaml b/examples/benchmark_quality/linr/linr_config.yaml new file mode 100644 index 0000000000..13f5199e90 --- /dev/null +++ b/examples/benchmark_quality/linr/linr_config.yaml @@ -0,0 +1,22 @@ +data_guest: "examples/data/motor_hetero_guest.csv" +data_host: "examples/data/motor_hetero_host.csv" +label_name: "motor_speed" +penalty: "L2" +epochs: 10 +init_param: +
fit_intercept: True + method: "zeros" + random_state: 42 +learning_rate_scheduler: + method: "constant" + scheduler_params: + factor: 1.0 + total_iters: 100 +optimizer: + method: "sgd" + penalty: "L2" + optimizer_params: + lr: 0.13 + alpha: 0.01 +batch_size: 100 +early_stop: "diff" diff --git a/examples/benchmark_quality/linr/linr_sklearn_config.yaml b/examples/benchmark_quality/linr/linr_sklearn_config.yaml new file mode 100644 index 0000000000..38a15edc00 --- /dev/null +++ b/examples/benchmark_quality/linr/linr_sklearn_config.yaml @@ -0,0 +1,11 @@ +data_guest: "examples/data/motor_hetero_guest.csv" +data_host: "examples/data/motor_hetero_host.csv" +label_name: "motor_speed" +penalty: "L2" +idx: "idx" +epochs: 20 +fit_intercept: True +method: "rmsprop" +eta0: 0.1 +alpha: 0.5 +batch_size: 5000 diff --git a/examples/benchmark_quality/lr/breast_config.yaml b/examples/benchmark_quality/lr/breast_config.yaml index 80bc467254..142f056628 100644 --- a/examples/benchmark_quality/lr/breast_config.yaml +++ b/examples/benchmark_quality/lr/breast_config.yaml @@ -1,11 +1,12 @@ -data_guest: "examples/data/breast_hetero_guest.csv" -data_host: "examples/data/breast_hetero_host.csv" +data_guest: "breast_hetero_guest" +data_host: "breast_hetero_host" idx: "id" label_name: "y" -epochs: 30 +epochs: 15 init_param: fit_intercept: True - method: "zeros" + method: "uniform" + random_state: 42 learning_rate_scheduler: method: "constant" scheduler_params: @@ -13,9 +14,9 @@ learning_rate_scheduler: total_iters: 100 optimizer: method: "rmsprop" - penalty: "L2" + penalty: "l1" optimizer_params: - lr: 0.1 - alpha: 0.5 -batch_size: 5000 + lr: 0.5 + alpha: 0.1 +batch_size: null early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/default_credit_config.yaml b/examples/benchmark_quality/lr/default_credit_config.yaml index b143418832..c45ef53d8a 100644 --- a/examples/benchmark_quality/lr/default_credit_config.yaml +++ 
b/examples/benchmark_quality/lr/default_credit_config.yaml @@ -1,20 +1,22 @@ -data_guest: "examples/data/default_credit_hetero_guest.csv" -data_host: "examples/data/default_credit_hetero_host.csv" +data_guest: "default_credit_hetero_guest" +data_host: "default_credit_hetero_host" idx: "id" label_name: "y" epochs: 30 init_param: fit_intercept: True method: "zeros" + random_state: 42 learning_rate_scheduler: method: "constant" scheduler_params: factor: 1.0 - total_iters: 100 + total_iters: 10000 optimizer: - method: "zeros" + method: "rmsprop" penalty: "L2" + alpha: 0.001 optimizer_params: lr: 0.15 -batch_size: 500 +batch_size: 3200 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/default_credit_lr_config.yaml b/examples/benchmark_quality/lr/default_credit_lr_sklearn_config.yaml similarity index 100% rename from examples/benchmark_quality/lr/default_credit_lr_config.yaml rename to examples/benchmark_quality/lr/default_credit_lr_sklearn_config.yaml diff --git a/examples/benchmark_quality/lr/epsilon_5k_config.yaml b/examples/benchmark_quality/lr/epsilon_5k_config.yaml index 232b830d6c..6822e02ea7 100644 --- a/examples/benchmark_quality/lr/epsilon_5k_config.yaml +++ b/examples/benchmark_quality/lr/epsilon_5k_config.yaml @@ -1,20 +1,22 @@ -data_guest: "examples/data/epsilon_5k_hetero_guest.csv" -data_host: "examples/data/epsilon_5k_hetero_host.csv" +data_guest: "epsilon_5k_hetero_guest" +data_host: "epsilon_5k_hetero_host" idx: "id" label_name: "y" epochs: 30 +batch_size: 2500 init_param: fit_intercept: True - method: "zeros" + method: "random" + random_state: 42 learning_rate_scheduler: method: "constant" scheduler_params: factor: 1.0 - total_iters: 800 + total_iters: 1000 optimizer: - method: "rmsprop" + method: "adam" penalty: "L2" + alpha: 0.0001 optimizer_params: - lr: 0.15 -batch_size: 5000 + lr: 0.3 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/epsilon_5k_lr_sklearn_config.yaml 
b/examples/benchmark_quality/lr/epsilon_5k_lr_sklearn_config.yaml new file mode 100644 index 0000000000..bef62e89aa --- /dev/null +++ b/examples/benchmark_quality/lr/epsilon_5k_lr_sklearn_config.yaml @@ -0,0 +1,11 @@ +data_guest: "examples/data/epsilon_5k_hetero_guest.csv" +data_host: "examples/data/epsilon_5k_hetero_host.csv" +idx: "id" +label_name: "y" +epochs: 30 +fit_intercept: True +method: "rmsprop" +penalty: "L2" +eta0: 0.1 +alpha: 0.5 +batch_size: 5000 \ No newline at end of file diff --git a/examples/benchmark_quality/lr/give_credit_config.yaml b/examples/benchmark_quality/lr/give_credit_config.yaml index f6971ec107..73f2285fa1 100644 --- a/examples/benchmark_quality/lr/give_credit_config.yaml +++ b/examples/benchmark_quality/lr/give_credit_config.yaml @@ -1,5 +1,5 @@ -data_guest: "examples/data/give_credit_hetero_guest.csv" -data_host: "examples/data/give_credit_hetero_host.csv" +data_guest: "give_credit_hetero_guest" +data_host: "give_credit_hetero_host" idx: "id" label_name: "y" epochs: 6 diff --git a/examples/benchmark_quality/lr/give_credit_lr_sklearn_config.yaml b/examples/benchmark_quality/lr/give_credit_lr_sklearn_config.yaml new file mode 100644 index 0000000000..4dcb136b99 --- /dev/null +++ b/examples/benchmark_quality/lr/give_credit_lr_sklearn_config.yaml @@ -0,0 +1,11 @@ +data_guest: "examples/data/give_credit_hetero_guest.csv" +data_host: "examples/data/give_credit_hetero_host.csv" +idx: "id" +label_name: "y" +epochs: 30 +fit_intercept: True +method: "rmsprop" +penalty: "L2" +eta0: 0.1 +alpha: 0.5 +batch_size: 5000 \ No newline at end of file diff --git a/examples/benchmark_quality/lr/lr_benchmark.yaml b/examples/benchmark_quality/lr/lr_benchmark.yaml index 18cf77b34f..c857d20e3f 100644 --- a/examples/benchmark_quality/lr/lr_benchmark.yaml +++ b/examples/benchmark_quality/lr/lr_benchmark.yaml @@ -1,45 +1,48 @@ data: - - file: examples/data/breast_hetero_guest.csv + - file: examples/data/breast_hetero_guest_sid.csv meta: delimiter: "," dtype: 
float64 input_format: dense label_type: int64 label_name: y - match_id_name: id + match_id_name: "id" match_id_range: 0 + sample_id_name: "sid" tag_value_delimiter: ":" tag_with_value: false weight_type: float64 partitions: 4 head: true - extend_sid: true + extend_sid: false table_name: breast_hetero_guest namespace: experiment role: guest_0 - - file: examples/data/breast_hetero_host.csv + - file: examples/data/breast_hetero_host_sid.csv meta: delimiter: "," dtype: float64 input_format: dense - match_id_name: id + match_id_name: "id" match_id_range: 0 + sample_id_name: "sid" tag_value_delimiter: ":" tag_with_value: false weight_type: float64 partitions: 4 head: true - extend_sid: true + extend_sid: false table_name: breast_hetero_host namespace: experiment role: host_0 - - file: "../../data/default_credit_hetero_guest.csv" + - file: "../../data/default_credit_hetero_guest_sid.csv" meta: delimiter: "," dtype: float64 input_format: dense - match_id_name: id + match_id_name: "id" match_id_range: 0 + sample_id_name: "sid" label_type: int64 label_name: y tag_value_delimiter: ":" @@ -47,33 +50,35 @@ data: weight_type: float64 partitions: 4 head: true - extend_sid: true + extend_sid: false table_name: default_credit_hetero_guest namespace: experiment role: guest_0 - - file: "../../data/default_credit_hetero_host.csv" + - file: "../../data/default_credit_hetero_host_sid.csv" meta: delimiter: "," dtype: float64 input_format: dense - match_id_name: id + match_id_name: "id" match_id_range: 0 + sample_id_name: "sid" tag_value_delimiter: ":" tag_with_value: false weight_type: float64 partitions: 4 head: true - extend_sid: true + extend_sid: false table_name: default_credit_hetero_host namespace: experiment role: host_0 - - file: "../../data/give_credit_hetero_guest.csv" + - file: "../../data/give_credit_hetero_guest_sid.csv" meta: delimiter: "," dtype: float64 input_format: dense - match_id_name: id + match_id_name: "id" match_id_range: 0 + sample_id_name: "sid" label_type: 
int64 label_name: y tag_value_delimiter: ":" @@ -81,41 +86,100 @@ data: weight_type: float64 partitions: 4 head: true - extend_sid: true + extend_sid: false table_name: give_credit_hetero_guest namespace: experiment role: guest_0 - - file: "../../data/give_credit_hetero_host.csv" - head: 1 - partition: 16 + - file: "../../data/give_credit_hetero_host_sid.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: "id" + match_id_range: 0 + sample_id_name: "sid" + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + head: true + partition: 4 + extend_sid: false table_name: give_credit_hetero_host namespace: experiment role: host_0 - - file: "../../data/epsilon_5k_hetero_guest.csv" - head: 1 - partition: 16 + - file: "../../data/epsilon_5k_hetero_guest_sid.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: "id" + match_id_range: 0 + sample_id_name: "sid" + label_type: int64 + label_name: y + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + head: true + partition: 4 + extend_sid: false table_name: epsilon_5k_hetero_guest namespace: experiment role: guest_0 - - file: "../../data/epsilon_5k_hetero_host.csv" - head: 1 - partition: 16 + - file: "../../data/epsilon_5k_hetero_host_sid.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: "id" + match_id_range: 0 + sample_id_name: "sid" + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + head: true + partition: 4 + extend_sid: false table_name: epsilon_5k_hetero_host namespace: experiment role: host_0 - - file: "../../data/vehicle_scale_hetero_guest.csv" - head: 1 - partition: 16 + - file: "../../data/vehicle_scale_hetero_guest_sid.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: "id" + match_id_range: 0 + sample_id_name: "sid" + label_type: int64 + label_name: y + tag_value_delimiter: ":" + tag_with_value: false + weight_type: 
float64 + head: true + partition: 4 + extend_sid: false table_name: vehicle_scale_hetero_guest namespace: experiment role: guest_0 - - file: "../../data/vehicle_scale_hetero_host.csv" - head: 1 - partition: 16 + - file: "../../data/vehicle_scale_hetero_host_sid.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: "id" + match_id_range: 0 + sample_id_name: "sid" + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + head: true + partition: 4 + extend_sid: false table_name: vehicle_scale_hetero_host namespace: experiment role: host_0 -hetero_lr-binary-0: +hetero_lr-binary-0-breast: local: script: "./sklearn-lr-binary.py" conf: "./breast_lr_sklearn_config.yaml" @@ -124,39 +188,39 @@ hetero_lr-binary-0: conf: "./breast_config.yaml" compare_setting: relative_tol: 0.01 -hetero_lr-binary-1: +#hetero_lr-binary-1-default-credit: +# local: +# script: "./sklearn-lr-binary.py" +# conf: "./default_credit_lr_sklearn_config.yaml" +# FATE-hetero-lr: +# script: "./pipeline-lr-binary.py" +# conf: "./default_credit_config.yaml" +# compare_setting: +# relative_tol: 0.01 +hetero_lr-binary-2-epsilon-5k: local: script: "./sklearn-lr-binary.py" - conf: "./default_credit_config.yaml" - FATE-hetero-lr: - script: "./pipeline-lr-binary.py" - conf: "./default_credit_sklearn_config.yaml" - compare_setting: - relative_tol: 0.01 -hetero_lr-binary-2: - local: - script: "./sklearn-lr-binary.py" - conf: "./epsilon_5k_config.yaml" + conf: "./epsilon_5k_lr_sklearn_config.yaml" FATE-hetero-lr: script: "./pipeline-lr-binary.py" conf: "./epsilon_5k_config.yaml" compare_setting: relative_tol: 0.01 -hetero_lr-binary-3: - local: - script: "./sklearn-lr-binary.py" - conf: "./give_credit_config.yaml" - FATE-hetero-lr: - script: "./pipeline-lr-binary.py" - conf: "./give_credit_config.yaml" - compare_setting: - relative_tol: 0.01 -multi: - local: - script: "./sklearn-lr-multi.py" - conf: "./vehicle_lr_sklearn_config.yaml" - FATE-hetero-lr: - script: 
"./pipeline-lr-multi.py" - conf: "./vehicle_config.yaml" - compare_setting: - relative_tol: 0.01 +#hetero_lr-binary-3-give-credit: +# local: +# script: "./sklearn-lr-binary.py" +# conf: "./give_credit_lr_sklearn_config.yaml" +# FATE-hetero-lr: +# script: "./pipeline-lr-binary.py" +# conf: "./give_credit_config.yaml" +# compare_setting: +# relative_tol: 0.01 +#multi-vehicle: +# local: +# script: "./sklearn-lr-multi.py" +# conf: "./vehicle_lr_sklearn_config.yaml" +# FATE-hetero-lr: +# script: "./pipeline-lr-multi.py" +# conf: "./vehicle_config.yaml" +# compare_setting: +# relative_tol: 0.01 diff --git a/examples/benchmark_quality/lr/pipeline-lr-binary.py b/examples/benchmark_quality/lr/pipeline-lr-binary.py index bd845bc9d9..c10dd7fcb6 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-binary.py +++ b/examples/benchmark_quality/lr/pipeline-lr-binary.py @@ -24,7 +24,7 @@ from fate_test.utils import extract_data, parse_summary_result -def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): +def main(config="../../config.yaml", param="./breast_config.yaml", namespace=""): # obtain config if isinstance(config, str): config = test_utils.load_job_config(config) @@ -38,21 +38,8 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): assert isinstance(param, dict) - data_set = param.get("data_guest").split('/')[-1] - if data_set == "default_credit_hetero_guest.csv": - guest_data_table = 'default_credit_hetero_guest' - host_data_table = 'default_credit_hetero_host' - elif data_set == 'breast_hetero_guest.csv': - guest_data_table = 'breast_hetero_guest' - host_data_table = 'breast_hetero_host' - elif data_set == 'give_credit_hetero_guest.csv': - guest_data_table = 'give_credit_hetero_guest' - host_data_table = 'give_credit_hetero_host' - elif data_set == 'epsilon_5k_hetero_guest.csv': - guest_data_table = 'epsilon_5k_hetero_guest' - host_data_table = 'epsilon_5k_hetero_host' - else: - raise ValueError(f"Cannot recognized 
data_set: {data_set}") + guest_data_table = param.get("data_guest") + host_data_table = param.get("data_host") guest_train_data = {"name": guest_data_table, "namespace": f"experiment{namespace}"} host_train_data = {"name": host_data_table, "namespace": f"experiment{namespace}"} @@ -79,7 +66,7 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): lr_param.update(config_param) lr_0 = CoordinatedLR("lr_0", train_data=intersect_0.outputs["output_data"], - **config_param) + **lr_param) lr_1 = CoordinatedLR("lr_1", test_data=intersect_0.outputs["output_data"], input_model=lr_0.outputs["output_model"]) @@ -95,6 +82,10 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): pipeline.add_task(lr_1) pipeline.add_task(evaluation_0) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) pipeline.compile() print(pipeline.get_dag()) pipeline.fit() @@ -107,11 +98,9 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): lr_1_label = extract_data(lr_1_data, "y") lr_0_score_label = extract_data(lr_0_data, "predict_result", keep_id=True) lr_1_score_label = extract_data(lr_1_data, "predict_result", keep_id=True) - """print(f"evaluation result: {pipeline.get_task_info('evaluation_0').get_output_metric()};" - f"result type: {type(pipeline.get_task_info('evaluation_0').get_output_metric())}") - """ + result_summary = parse_summary_result(pipeline.get_task_info("evaluation_0").get_output_metric()[0]["data"]) - print(f"result_summary") + print(f"result_summary: {result_summary}") data_summary = {"train": {"guest": guest_train_data["name"], "host": host_train_data["name"]}, "test": {"guest": guest_train_data["name"], "host": host_train_data["name"]} diff --git a/examples/benchmark_quality/lr/pipeline-lr-multi.py b/examples/benchmark_quality/lr/pipeline-lr-multi.py index cc5e9602de..3868acbd60 100644 --- 
a/examples/benchmark_quality/lr/pipeline-lr-multi.py +++ b/examples/benchmark_quality/lr/pipeline-lr-multi.py @@ -24,7 +24,7 @@ from fate_test.utils import extract_data, parse_summary_result -def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): +def main(config="../../config.yaml", param="./vehicle_config.yaml", namespace=""): # obtain config if isinstance(config, str): config = test_utils.load_job_config(config) @@ -37,12 +37,8 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): param = test_utils.JobConfig.load_from_file(param) assert isinstance(param, dict) - data_set = param.get("data_guest").split('/')[-1] - if data_set == "vehicle_scale_hetero_guest.csv": - guest_data_table = 'vehicle_scale_hetero_guest' - host_data_table = 'vehicle_scale_hetero_host' - else: - raise ValueError(f"Cannot recognized data_set: {data_set}") + guest_data_table = param.get("data_guest") + host_data_table = param.get("data_host") guest_train_data = {"name": guest_data_table, "namespace": f"experiment{namespace}"} host_train_data = {"name": host_data_table, "namespace": f"experiment{namespace}"} @@ -74,11 +70,16 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): test_data=intersect_0.outputs["output_data"], input_model=lr_0.outputs["output_model"]) - evaluation_0 = Evaluation('evaluation_0', default_eval_setting="multi") + evaluation_0 = Evaluation('evaluation_0', + metrics=['multi_recall', 'multi_accuracy', 'multi_precision']) pipeline.add_task(intersect_0) pipeline.add_task(lr_0) pipeline.add_task(lr_1) pipeline.add_task(evaluation_0) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) pipeline.compile() print(pipeline.get_dag()) diff --git a/examples/benchmark_quality/lr/sklearn-lr-binary.py b/examples/benchmark_quality/lr/sklearn-lr-binary.py index 603c39d75f..e418a1297a 100644 --- 
a/examples/benchmark_quality/lr/sklearn-lr-binary.py +++ b/examples/benchmark_quality/lr/sklearn-lr-binary.py @@ -84,7 +84,9 @@ def main(config="../../config.yaml", param="./breast_lr_sklearn_config.yaml"): if __name__ == "__main__": parser = argparse.ArgumentParser("BENCHMARK-QUALITY SKLEARN JOB") - parser.add_argument("-p", "--param", type=str, default="./breast_lr_sklearn_config.yaml", - help="config file for params") + parser.add_argument("-c", "--config", type=str, + help="config file", default="../../config.yaml") + parser.add_argument("-p", "--param", type=str, + help="config file for params", default="./breast_lr_sklearn_config.yaml") args = parser.parse_args() - main(param=args.param) + main(args.config, args.param) diff --git a/examples/benchmark_quality/lr/sklearn-lr-multi.py b/examples/benchmark_quality/lr/sklearn-lr-multi.py index fb8da4827f..ae931db9fb 100644 --- a/examples/benchmark_quality/lr/sklearn-lr-multi.py +++ b/examples/benchmark_quality/lr/sklearn-lr-multi.py @@ -23,7 +23,7 @@ from sklearn.metrics import precision_score, accuracy_score, recall_score -def main(config="../../config.yaml", param="./vehicle_config.yaml"): +def main(config="../../config.yaml", param="./vehicle_lr_sklearn_config.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) @@ -65,15 +65,18 @@ def main(config="../../config.yaml", param="./vehicle_config.yaml"): pr = precision_score(y, y_pred, average="macro") acc = accuracy_score(y, y_pred) - result = {"accuracy": acc} + result = {"multi_accuracy": acc, + "multi_precision": pr, + "multi_recall": recall} print(result) return {}, result if __name__ == "__main__": parser = argparse.ArgumentParser("BENCHMARK-QUALITY SKLEARN JOB") - parser.add_argument("-param", type=str, - help="config file for params") + parser.add_argument("-c", "--config", type=str, + help="config file", default="../../config.yaml") + parser.add_argument("-p", "--param", type=str, + help="config file for params", 
default="./vehicle_lr_sklearn_config.yaml") args = parser.parse_args() - if args.param is not None: - main(args.param) + main(args.config, args.param) diff --git a/examples/benchmark_quality/lr/vehicle_config.yaml b/examples/benchmark_quality/lr/vehicle_config.yaml index 2cff7f33b5..00a610c0ee 100644 --- a/examples/benchmark_quality/lr/vehicle_config.yaml +++ b/examples/benchmark_quality/lr/vehicle_config.yaml @@ -1,11 +1,11 @@ -data_guest: "examples/data/vehicle_scale_hetero_guest.csv" -data_host: "examples/data/vehicle_scale_hetero_host.csv" +data_guest: "vehicle_scale_hetero_guest" +data_host: "vehicle_scale_hetero_host" idx: "id" label_name: "y" epochs: 20 init_param: fit_intercept: True - method: "zeros" + method: "random_uniform" learning_rate_scheduler: method: "constant" scheduler_params: @@ -14,7 +14,10 @@ learning_rate_scheduler: optimizer: method: "adam" penalty: "L2" + alpha: 0.00001 optimizer_params: lr: 0.3 batch_size: 16 -early_stop: "diff" \ No newline at end of file +early_stop: "diff" +task_cores: null +timeout: 3600 \ No newline at end of file diff --git a/examples/benchmark_quality/lr/vehicle_lr_sklearn_config.yaml b/examples/benchmark_quality/lr/vehicle_lr_sklearn_config.yaml index 0d9bda1717..4fdb56e151 100644 --- a/examples/benchmark_quality/lr/vehicle_lr_sklearn_config.yaml +++ b/examples/benchmark_quality/lr/vehicle_lr_sklearn_config.yaml @@ -3,17 +3,9 @@ data_host: "examples/data/vehicle_scale_hetero_host.csv" idx: "id" label_name: "y" epochs: 30 -init_param: - fit_intercept: True - method: "zeros" -learning_rate_scheduler: - method: "constant" - scheduler_params: - lr: 0.15 - factor: 1.0 - total_iters: 800' -optimizer: - method: "rmsprop" - penalty: "L2" -batch_size: None -early_stop: "diff" \ No newline at end of file +fit_intercept: True +method: "rmsprop" +penalty: "L2" +eta0: 0.1 +alpha: 0.5 +batch_size: 5000 \ No newline at end of file diff --git a/examples/pipeline/coordinated_lr/test_lr_sid.py 
b/examples/pipeline/coordinated_lr/test_lr_sid.py index 5fb0905ff1..9b2323fb05 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid.py +++ b/examples/pipeline/coordinated_lr/test_lr_sid.py @@ -31,6 +31,10 @@ def main(config="./config.yaml", namespace=""): arbiter = parties.arbiter[0] pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) intersect_0 = Intersection("intersect_0", method="raw") intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", diff --git a/examples/pipeline/coordinated_lr/test_lr_sid_cv.py b/examples/pipeline/coordinated_lr/test_lr_sid_cv.py index 16ce51d4a7..5e5a3f40bc 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid_cv.py +++ b/examples/pipeline/coordinated_lr/test_lr_sid_cv.py @@ -29,6 +29,10 @@ def main(config="./config.yaml", namespace=""): host = parties.host[0] arbiter = parties.arbiter[0] pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) intersect_0 = Intersection("intersect_0", method="raw") intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", diff --git a/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py b/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py index fb8090064d..0c33c952d6 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py +++ b/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py @@ -30,6 +30,10 @@ def main(config="./config.yaml", namespace=""): host = parties.host[0] arbiter = parties.arbiter[0] pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", 
config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) intersect_0 = Intersection("intersect_0", method="raw") intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", diff --git a/examples/pipeline/test_linr_sid_warm_start.py b/examples/pipeline/test_linr_sid_warm_start.py index 14837e09a9..0fe2bdea06 100644 --- a/examples/pipeline/test_linr_sid_warm_start.py +++ b/examples/pipeline/test_linr_sid_warm_start.py @@ -21,19 +21,27 @@ intersect_0 = Intersection("intersect_0", method="raw") intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) + namespace="experiment")) intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) + namespace="experiment")) linr_0 = CoordinatedLinR("linr_0", epochs=3, batch_size=None, - optimizer={"method": "sgd", "optimizer_params": {"lr": 0.01}}, + optimizer={"method": "sgd", "optimizer_params": {"lr": 0.15}, "alpha": 0.1}, init_param={"fit_intercept": True, "method": "zeros"}, - train_data=intersect_0.outputs["output_data"]) + train_data=intersect_0.outputs["output_data"], + shuffle=False) linr_1 = CoordinatedLinR("linr_1", train_data=intersect_0.outputs["output_data"], warm_start_model=linr_0.outputs["output_model"], epochs=2, - batch_size=200) + batch_size=None) +linr_2 = CoordinatedLinR("linr_2", + epochs=5, + batch_size=None, + optimizer={"method": "sgd", "optimizer_params": {"lr": 0.15}, "alpha": 0.1}, + init_param={"fit_intercept": True, "method": "zeros"}, + train_data=intersect_0.outputs["output_data"], + shuffle=False) """linr_0.guest.component_setting(train_data=DataWarehouseChannel(name="breast_hetero_guest_sid", namespace="experiment")) @@ -42,40 +50,40 @@ evaluation_0 = Evaluation("evaluation_0", runtime_roles=["guest"], - input_data=linr_0.outputs["train_output_data"]) + metrics=["r2_score", "mse"], + 
label_column_name="y", + input_data=[linr_1.outputs["train_output_data"], linr_2.outputs["train_output_data"]]) # pipeline.add_task(feature_scale_0) # pipeline.add_task(feature_scale_1) pipeline.add_task(intersect_0) pipeline.add_task(linr_0) pipeline.add_task(linr_1) -# pipeline.add_task(evaluation_0) +pipeline.add_task(linr_2) +pipeline.add_task(evaluation_0) # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() print(pipeline.get_dag()) pipeline.fit() -print(f"linr_0 model: {pipeline.get_task_info('linr_0').get_output_model()}") -# print(f"linr_0 data: {pipeline.get_task_info('linr_0').get_output_data()}") -print(f"\nlinr_1 model: {pipeline.get_task_info('linr_1').get_output_model()}") - -"""# print(pipeline.get_task_info("statistics_0").get_output_model()) -print(pipeline.get_task_info("linr_0").get_output_model()) -print(pipeline.get_task_info("linr_0").get_output_metrics()) -print(f"evaluation metrics: ") -print(pipeline.get_task_info("evaluation_0").get_output_metrics()) +import numpy as np -pipeline.deploy([intersect_0, linr_0]) +linr_0_coef = np.array( + pipeline.get_task_info('linr_0').get_output_model()["output_model"]["data"]['estimator']["param"]["coef_"]) +linr_0_intercept = np.array( + pipeline.get_task_info('linr_0').get_output_model()["output_model"]["data"]['estimator']["param"]["intercept_"]) -predict_pipeline = FateFlowPipeline() +linr_1_coef = np.array( + pipeline.get_task_info('linr_1').get_output_model()["output_model"]["data"]['estimator']["param"]["coef_"]) +linr_1_intercept = np.array( + pipeline.get_task_info('linr_1').get_output_model()["output_model"]["data"]['estimator']["param"]["intercept_"]) +# print(f"linr_1 data: {pipeline.get_task_info('linr_0').get_output_data()}") +linr_2_coef = np.array( + pipeline.get_task_info('linr_2').get_output_model()["output_model"]["data"]['estimator']["param"]["coef_"]) +linr_2_intercept = np.array( + 
pipeline.get_task_info('linr_2').get_output_model()["output_model"]["data"]['estimator']["param"]["intercept_"]) -deployed_pipeline = pipeline.get_deployed_pipeline() -deployed_pipeline.intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -deployed_pipeline.intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) +print(f"linr_1 coef: {linr_1_coef}, intercept: {linr_1_intercept}") +print(f"linr_2 coef: {linr_2_coef}, intercept: {linr_2_intercept}") +print(f"linr_1 vs l2_1 coef diff: {linr_1_coef - linr_2_coef}, intercept diff: {linr_1_intercept - linr_2_intercept}") -predict_pipeline.add_task(deployed_pipeline) -predict_pipeline.compile() -# print("\n\n\n") -# print(predict_pipeline.compile().get_dag()) -predict_pipeline.predict()""" +print(f"\n evaluation result: {pipeline.get_task_info('evaluation_0').get_output_metric()[0]['data']}") diff --git a/python/fate_test/fate_test/_config.py b/python/fate_test/fate_test/_config.py index b81b25e59e..16d6fb1e34 100644 --- a/python/fate_test/fate_test/_config.py +++ b/python/fate_test/fate_test/_config.py @@ -181,6 +181,8 @@ def __init__(self, config): self.tunnel_id_to_tunnel = {} self.extend_sid = None self.auto_increasing_sid = None + self.task_cores = None + self.timeout = None # self.work_mode = config.get("work_mode", 0) service_id = 0 @@ -194,6 +196,10 @@ def __init__(self, config): for party in flow_service["parties"]: self.party_to_service_id[party] = service_id + def update_conf(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + @staticmethod def load(path: typing.Union[str, Path], **kwargs): if isinstance(path, str): diff --git a/python/fate_test/fate_test/_flow_client.py b/python/fate_test/fate_test/_flow_client.py index e27c2098e0..e8aa76fdbc 100644 --- a/python/fate_test/fate_test/_flow_client.py +++ b/python/fate_test/fate_test/_flow_client.py 
@@ -175,6 +175,10 @@ def _delete_data(self, table_name, namespace): response = self._client.table.delete(namespace=namespace, table_name=table_name) return response + def query_job(self, job_id, role, party_id): + response = self._client.task.query(job_id, role=role, party_id=party_id) + return response + """def _submit_job(self, conf, dsl): param = { 'job_dsl': self._save_json(dsl, 'submit_dsl.json'), diff --git a/python/fate_test/fate_test/_parser.py b/python/fate_test/fate_test/_parser.py index 7d2e898382..114ec71f56 100644 --- a/python/fate_test/fate_test/_parser.py +++ b/python/fate_test/fate_test/_parser.py @@ -192,16 +192,16 @@ def pretty_final_summary(self, time_consuming, suite_file=None): return table.get_string(title=f"{TxtStyle.TITLE}Testsuite Summary: {self.suite_name}{TxtStyle.END}") - def model_in_dep(self, name): + """def model_in_dep(self, name): return name in self._dependency - """def get_dependent_jobs(self, name): - return self._dependency[name]""" + def get_dependent_jobs(self, name): + return self._dependency[name] def remove_dependency(self, name): del self._dependency[name] - """def feed_dep_info(self, job, name, model_info=None, table_info=None, cache_info=None, model_loader_info=None): + def feed_dep_info(self, job, name, model_info=None, table_info=None, cache_info=None, model_loader_info=None): if model_info is not None: job.set_pre_work(name, **model_info) if table_info is not None: @@ -213,7 +213,7 @@ def remove_dependency(self, name): if name in job.pre_works: job.pre_works.remove(name) if job.is_submit_ready(): - self._ready_jobs.appendleft(job)""" + self._ready_jobs.appendleft(job) def reflash_configs(self, config: Config): failed = [] @@ -225,7 +225,7 @@ def reflash_configs(self, config: Config): except ValueError as e: failed.append((job, e)) return failed - + """ def update_status( self, job_name, job_id: str = None, status: str = None, exception_id: str = None ): @@ -325,6 +325,36 @@ def load(path: Path): return suite +class 
PerformanceSuite(object): + def __init__( + self, dataset: typing.List[Data], pipeline_jobs: typing.List[BenchmarkJob], path: Path + ): + self.dataset = dataset + self.pipeline_jobs = pipeline_jobs + self.path = path + + @staticmethod + def load(path: Path): + with path.open("r") as f: + # testsuite_config = json.load(f, object_hook=DATA_JSON_HOOK.hook) + testsuite_config = yaml.safe_load(f) + # testsuite_config = DATA_JSON_HOOK.hook(testsuite_config) + + dataset = [] + for d in testsuite_config.get("data"): + d = DATA_LOAD_HOOK.hook(d) + dataset.append(Data.load(d, path)) + + pipeline_jobs = [] + for job_name, job_configs in testsuite_config.get("tasks", {}).items(): + script_path = path.parent.joinpath(job_configs["script"]).resolve() + config_path = path.parent.joinpath(job_configs.get("conf", "")).resolve() + pipeline_jobs.append(BenchmarkJob(job_name, script_path, config_path)) + + suite = PerformanceSuite(dataset, pipeline_jobs, path) + return suite + + def non_success_summary(): status = {} for job in _config.non_success_jobs: diff --git a/python/fate_test/fate_test/scripts/_utils.py b/python/fate_test/fate_test/scripts/_utils.py index 8445e55bc1..53ffc92859 100644 --- a/python/fate_test/fate_test/scripts/_utils.py +++ b/python/fate_test/fate_test/scripts/_utils.py @@ -10,7 +10,7 @@ from fate_test._config import Config from fate_test._flow_client import DataProgress, UploadDataResponse, QueryJobResponse from fate_test._io import echo, LOGGER, set_logger -from fate_test._parser import Testsuite, BenchmarkSuite, DATA_LOAD_HOOK, CONF_LOAD_HOOK, DSL_LOAD_HOOK +from fate_test._parser import Testsuite, BenchmarkSuite, PerformanceSuite, DATA_LOAD_HOOK, CONF_LOAD_HOOK, DSL_LOAD_HOOK def _big_data_task(includes, guest_data_size, host_data_size, guest_feature_num, host_feature_num, host_data_type, @@ -82,6 +82,8 @@ def _find_testsuite_files(path): suite = Testsuite.load(suite_path.resolve(), provider) elif suite_type == "benchmark": suite = 
BenchmarkSuite.load(suite_path.resolve()) + elif suite_type == "performance": + suite = PerformanceSuite.load(suite_path.resolve()) else: raise ValueError(f"Unsupported suite type: {suite_type}. Only accept type 'testsuite' or 'benchmark'.") except Exception as e: diff --git a/python/fate_test/fate_test/scripts/benchmark_cli.py b/python/fate_test/fate_test/scripts/benchmark_cli.py index a6ba20383f..4c5d84b8ee 100644 --- a/python/fate_test/fate_test/scripts/benchmark_cli.py +++ b/python/fate_test/fate_test/scripts/benchmark_cli.py @@ -19,9 +19,12 @@ @click.command(name="benchmark-quality") @click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, metavar="", - help="include *benchmark.json under these paths") + help="include *benchmark.yaml under these paths") @click.option('-e', '--exclude', type=click.Path(exists=True), multiple=True, - help="exclude *benchmark.json under these paths") + help="exclude *benchmark.yaml under these paths") +@click.option('-p', '--task-cores', type=int, help="processors per node", default=None) +@click.option('-m', '--timeout', type=int, default=None, + help="maximum running time of job") @click.option('-g', '--glob', type=str, help="glob string to filter sub-directory of path specified by ") @click.option('-t', '--tol', type=float, @@ -35,12 +38,14 @@ default="all", help="Error value display in algorithm comparison") @click.option('--skip-data', is_flag=True, default=False, help="skip uploading data specified in benchmark conf") +@click.option("--data-only", is_flag=True, default=False, + help="upload data only") @click.option("--disable-clean-data", "clean_data", flag_value=False, default=None) @click.option("--enable-clean-data", "clean_data", flag_value=True, default=None) @SharedOptions.get_shared_options(hidden=True) @click.pass_context def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, storage_tag, history_tag, match_details, - **kwargs): + task_cores, timeout, 
**kwargs): """ process benchmark suite, alias: bq """ @@ -50,6 +55,10 @@ def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, stora config_inst = ctx.obj["config"] if ctx.obj["extend_sid"] is not None: config_inst.extend_sid = ctx.obj["extend_sid"] + if task_cores is not None: + config_inst.update_conf(task_cores=task_cores) + if timeout is not None: + config_inst.update_conf(timeout=timeout) """if ctx.obj["auto_increasing_sid"] is not None: config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" @@ -81,6 +90,8 @@ def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, stora _upload_data(client, suite, config_inst) except Exception as e: raise RuntimeError(f"exception occur while uploading data for {suite.path}") from e + if kwargs.get("data_only"): + continue try: _run_benchmark_pairs(config_inst, suite, tol, namespace, data_namespace_mangling, storage_tag, history_tag, fate_version, match_details) diff --git a/python/fate_test/fate_test/scripts/performance_cli.py b/python/fate_test/fate_test/scripts/performance_cli.py index e07791cc9a..0f120de1fd 100644 --- a/python/fate_test/fate_test/scripts/performance_cli.py +++ b/python/fate_test/fate_test/scripts/performance_cli.py @@ -14,35 +14,33 @@ # limitations under the License. 
# import glob -import json import os import time import uuid from datetime import timedelta +from inspect import signature +from ruamel import yaml import click from fate_test._client import Clients from fate_test._config import Config -from fate_test._flow_client import JobProgress, QueryJobResponse from fate_test._io import LOGGER, echo -from fate_test._parser import Testsuite +from fate_test._parser import PerformanceSuite from fate_test.scripts._options import SharedOptions from fate_test.scripts._utils import _load_testsuites, _upload_data, _delete_data, _load_module_from_script, \ _add_replace_hook -from fate_test.utils import TxtStyle +from fate_test.utils import TxtStyle, parse_job_time_info, pretty_time_info_summary from prettytable import PrettyTable, ORGMODE -from fate_test import _config - @click.command("performance") @click.option('-t', '--job-type', type=click.Choice(['intersect', 'intersect_multi', 'hetero_lr', 'hetero_sbt']), help="Select the job type, you can also set through include") @click.option('-i', '--include', type=click.Path(exists=True), multiple=True, metavar="", - help="include *testsuite.json under these paths") -@click.option('-m', '--timeout', type=int, default=3600, - help="maximun running time of job") -@click.option('-e', '--max-iter', type=int, help="When the algorithm model is LR, the number of iterations is set") + help="include *performance.yaml under these paths") +@click.option('-m', '--timeout', type=int, + help="maximum running time of job") +@click.option('-e', '--epochs', type=int, help="When the algorithm model is LR, the number of iterations is set") @click.option('-d', '--max-depth', type=int, help="When the algorithm model is SecureBoost, set the number of model layers") @click.option('-nt', '--num-trees', type=int, help="When the algorithm model is SecureBoost, set the number of trees") @@ -58,7 +56,7 @@ @click.option("--disable-clean-data", "clean_data", flag_value=False, default=None) 
@SharedOptions.get_shared_options(hidden=True) @click.pass_context -def run_task(ctx, job_type, include, replace, timeout, update_job_parameters, update_component_parameters, max_iter, +def run_task(ctx, job_type, include, replace, timeout, epochs, max_depth, num_trees, task_cores, storage_tag, history_tag, skip_data, clean_data, provider, **kwargs): """ Test the performance of big data tasks, alias: bp @@ -68,8 +66,12 @@ def run_task(ctx, job_type, include, replace, timeout, update_job_parameters, up config_inst = ctx.obj["config"] if ctx.obj["extend_sid"] is not None: config_inst.extend_sid = ctx.obj["extend_sid"] - if ctx.obj["auto_increasing_sid"] is not None: - config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"] + if task_cores is not None: + config_inst.update_conf(task_cores=task_cores) + if timeout is not None: + config_inst.update_conf(timeout=timeout) + """if ctx.obj["auto_increasing_sid"] is not None: + config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" namespace = ctx.obj["namespace"] yes = ctx.obj["yes"] data_namespace_mangling = ctx.obj["namespace_mangling"] @@ -77,7 +79,7 @@ def run_task(ctx, job_type, include, replace, timeout, update_job_parameters, up clean_data = config_inst.clean_data def get_perf_template(conf: Config, job_type): - perf_dir = os.path.join(os.path.abspath(conf.perf_template_dir) + '/' + job_type + '/' + "*testsuite.json") + perf_dir = os.path.join(os.path.abspath(conf.perf_template_dir) + '/' + job_type + '/' + "*testsuite.yaml") return glob.glob(perf_dir) if not include: @@ -88,7 +90,8 @@ def get_perf_template(conf: Config, job_type): echo.welcome() echo.echo(f"testsuite namespace: {namespace}", fg='red') echo.echo("loading testsuites:") - suites = _load_testsuites(includes=include, excludes=tuple(), glob=None, provider=provider) + suites = _load_testsuites(includes=include, excludes=tuple(), glob=None, provider=provider, + suffix="performance.yaml", suite_type="performance") for i, suite in 
enumerate(suites): echo.echo(f"\tdataset({len(suite.dataset)}) dsl jobs({len(suite.jobs)}) {suite.path}") @@ -112,21 +115,41 @@ def get_perf_template(conf: Config, job_type): echo.stdout_newline() try: - time_consuming = _submit_job(client, suite, namespace, config_inst, timeout, update_job_parameters, - storage_tag, history_tag, update_component_parameters, max_iter, - max_depth, num_trees, task_cores) - except Exception as e: - raise RuntimeError(f"exception occur while submit job for {suite.path}") from e - - try: - _run_pipeline_jobs(config_inst, suite, namespace, data_namespace_mangling) + job_time_info = _run_performance_jobs(config_inst, suite, namespace, data_namespace_mangling, client, + epochs, max_depth, num_trees) except Exception as e: raise RuntimeError(f"exception occur while running pipeline jobs for {suite.path}") from e echo.echo(f"[{i + 1}/{len(suites)}]elapse {timedelta(seconds=int(time.time() - start))}", fg='red') if not skip_data and clean_data: _delete_data(client, suite) - echo.echo(suite.pretty_final_summary(time_consuming), fg='red') + # echo.echo(suite.pretty_final_summary(job_time_info), fg='red') + all_summary = [] + compare_summary = [] + for job_name, job_time in job_time_info.items(): + performance_dir = "/".join( + [os.path.join(os.path.abspath(config_inst.cache_directory), + 'benchmark_history', "performance.yaml")]) + # @todo: change to client query result + # fate_version = clients["guest_0"].get_version() + fate_version = "beta-2.0.0" + if history_tag: + history_tag = ["_".join([i, job_name]) for i in history_tag] + history_compare_result = comparison_quality(job_name, + history_tag, + performance_dir, + job_time["time_summary"]) + compare_summary.append(history_compare_result) + if storage_tag: + storage_tag = "_".join(['FATE', fate_version, storage_tag, job_name]) + save_quality(storage_tag, performance_dir, job_time["time_summary"]) + res_str = pretty_time_info_summary(job_time, job_name) + all_summary.append(res_str) + 
echo.echo("\n".join(all_summary)) + echo.echo("#" * 60) + echo.echo("\n".join(compare_summary)) + + echo.echo() except Exception: exception_id = uuid.uuid1() @@ -139,195 +162,65 @@ def get_perf_template(conf: Config, job_type): echo.echo(f"testsuite namespace: {namespace}", fg='red') -def _submit_job(clients: Clients, suite: Testsuite, namespace: str, config: Config, timeout, update_job_parameters, - storage_tag, history_tag, update_component_parameters, max_iter, max_depth, num_trees, task_cores): - # submit jobs - with click.progressbar(length=len(suite.jobs), - label="jobs", - show_eta=False, - show_pos=True, - width=24) as bar: - time_list = [] - for job in suite.jobs_iter(): - start = time.time() - job_progress = JobProgress(job.job_name) - - def _raise(): - exception_id = str(uuid.uuid1()) - job_progress.exception(exception_id) - suite.update_status(job_name=job.job_name, exception_id=exception_id) - echo.file(f"exception({exception_id})") - LOGGER.exception(f"exception id: {exception_id}") - - # noinspection PyBroadException - try: - if max_iter is not None: - job.job_conf.update_component_parameters('max_iter', max_iter) - if max_depth is not None: - job.job_conf.update_component_parameters('max_depth', max_depth) - if num_trees is not None: - job.job_conf.update_component_parameters('num_trees', num_trees) - if task_cores is not None: - job.job_conf.update_job_common_parameters(task_cores=task_cores) - job.job_conf.update(config.parties, timeout, update_job_parameters, update_component_parameters) - except Exception: - _raise() - continue - - def update_bar(n_step): - bar.item_show_func = lambda x: job_progress.show() - time.sleep(0.1) - bar.update(n_step) - - update_bar(1) - - def _call_back(resp): - """if isinstance(resp, SubmitJobResponse): - job_progress.submitted(resp.job_id) - echo.file(f"[jobs] {resp.job_id} ", nl=False) - suite.update_status(job_name=job.job_name, job_id=resp.job_id)""" - - if isinstance(resp, QueryJobResponse): - 
job_progress.running(resp.status, resp.progress) - - update_bar(0) - - # noinspection PyBroadException - try: - response = clients["guest_0"].submit_job(job=job, callback=_call_back) - - # noinspection PyBroadException - try: - # add notes - notes = f"{job.job_name}@{suite.path}@{namespace}" - for role, party_id_list in job.job_conf.role.items(): - for i, party_id in enumerate(party_id_list): - clients[f"{role}_{i}"].add_notes(job_id=response.job_id, role=role, party_id=party_id, - notes=notes) - except Exception: - pass - except Exception: - _raise() - else: - job_progress.final(response.status) - suite.update_status(job_name=job.job_name, status=response.status.status) - if response.status.is_success(): - if suite.model_in_dep(job.job_name): - dependent_jobs = suite.get_dependent_jobs(job.job_name) - for predict_job in dependent_jobs: - model_info, table_info, cache_info, model_loader_info = None, None, None, None - for i in _config.deps_alter[predict_job.job_name]: - if isinstance(i, dict): - name = i.get('name') - data_pre = i.get('data') - - if 'data_deps' in _config.deps_alter[predict_job.job_name]: - roles = list(data_pre.keys()) - table_info, hierarchy = [], [] - for role_ in roles: - role, index = role_.split("_") - input_ = data_pre[role_] - for data_input, cpn in input_.items(): - try: - table_name = clients["guest_0"].output_data_table( - job_id=response.job_id, - role=role, - party_id=config.role[role][int(index)], - component_name=cpn) - except Exception: - _raise() - if predict_job.job_conf.dsl_version == 2: - hierarchy.append([role, index, data_input]) - table_info.append({'table': table_name}) - else: - hierarchy.append([role, 'args', 'data']) - table_info.append({data_input: [table_name]}) - table_info = {'hierarchy': hierarchy, 'table_info': table_info} - if 'model_deps' in _config.deps_alter[predict_job.job_name]: - if predict_job.job_conf.dsl_version == 2: - # noinspection PyBroadException - try: - model_info = clients["guest_0"].deploy_model( 
- model_id=response.model_info["model_id"], - model_version=response.model_info["model_version"], - dsl=predict_job.job_dsl.as_dict()) - except Exception: - _raise() - else: - model_info = response.model_info - if 'cache_deps' in _config.deps_alter[predict_job.job_name]: - cache_dsl = predict_job.job_dsl.as_dict() - cache_info = [] - for cpn in cache_dsl.get("components").keys(): - if "CacheLoader" in cache_dsl.get("components").get(cpn).get("module"): - cache_info.append({cpn: {'job_id': response.job_id}}) - cache_info = {'hierarchy': [""], 'cache_info': cache_info} - if 'model_loader_deps' in _config.deps_alter[predict_job.job_name]: - model_loader_dsl = predict_job.job_dsl.as_dict() - model_loader_info = [] - for cpn in model_loader_dsl.get("components").keys(): - if "ModelLoader" in model_loader_dsl.get("components").get(cpn).get("module"): - model_loader_info.append({cpn: response.model_info}) - model_loader_info = {'hierarchy': [""], 'model_loader_info': model_loader_info} - - suite.feed_dep_info(predict_job, name, model_info=model_info, table_info=table_info, - cache_info=cache_info, model_loader_info=model_loader_info) - suite.remove_dependency(job.job_name) - update_bar(0) - time_consuming = time.time() - start - performance_dir = "/".join( - [os.path.join(os.path.abspath(config.cache_directory), 'benchmark_history', "performance.json")]) - fate_version = clients["guest_0"].get_version() - if history_tag: - history_tag = ["_".join([i, job.job_name]) for i in history_tag] - comparison_quality(job.job_name, history_tag, performance_dir, time_consuming) - if storage_tag: - storage_tag = "_".join(['FATE', fate_version, storage_tag, job.job_name]) - save_quality(storage_tag, performance_dir, time_consuming) - echo.stdout_newline() - time_list.append(time_consuming) - return [str(int(i)) + "s" for i in time_list] - - -def _run_pipeline_jobs(config: Config, suite: Testsuite, namespace: str, data_namespace_mangling: bool): +@LOGGER.catch +def 
_run_performance_jobs(config: Config, suite: PerformanceSuite, tol: float, namespace: str, + data_namespace_mangling: bool, client, epochs, max_depth, num_trees): # pipeline demo goes here job_n = len(suite.pipeline_jobs) - for i, pipeline_job in enumerate(suite.pipeline_jobs): - echo.echo(f"Running [{i + 1}/{job_n}] job: {pipeline_job.job_name}") - - def _raise(err_msg, status="failed"): - exception_id = str(uuid.uuid1()) - suite.update_status(job_name=job_name, exception_id=exception_id, status=status) - echo.file(f"exception({exception_id}), error message:\n{err_msg}") - # LOGGER.exception(f"exception id: {exception_id}") - - job_name, script_path = pipeline_job.job_name, pipeline_job.script_path - mod = _load_module_from_script(script_path) + fate_base = config.fate_base + PYTHONPATH = os.environ.get('PYTHONPATH') + ":" + os.path.join(fate_base, "python") + os.environ['PYTHONPATH'] = PYTHONPATH + job_time_history = {} + for j, job in enumerate(suite.pipeline_jobs): try: - if data_namespace_mangling: - try: - mod.main(config=config, namespace=f"_{namespace}") - suite.update_status(job_name=job_name, status="success") - except Exception as e: - _raise(e) - continue + echo.echo(f"Running [{j + 1}/{job_n}] job: {job.job_name}") + job_name, script_path, conf_path = job.job_name, job.script_path, job.conf_path + param = Config.load_from_file(conf_path) + if epochs is not None: + param['epochs'] = epochs + if max_depth is not None: + param['max_depth'] = max_depth + if num_trees is not None: + param['num_trees'] = num_trees + + mod = _load_module_from_script(script_path) + input_params = signature(mod.main).parameters + # local script + if len(input_params) == 1: + job_id = mod.main(param=param) + elif len(input_params) == 2: + job_id = mod.main(config=config, param=param) + # pipeline script + elif len(input_params) == 3: + if data_namespace_mangling: + job_id = mod.main(config=config, param=param, namespace=f"_{namespace}") + else: + job_id = mod.main(config=config, 
param=param) else: - try: - mod.main(config=config) - suite.update_status(job_name=job_name, status="success") - except Exception as e: - _raise(e) - continue + job_id = mod.main() + echo.echo(f"[{j + 1}/{job_n}] job: {job.job_name} Success!\n") + ret_msg = client.query_time_elapse(job_id, role="guest", party_id=config.parties.guest[0]).get("data") + time_summary = parse_job_time_info(ret_msg) + job_time_history[job_name] = {"job_id": job_id, "time_summary": time_summary} + echo.echo(f"[{j + 1}/{job_n}] job: {job.job_name} time info: {time_summary}\n") + except Exception as e: - _raise(e, status="not submitted") + exception_id = uuid.uuid1() + echo.echo(f"exception while running [{j + 1}/{job_n}] job, exception_id={exception_id}", err=True, + fg='red') + LOGGER.exception(f"exception id: {exception_id}, error message: \n{e}") continue + return job_time_history def comparison_quality(group_name, history_tags, history_info_dir, time_consuming): assert os.path.exists(history_info_dir), f"Please check the {history_info_dir} Is it deleted" with open(history_info_dir, 'r') as f: - benchmark_quality = json.load(f, object_hook=dict) + benchmark_quality = yaml.load(f) benchmark_performance = {} + table = PrettyTable() + table.set_style(ORGMODE) + table.field_names = ["Script Model Name", "component", "time consuming"] for history_tag in history_tags: for tag in benchmark_quality: if '_'.join(tag.split("_")[2:]) == history_tag: @@ -335,28 +228,28 @@ def comparison_quality(group_name, history_tags, history_info_dir, time_consumin if benchmark_performance is not None: benchmark_performance[group_name] = time_consuming - table = PrettyTable() - table.set_style(ORGMODE) - table.field_names = ["Script Model Name", "time consuming"] for script_model_name in benchmark_performance: - table.add_row([f"{script_model_name}"] + - [f"{TxtStyle.FIELD_VAL}{benchmark_performance[script_model_name]}{TxtStyle.END}"]) - print("\n") - print(table.get_string(title=f"{TxtStyle.TITLE}Performance 
comparison results{TxtStyle.END}")) - print("#" * 60) + for cpn, time in benchmark_performance[script_model_name].items(): + table.add_row([f"{script_model_name}"] + + [f"{TxtStyle.FIELD_VAL}{cpn}{TxtStyle.END}"] + + [f"{TxtStyle.FIELD_VAL}{time}{TxtStyle.END}"]) + # print("\n") + # print(table.get_string(title=f"{TxtStyle.TITLE}Performance comparison results{TxtStyle.END}")) + # print("#" * 60) + return table.get_string(title=f"{TxtStyle.TITLE}Performance comparison results{TxtStyle.END}") def save_quality(storage_tag, save_dir, time_consuming): os.makedirs(os.path.dirname(save_dir), exist_ok=True) if os.path.exists(save_dir): with open(save_dir, 'r') as f: - benchmark_quality = json.load(f, object_hook=dict) + benchmark_quality = yaml.load(f) else: benchmark_quality = {} benchmark_quality.update({storage_tag: time_consuming}) try: with open(save_dir, 'w') as fp: - json.dump(benchmark_quality, fp, indent=2) + yaml.dump(benchmark_quality, fp) print("\n" + "Storage successful, please check: ", save_dir) except Exception: print("\n" + "Storage failed, please check: ", save_dir) diff --git a/python/fate_test/fate_test/scripts/testsuite_cli.py b/python/fate_test/fate_test/scripts/testsuite_cli.py index 3bc295bfe1..b58c9ee73d 100644 --- a/python/fate_test/fate_test/scripts/testsuite_cli.py +++ b/python/fate_test/fate_test/scripts/testsuite_cli.py @@ -39,9 +39,12 @@ @click.command("suite") @click.option('-i', '--include', required=True, type=click.Path(exists=True), multiple=True, metavar="", - help="include *testsuite.json under these paths") + help="include *testsuite.yaml under these paths") @click.option('-e', '--exclude', type=click.Path(exists=True), multiple=True, - help="exclude *testsuite.json under these paths") + help="exclude *testsuite.yaml under these paths") +@click.option('-p', '--task-cores', type=int, help="processors per node") +@click.option('-m', '--timeout', type=int, + help="maximum running time of job") @click.option("-g", '--glob', type=str, 
help="glob string to filter sub-directory of path specified by ") @click.option("--skip-jobs", is_flag=True, default=False, @@ -57,7 +60,7 @@ @SharedOptions.get_shared_options(hidden=True) @click.pass_context def run_suite(ctx, include, exclude, glob, - skip_jobs, skip_data, data_only, clean_data, provider, **kwargs): + skip_jobs, skip_data, data_only, clean_data, provider, task_cores, timeout, **kwargs): """ process testsuite """ @@ -66,6 +69,11 @@ def run_suite(ctx, include, exclude, glob, config_inst = ctx.obj["config"] if ctx.obj["extend_sid"] is not None: config_inst.extend_sid = ctx.obj["extend_sid"] + if task_cores is not None: + config_inst.update_conf(task_cores=task_cores) + if timeout is not None: + config_inst.update_conf(timeout=timeout) + """if ctx.obj["auto_increasing_sid"] is not None: config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" if clean_data is None: diff --git a/python/fate_test/fate_test/utils.py b/python/fate_test/fate_test/utils.py index 74775354ee..a66508dd1e 100644 --- a/python/fate_test/fate_test/utils.py +++ b/python/fate_test/fate_test/utils.py @@ -369,3 +369,30 @@ def extract_data(df, col_name, convert_float=True, keep_id=False): return df[[df.columns[0], col_name]].to_numpy() else: return df[col_name].to_numpy().astype(np.float64) + + +def parse_job_time_info(job_time_info): + time_info_summary = [] + for cpn in job_time_info: + cpn_name = cpn.get("task_name") + cpn_elapsed = cpn.get("elapsed") + time_info_summary.append((cpn_name, cpn_elapsed)) + return time_info_summary + + +def pretty_time_info_summary(time_info_summary, job_name): + table = PrettyTable() + table.set_style(ORGMODE) + field_names = ["component name", "time consuming"] + table.field_names = field_names + time_summary = time_info_summary.get("time_summary", []) + for cpn_name, cpn_elapse in time_summary: + table.add_row( + [ + f"{TxtStyle.FIELD_VAL}{cpn_name}{TxtStyle.END}", + f"{TxtStyle.FIELD_VAL}{cpn_elapse}{TxtStyle.END}", + ] + ) + + return 
table.get_string(title=f"{TxtStyle.TITLE}Component Time Summary: " + f"{job_name}({time_info_summary['job_id']}){TxtStyle.END}") From d5f3ff50054771a3cf32671111a198183b758f8a Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 4 Aug 2023 19:20:31 +0800 Subject: [PATCH 14/30] add examples(#5008) Signed-off-by: Yu Wu --- examples/benchmark_quality/linr/fate-linr.py | 5 ++ examples/benchmark_quality/linr/local-linr.py | 72 +++++++++++++++++++ examples/pipeline/test_upload.py | 2 - 3 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 examples/benchmark_quality/linr/local-linr.py diff --git a/examples/benchmark_quality/linr/fate-linr.py b/examples/benchmark_quality/linr/fate-linr.py index cb7866ad8d..248b0afc79 100644 --- a/examples/benchmark_quality/linr/fate-linr.py +++ b/examples/benchmark_quality/linr/fate-linr.py @@ -82,6 +82,11 @@ def main(config="../../config.yaml", param="./linr_config.yaml", namespace=""): # pipeline.add_task(linr_1) pipeline.add_task(evaluation_0) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + pipeline.compile() print(pipeline.get_dag()) pipeline.fit() diff --git a/examples/benchmark_quality/linr/local-linr.py b/examples/benchmark_quality/linr/local-linr.py new file mode 100644 index 0000000000..bffafbb524 --- /dev/null +++ b/examples/benchmark_quality/linr/local-linr.py @@ -0,0 +1,72 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import os + +import numpy as np +import pandas +from fate_client.pipeline.utils.test_utils import JobConfig +from sklearn.linear_model import SGDRegressor +from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score + + +def main(config="../../config.yaml", param="./linr_sklearn_config.yaml"): + # obtain config + if isinstance(param, str): + param = JobConfig.load_from_file(param) + data_guest = param["data_guest"] + data_host = param["data_host"] + idx = param["idx"] + label_name = param["label_name"] + + if isinstance(config, str): + config = JobConfig.load_from_file(config) + print(f"config: {config}") + data_base_dir = config["data_base_dir"] + else: + data_base_dir = config.data_base_dir + + # prepare data + df_guest = pandas.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) + df_host = pandas.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) + df = df_guest.join(df_host, rsuffix="host") + y = df[label_name] + X = df.drop(label_name, axis=1) + lm = SGDRegressor(loss="squared_error", penalty=param["penalty"], random_state=42, + fit_intercept=True, max_iter=param["epochs"], average=param["batch_size"]) + lm_fit = lm.fit(X, y) + y_pred = lm_fit.predict(X) + + mse = mean_squared_error(y, y_pred) + rmse = np.sqrt(mse) + r2 = r2_score(y, y_pred) + explained_var = explained_variance_score(y, y_pred) + metric_summary = {"r2_score": r2, + "mse": mse, + "rmse": rmse} + data_summary = {} + return data_summary, metric_summary + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("BENCHMARK-QUALITY LOCAL JOB") + parser.add_argument("-c", "--config", type=str, + help="config file", default="../../config.yaml") + parser.add_argument("-p", "--param", type=str, + help="config file for params", default="./linr_sklearn_config.yaml") + args = parser.parse_args() + main(args.config, 
args.param) diff --git a/examples/pipeline/test_upload.py b/examples/pipeline/test_upload.py index c44261de8a..403926bbe6 100644 --- a/examples/pipeline/test_upload.py +++ b/examples/pipeline/test_upload.py @@ -31,7 +31,6 @@ 'weight_type': 'float32'} pipeline.transform_local_file_to_dataframe( # file="${abs_path_of_data_guest}", - file="/Users/yuwu/PycharmProjects/FATE/examples/data/breast_hetero_guest.csv", meta=meta, head=True, namespace="experiment", name="breast_hetero_guest") @@ -53,7 +52,6 @@ pipeline.set_site_party_id("0") pipeline.transform_local_file_to_dataframe( # file="${abs_path_of_data_host}", - file="/Users/yuwu/PycharmProjects/FATE/examples/data/breast_hetero_host.csv", meta=meta, head=True, namespace="experiment", name="breast_hetero_host") From fe4b99610ff1809cb7836903f28b37a944f8fe88 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 7 Aug 2023 19:06:18 +0800 Subject: [PATCH 15/30] fixed cpn order in fate-test performance comparison table(#5008) fix fate-test performance param parsing(#5008) Signed-off-by: Yu Wu --- .../benchmark_quality/lr/breast_config.yaml | 16 ++++---- python/fate_test/fate_test/_flow_client.py | 2 +- .../fate_test/scripts/benchmark_cli.py | 4 +- .../fate_test/scripts/performance_cli.py | 40 +++++++++---------- python/fate_test/fate_test/utils.py | 20 ++++++---- 5 files changed, 43 insertions(+), 39 deletions(-) diff --git a/examples/benchmark_quality/lr/breast_config.yaml b/examples/benchmark_quality/lr/breast_config.yaml index 142f056628..4feac1af67 100644 --- a/examples/benchmark_quality/lr/breast_config.yaml +++ b/examples/benchmark_quality/lr/breast_config.yaml @@ -2,21 +2,21 @@ data_guest: "breast_hetero_guest" data_host: "breast_hetero_host" idx: "id" label_name: "y" -epochs: 15 +epochs: 8 init_param: fit_intercept: True - method: "uniform" + method: "random_uniform" random_state: 42 learning_rate_scheduler: method: "constant" scheduler_params: - factor: 1.0 - total_iters: 100 + factor: 0.2 + total_iters: 18 optimizer: 
method: "rmsprop" - penalty: "l1" + penalty: "l2" optimizer_params: - lr: 0.5 - alpha: 0.1 -batch_size: null + lr: 0.15 + alpha: 0.2 +batch_size: 240 early_stop: "diff" \ No newline at end of file diff --git a/python/fate_test/fate_test/_flow_client.py b/python/fate_test/fate_test/_flow_client.py index e8aa76fdbc..098c6a1bd9 100644 --- a/python/fate_test/fate_test/_flow_client.py +++ b/python/fate_test/fate_test/_flow_client.py @@ -280,7 +280,7 @@ def get_version(self): retmsg = response['message'] if retcode != 0 or retmsg != 'success': raise RuntimeError(f"get version error: {response}") - fate_version = response["data"]["provider_name"] + fate_version = response["data"][0]["provider_name"] except Exception as e: raise RuntimeError(f"get version error: {response}") from e return fate_version diff --git a/python/fate_test/fate_test/scripts/benchmark_cli.py b/python/fate_test/fate_test/scripts/benchmark_cli.py index 4c5d84b8ee..365484a74e 100644 --- a/python/fate_test/fate_test/scripts/benchmark_cli.py +++ b/python/fate_test/fate_test/scripts/benchmark_cli.py @@ -77,9 +77,7 @@ def run_benchmark(ctx, include, exclude, glob, skip_data, tol, clean_data, stora if not yes and not click.confirm("running?"): return client = Clients(config_inst) - # @todo: change to client query result - # fate_version = client["guest_0"].get_version() - fate_version = "beta-2.0.0" + fate_version = client["guest_0"].get_version() for i, suite in enumerate(suites): # noinspection PyBroadException try: diff --git a/python/fate_test/fate_test/scripts/performance_cli.py b/python/fate_test/fate_test/scripts/performance_cli.py index 0f120de1fd..dbd981ac5a 100644 --- a/python/fate_test/fate_test/scripts/performance_cli.py +++ b/python/fate_test/fate_test/scripts/performance_cli.py @@ -19,7 +19,6 @@ import uuid from datetime import timedelta from inspect import signature -from ruamel import yaml import click from fate_test._client import Clients @@ -27,10 +26,10 @@ from fate_test._io import 
LOGGER, echo from fate_test._parser import PerformanceSuite from fate_test.scripts._options import SharedOptions -from fate_test.scripts._utils import _load_testsuites, _upload_data, _delete_data, _load_module_from_script, \ - _add_replace_hook +from fate_test.scripts._utils import _load_testsuites, _upload_data, _delete_data, _load_module_from_script from fate_test.utils import TxtStyle, parse_job_time_info, pretty_time_info_summary from prettytable import PrettyTable, ORGMODE +from ruamel import yaml @click.command("performance") @@ -56,7 +55,7 @@ @click.option("--disable-clean-data", "clean_data", flag_value=False, default=None) @SharedOptions.get_shared_options(hidden=True) @click.pass_context -def run_task(ctx, job_type, include, replace, timeout, epochs, +def run_task(ctx, job_type, include, timeout, epochs, max_depth, num_trees, task_cores, storage_tag, history_tag, skip_data, clean_data, provider, **kwargs): """ Test the performance of big data tasks, alias: bp @@ -79,13 +78,13 @@ def run_task(ctx, job_type, include, replace, timeout, epochs, clean_data = config_inst.clean_data def get_perf_template(conf: Config, job_type): - perf_dir = os.path.join(os.path.abspath(conf.perf_template_dir) + '/' + job_type + '/' + "*testsuite.yaml") + perf_dir = os.path.join(os.path.abspath(conf.perf_template_dir) + '/' + job_type + '/' + "*performance.yaml") return glob.glob(perf_dir) if not include: include = get_perf_template(config_inst, job_type) # prepare output dir and json hooks - _add_replace_hook(replace) + # _add_replace_hook(replace) echo.welcome() echo.echo(f"testsuite namespace: {namespace}", fg='red') @@ -93,7 +92,7 @@ def get_perf_template(conf: Config, job_type): suites = _load_testsuites(includes=include, excludes=tuple(), glob=None, provider=provider, suffix="performance.yaml", suite_type="performance") for i, suite in enumerate(suites): - echo.echo(f"\tdataset({len(suite.dataset)}) dsl jobs({len(suite.jobs)}) {suite.path}") + 
echo.echo(f"\tdataset({len(suite.dataset)}) pipeline jobs({len(suite.pipeline_jobs)}) {suite.path}") if not yes and not click.confirm("running?"): return @@ -115,7 +114,8 @@ def get_perf_template(conf: Config, job_type): echo.stdout_newline() try: - job_time_info = _run_performance_jobs(config_inst, suite, namespace, data_namespace_mangling, client, + job_time_info = _run_performance_jobs(config_inst, suite, namespace, data_namespace_mangling, + client, epochs, max_depth, num_trees) except Exception as e: raise RuntimeError(f"exception occur while running pipeline jobs for {suite.path}") from e @@ -130,9 +130,8 @@ def get_perf_template(conf: Config, job_type): performance_dir = "/".join( [os.path.join(os.path.abspath(config_inst.cache_directory), 'benchmark_history', "performance.yaml")]) - # @todo: change to client query result - # fate_version = clients["guest_0"].get_version() - fate_version = "beta-2.0.0" + fate_version = client["guest_0"].get_version() + # fate_version = "beta-2.0.0" if history_tag: history_tag = ["_".join([i, job_name]) for i in history_tag] history_compare_result = comparison_quality(job_name, @@ -149,8 +148,6 @@ def get_perf_template(conf: Config, job_type): echo.echo("#" * 60) echo.echo("\n".join(compare_summary)) - echo.echo() - except Exception: exception_id = uuid.uuid1() echo.echo(f"exception in {suite.path}, exception_id={exception_id}") @@ -163,7 +160,7 @@ def get_perf_template(conf: Config, job_type): @LOGGER.catch -def _run_performance_jobs(config: Config, suite: PerformanceSuite, tol: float, namespace: str, +def _run_performance_jobs(config: Config, suite: PerformanceSuite, namespace: str, data_namespace_mangling: bool, client, epochs, max_depth, num_trees): # pipeline demo goes here job_n = len(suite.pipeline_jobs) @@ -199,7 +196,9 @@ def _run_performance_jobs(config: Config, suite: PerformanceSuite, tol: float, n else: job_id = mod.main() echo.echo(f"[{j + 1}/{job_n}] job: {job.job_name} Success!\n") - ret_msg = 
client.query_time_elapse(job_id, role="guest", party_id=config.parties.guest[0]).get("data") + ret_msg = client["guest_0"].query_job(job_id=job_id, + role="guest", + party_id=config.parties.guest[0]).get("data") time_summary = parse_job_time_info(ret_msg) job_time_history[job_name] = {"job_id": job_id, "time_summary": time_summary} echo.echo(f"[{j + 1}/{job_n}] job: {job.job_name} time info: {time_summary}\n") @@ -216,7 +215,7 @@ def _run_performance_jobs(config: Config, suite: PerformanceSuite, tol: float, n def comparison_quality(group_name, history_tags, history_info_dir, time_consuming): assert os.path.exists(history_info_dir), f"Please check the {history_info_dir} Is it deleted" with open(history_info_dir, 'r') as f: - benchmark_quality = yaml.load(f) + benchmark_quality = yaml.safe_load(f) benchmark_performance = {} table = PrettyTable() table.set_style(ORGMODE) @@ -229,10 +228,11 @@ def comparison_quality(group_name, history_tags, history_info_dir, time_consumin benchmark_performance[group_name] = time_consuming for script_model_name in benchmark_performance: - for cpn, time in benchmark_performance[script_model_name].items(): + time_history = benchmark_performance[script_model_name] + for cpn in time_history.get("cpn_list"): table.add_row([f"{script_model_name}"] + - [f"{TxtStyle.FIELD_VAL}{cpn}{TxtStyle.END}"] + - [f"{TxtStyle.FIELD_VAL}{time}{TxtStyle.END}"]) + [f"{cpn}"] + + [f"{TxtStyle.FIELD_VAL}{timedelta(seconds=time_history.get(cpn))}{TxtStyle.END}"]) # print("\n") # print(table.get_string(title=f"{TxtStyle.TITLE}Performance comparison results{TxtStyle.END}")) # print("#" * 60) @@ -243,7 +243,7 @@ def save_quality(storage_tag, save_dir, time_consuming): os.makedirs(os.path.dirname(save_dir), exist_ok=True) if os.path.exists(save_dir): with open(save_dir, 'r') as f: - benchmark_quality = yaml.load(f) + benchmark_quality = yaml.safe_load(f) else: benchmark_quality = {} benchmark_quality.update({storage_tag: time_consuming}) diff --git 
a/python/fate_test/fate_test/utils.py b/python/fate_test/fate_test/utils.py index a66508dd1e..2176109e2c 100644 --- a/python/fate_test/fate_test/utils.py +++ b/python/fate_test/fate_test/utils.py @@ -16,6 +16,7 @@ import math import os +from datetime import timedelta import numpy as np from colorama import init, deinit, Fore, Style @@ -372,11 +373,15 @@ def extract_data(df, col_name, convert_float=True, keep_id=False): def parse_job_time_info(job_time_info): - time_info_summary = [] + time_info_summary = {} + cpn_list = [] for cpn in job_time_info: cpn_name = cpn.get("task_name") - cpn_elapsed = cpn.get("elapsed") - time_info_summary.append((cpn_name, cpn_elapsed)) + # convert milliseconds to seconds + cpn_elapsed = round(cpn.get("elapsed") / 1000) + time_info_summary[cpn_name] = cpn_elapsed + cpn_list.append(cpn_name) + time_info_summary["cpn_list"] = cpn_list return time_info_summary @@ -385,12 +390,13 @@ def pretty_time_info_summary(time_info_summary, job_name): table.set_style(ORGMODE) field_names = ["component name", "time consuming"] table.field_names = field_names - time_summary = time_info_summary.get("time_summary", []) - for cpn_name, cpn_elapse in time_summary: + time_summary = time_info_summary.get("time_summary", {}) + for cpn_name in time_summary["cpn_list"]: + cpn_elapse = time_summary.get(cpn_name) table.add_row( [ - f"{TxtStyle.FIELD_VAL}{cpn_name}{TxtStyle.END}", - f"{TxtStyle.FIELD_VAL}{cpn_elapse}{TxtStyle.END}", + f"{cpn_name}", + f"{TxtStyle.FIELD_VAL}{timedelta(seconds=cpn_elapse)}{TxtStyle.END}", ] ) From 5713ee76ae90d9dc762360d138e5f01680ac9b84 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 9 Aug 2023 10:51:47 +0800 Subject: [PATCH 16/30] fix l1 penalty of optimizer(#4659) edit fate-test performance examples(#5008) Signed-off-by: Yu Wu --- .../coordinated_lr/test_lr_sid.py | 17 ++--------------- .../benchmark_quality/lr/breast_config.yaml | 10 +++++----- .../benchmark_quality/lr/lr_benchmark.yaml | 18 +++++++++--------- 
.../benchmark_quality/lr/sklearn-lr-binary.py | 4 ++-- .../fate/ml/glm/hetero/coordinated_lr/guest.py | 2 +- .../fate/ml/glm/hetero/coordinated_lr/host.py | 3 ++- python/fate/ml/utils/_optimizer.py | 6 +++--- 7 files changed, 24 insertions(+), 36 deletions(-) diff --git a/examples/benchmark_performance/coordinated_lr/test_lr_sid.py b/examples/benchmark_performance/coordinated_lr/test_lr_sid.py index ebe2b289e0..fc3f69209a 100644 --- a/examples/benchmark_performance/coordinated_lr/test_lr_sid.py +++ b/examples/benchmark_performance/coordinated_lr/test_lr_sid.py @@ -37,21 +37,8 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): assert isinstance(param, dict) - data_set = param.get("data_guest").split('/')[-1] - if data_set == "default_credit_hetero_guest.csv": - guest_data_table = 'default_credit_hetero_guest' - host_data_table = 'default_credit_hetero_host' - elif data_set == 'breast_hetero_guest.csv': - guest_data_table = 'breast_hetero_guest' - host_data_table = 'breast_hetero_host' - elif data_set == 'give_credit_hetero_guest.csv': - guest_data_table = 'give_credit_hetero_guest' - host_data_table = 'give_credit_hetero_host' - elif data_set == 'epsilon_5k_hetero_guest.csv': - guest_data_table = 'epsilon_5k_hetero_guest' - host_data_table = 'epsilon_5k_hetero_host' - else: - raise ValueError(f"Cannot recognized data_set: {data_set}") + guest_data_table = param.get("data_guest") + host_data_table = param.get("data_host") guest_train_data = {"name": guest_data_table, "namespace": f"experiment{namespace}"} host_train_data = {"name": host_data_table, "namespace": f"experiment{namespace}"} diff --git a/examples/benchmark_quality/lr/breast_config.yaml b/examples/benchmark_quality/lr/breast_config.yaml index 4feac1af67..a3bef0a73c 100644 --- a/examples/benchmark_quality/lr/breast_config.yaml +++ b/examples/benchmark_quality/lr/breast_config.yaml @@ -2,7 +2,7 @@ data_guest: "breast_hetero_guest" data_host: "breast_hetero_host" idx: "id" 
label_name: "y" -epochs: 8 +epochs: 20 init_param: fit_intercept: True method: "random_uniform" @@ -10,13 +10,13 @@ init_param: learning_rate_scheduler: method: "constant" scheduler_params: - factor: 0.2 - total_iters: 18 + factor: 1.0 + total_iters: 5 optimizer: method: "rmsprop" penalty: "l2" optimizer_params: - lr: 0.15 - alpha: 0.2 + lr: 0.12 + alpha: 0.1 batch_size: 240 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/lr_benchmark.yaml b/examples/benchmark_quality/lr/lr_benchmark.yaml index c857d20e3f..0dd056766e 100644 --- a/examples/benchmark_quality/lr/lr_benchmark.yaml +++ b/examples/benchmark_quality/lr/lr_benchmark.yaml @@ -179,15 +179,15 @@ data: table_name: vehicle_scale_hetero_host namespace: experiment role: host_0 -hetero_lr-binary-0-breast: - local: - script: "./sklearn-lr-binary.py" - conf: "./breast_lr_sklearn_config.yaml" - FATE-hetero-lr: - script: "./pipeline-lr-binary.py" - conf: "./breast_config.yaml" - compare_setting: - relative_tol: 0.01 +#hetero_lr-binary-0-breast: +# local: +# script: "./sklearn-lr-binary.py" +# conf: "./breast_lr_sklearn_config.yaml" +# FATE-hetero-lr: +# script: "./pipeline-lr-binary.py" +# conf: "./breast_config.yaml" +# compare_setting: +# relative_tol: 0.01 #hetero_lr-binary-1-default-credit: # local: # script: "./sklearn-lr-binary.py" diff --git a/examples/benchmark_quality/lr/sklearn-lr-binary.py b/examples/benchmark_quality/lr/sklearn-lr-binary.py index e418a1297a..94ac82fe18 100644 --- a/examples/benchmark_quality/lr/sklearn-lr-binary.py +++ b/examples/benchmark_quality/lr/sklearn-lr-binary.py @@ -42,7 +42,7 @@ def main(config="../../config.yaml", param="./breast_lr_sklearn_config.yaml"): config_param = { "penalty": param["penalty"], - "max_iter": 100, + "max_iter": param["epochs"], "alpha": param["alpha"], "learning_rate": "optimal", "eta0": param["eta0"], @@ -76,7 +76,7 @@ def main(config="../../config.yaml", param="./breast_lr_sklearn_config.yaml"): fpr, tpr, 
thresholds = roc_curve(y_test, y_prob) ks = max(tpr - fpr) - result = {"auc": auc_score, "binary_recall": recall, "binary_precision": pr, "binary_accuracy": acc} + result = {"auc": auc_score, "recall": recall, "binary": pr, "accuracy": acc} print(result) print(f"coef_: {lm_fit.coef_}, intercept_: {lm_fit.intercept_}, n_iter: {lm_fit.n_iter_}") return {}, result diff --git a/python/fate/ml/glm/hetero/coordinated_lr/guest.py b/python/fate/ml/glm/hetero/coordinated_lr/guest.py index 5adc10b73b..5164bb7d5c 100644 --- a/python/fate/ml/glm/hetero/coordinated_lr/guest.py +++ b/python/fate/ml/glm/hetero/coordinated_lr/guest.py @@ -365,7 +365,7 @@ def predict(self, ctx, test_data): test_data["intercept"] = 1.0 X = test_data.values.as_tensor() # logger.info(f"in predict, w: {self.w}") - pred = torch.matmul(X, self.w) + pred = torch.matmul(X, self.w.detach()) for h_pred in ctx.hosts.get("h_pred"): pred += h_pred pred = torch.sigmoid(pred) diff --git a/python/fate/ml/glm/hetero/coordinated_lr/host.py b/python/fate/ml/glm/hetero/coordinated_lr/host.py index 78e52e85ed..cb49a5a954 100644 --- a/python/fate/ml/glm/hetero/coordinated_lr/host.py +++ b/python/fate/ml/glm/hetero/coordinated_lr/host.py @@ -200,10 +200,11 @@ def asynchronous_compute_gradient(self, batch_ctx, encryptor, w, X): batch_ctx.guest.put("Xw2_h", encryptor.encrypt(torch.matmul(Xw_h.T, Xw_h))) loss_norm = self.optimizer.loss_norm(w) + if loss_norm is not None: batch_ctx.guest.put("h_loss", encryptor.encrypt(loss_norm)) else: - batch_ctx.guest.put(h_loss=loss_norm) + batch_ctx.guest.put("h_loss", loss_norm) g = 1 / h * (half_g + guest_half_g) return g diff --git a/python/fate/ml/utils/_optimizer.py b/python/fate/ml/utils/_optimizer.py index b0833163d2..e7868e85d6 100644 --- a/python/fate/ml/utils/_optimizer.py +++ b/python/fate/ml/utils/_optimizer.py @@ -158,8 +158,8 @@ def _l1_updator(self, model_weights, gradient, fit_intercept, lr): ) if fit_intercept: - new_weights = torch.concat((new_weights, 
model_weights.intercept_)) - new_weights[-1] -= gradient[-1] + new_intercept = model_weights[-1] - gradient[-1] + new_weights = torch.concat((new_weights, new_intercept.reshape((1, 1)))) return new_weights @@ -213,7 +213,7 @@ def regularization_update( def __l1_loss_norm(self, model_weights): loss_norm = torch.sum(self.alpha * model_weights) - return loss_norm + return loss_norm.reshape((1, 1)) def __l2_loss_norm(self, model_weights): loss_norm = 0.5 * self.alpha * \ From 97a903a967561f133c46c8f1513f1a1215b87658 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 9 Aug 2023 15:14:24 +0800 Subject: [PATCH 17/30] edit fate-test performance examples(#5008) Signed-off-by: Yu Wu --- .../benchmark_quality/lr/default_credit_config.yaml | 8 ++++---- examples/benchmark_quality/lr/epsilon_5k_config.yaml | 8 ++++---- .../lr/epsilon_5k_lr_sklearn_config.yaml | 4 ++-- examples/benchmark_quality/lr/give_credit_config.yaml | 11 ++++++----- examples/benchmark_quality/lr/sklearn-lr-binary.py | 2 +- python/fate_test/fate_test/scripts/data_cli.py | 8 ++++---- 6 files changed, 21 insertions(+), 20 deletions(-) diff --git a/examples/benchmark_quality/lr/default_credit_config.yaml b/examples/benchmark_quality/lr/default_credit_config.yaml index c45ef53d8a..8033d8af0d 100644 --- a/examples/benchmark_quality/lr/default_credit_config.yaml +++ b/examples/benchmark_quality/lr/default_credit_config.yaml @@ -2,16 +2,16 @@ data_guest: "default_credit_hetero_guest" data_host: "default_credit_hetero_host" idx: "id" label_name: "y" -epochs: 30 +epochs: 22 init_param: fit_intercept: True method: "zeros" random_state: 42 learning_rate_scheduler: - method: "constant" + method: "linear" scheduler_params: - factor: 1.0 - total_iters: 10000 + start_factor: 0.7 + total_iters: 1000 optimizer: method: "rmsprop" penalty: "L2" diff --git a/examples/benchmark_quality/lr/epsilon_5k_config.yaml b/examples/benchmark_quality/lr/epsilon_5k_config.yaml index 6822e02ea7..39144f4fdb 100644 --- 
a/examples/benchmark_quality/lr/epsilon_5k_config.yaml +++ b/examples/benchmark_quality/lr/epsilon_5k_config.yaml @@ -2,21 +2,21 @@ data_guest: "epsilon_5k_hetero_guest" data_host: "epsilon_5k_hetero_host" idx: "id" label_name: "y" -epochs: 30 +epochs: 8 batch_size: 2500 init_param: fit_intercept: True method: "random" random_state: 42 learning_rate_scheduler: - method: "constant" + method: "linear" scheduler_params: - factor: 1.0 + start_factor: 0.7 total_iters: 1000 optimizer: method: "adam" penalty: "L2" alpha: 0.0001 optimizer_params: - lr: 0.3 + lr: 0.43 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/epsilon_5k_lr_sklearn_config.yaml b/examples/benchmark_quality/lr/epsilon_5k_lr_sklearn_config.yaml index bef62e89aa..7559f0bfa6 100644 --- a/examples/benchmark_quality/lr/epsilon_5k_lr_sklearn_config.yaml +++ b/examples/benchmark_quality/lr/epsilon_5k_lr_sklearn_config.yaml @@ -2,10 +2,10 @@ data_guest: "examples/data/epsilon_5k_hetero_guest.csv" data_host: "examples/data/epsilon_5k_hetero_host.csv" idx: "id" label_name: "y" -epochs: 30 +epochs: 10 fit_intercept: True method: "rmsprop" penalty: "L2" eta0: 0.1 -alpha: 0.5 +alpha: 0.001 batch_size: 5000 \ No newline at end of file diff --git a/examples/benchmark_quality/lr/give_credit_config.yaml b/examples/benchmark_quality/lr/give_credit_config.yaml index 73f2285fa1..dc041b48fe 100644 --- a/examples/benchmark_quality/lr/give_credit_config.yaml +++ b/examples/benchmark_quality/lr/give_credit_config.yaml @@ -7,14 +7,15 @@ init_param: fit_intercept: True method: "zeros" learning_rate_scheduler: - method: "constant" + method: "linear" scheduler_params: - factor: 1.0 - total_iters: 100 + factor: 0.7 + total_iters: 1000 optimizer: method: "adam" penalty: "L2" + alpha: 10 optimizer_params: - lr: 0.15 -batch_size: 550 + lr: 0.2 +batch_size: 5500 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/sklearn-lr-binary.py 
b/examples/benchmark_quality/lr/sklearn-lr-binary.py index 94ac82fe18..2a2710be2f 100644 --- a/examples/benchmark_quality/lr/sklearn-lr-binary.py +++ b/examples/benchmark_quality/lr/sklearn-lr-binary.py @@ -76,7 +76,7 @@ def main(config="../../config.yaml", param="./breast_lr_sklearn_config.yaml"): fpr, tpr, thresholds = roc_curve(y_test, y_prob) ks = max(tpr - fpr) - result = {"auc": auc_score, "recall": recall, "binary": pr, "accuracy": acc} + result = {"auc": auc_score, "recall": recall, "binary_precision": pr, "accuracy": acc} print(result) print(f"coef_: {lm_fit.coef_}, intercept_: {lm_fit.intercept_}, n_iter: {lm_fit.n_iter_}") return {}, result diff --git a/python/fate_test/fate_test/scripts/data_cli.py b/python/fate_test/fate_test/scripts/data_cli.py index 01d8c19c33..7a09980dd2 100644 --- a/python/fate_test/fate_test/scripts/data_cli.py +++ b/python/fate_test/fate_test/scripts/data_cli.py @@ -51,8 +51,8 @@ def upload(ctx, include, exclude, glob, suite_type, role, config_type, **kwargs) config_inst = ctx.obj["config"] if ctx.obj["extend_sid"] is not None: config_inst.extend_sid = ctx.obj["extend_sid"] - if ctx.obj["auto_increasing_sid"] is not None: - config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"] + """if ctx.obj["auto_increasing_sid"] is not None: + config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" yes = ctx.obj["yes"] echo.welcome() echo.echo(f"testsuite namespace: {namespace}", fg='red') @@ -176,8 +176,8 @@ def generate(ctx, include, host_data_type, encryption_type, match_rate, sparsity config_inst = ctx.obj["config"] if ctx.obj["extend_sid"] is not None: config_inst.extend_sid = ctx.obj["extend_sid"] - if ctx.obj["auto_increasing_sid"] is not None: - config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"] + """if ctx.obj["auto_increasing_sid"] is not None: + config_inst.auto_increasing_sid = ctx.obj["auto_increasing_sid"]""" if parallelize and upload_data: upload_data = False yes = ctx.obj["yes"] From 
2db064db9f8b49db8eadd1b14141d8a96d2ceab9 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Wed, 9 Aug 2023 19:26:34 +0800 Subject: [PATCH 18/30] use encrypt_tensor api for phe encryptor(#4659) edit fate_test examples(#5008) replace Intersection with PSI in examples Signed-off-by: Yu Wu --- examples/benchmark_quality/linr/fate-linr.py | 18 ++--- .../benchmark_quality/lr/breast_config.yaml | 6 +- .../benchmark_quality/lr/lr_benchmark.yaml | 72 +++++++++---------- .../lr/pipeline-lr-binary.py | 18 ++--- .../benchmark_quality/lr/pipeline-lr-multi.py | 18 ++--- .../pipeline/coordinated_lr/test_lr_sid.py | 33 ++++----- .../pipeline/coordinated_lr/test_lr_sid_cv.py | 16 ++--- .../coordinated_lr/test_lr_sid_warm_start.py | 20 +++--- .../ml/glm/hetero/coordinated_linr/host.py | 13 ++-- .../ml/glm/hetero/coordinated_lr/guest.py | 3 +- .../fate/ml/glm/hetero/coordinated_lr/host.py | 13 ++-- 11 files changed, 117 insertions(+), 113 deletions(-) diff --git a/examples/benchmark_quality/linr/fate-linr.py b/examples/benchmark_quality/linr/fate-linr.py index 248b0afc79..bc85a9363e 100644 --- a/examples/benchmark_quality/linr/fate-linr.py +++ b/examples/benchmark_quality/linr/fate-linr.py @@ -17,7 +17,7 @@ import argparse from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLinR, Intersection +from fate_client.pipeline.components.fate import CoordinatedLinR, PSI from fate_client.pipeline.components.fate import Evaluation from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils @@ -43,11 +43,11 @@ def main(config="../../config.yaml", param="./linr_config.yaml", namespace=""): pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) - intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], - namespace=guest_train_data["namespace"])) - 
intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], - namespace=host_train_data["namespace"])) + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], + namespace=guest_train_data["namespace"])) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], + namespace=host_train_data["namespace"])) linr_param = { } @@ -63,10 +63,10 @@ def main(config="../../config.yaml", param="./linr_config.yaml", namespace=""): } linr_param.update(config_param) linr_0 = CoordinatedLinR("linr_0", - train_data=intersect_0.outputs["output_data"], + train_data=psi_0.outputs["output_data"], **config_param) """linr_1 = CoordinatedLinR("linr_1", - test_data=intersect_0.outputs["output_data"], + test_data=psi_0.outputs["output_data"], input_model=linr_0.outputs["output_model"])""" evaluation_0 = Evaluation("evaluation_0", @@ -77,7 +77,7 @@ def main(config="../../config.yaml", param="./linr_config.yaml", namespace=""): "rmse"], input_data=linr_0.outputs["train_output_data"]) - pipeline.add_task(intersect_0) + pipeline.add_task(psi_0) pipeline.add_task(linr_0) # pipeline.add_task(linr_1) pipeline.add_task(evaluation_0) diff --git a/examples/benchmark_quality/lr/breast_config.yaml b/examples/benchmark_quality/lr/breast_config.yaml index a3bef0a73c..3d1747cc04 100644 --- a/examples/benchmark_quality/lr/breast_config.yaml +++ b/examples/benchmark_quality/lr/breast_config.yaml @@ -10,13 +10,13 @@ init_param: learning_rate_scheduler: method: "constant" scheduler_params: - factor: 1.0 + factor: 0.5 total_iters: 5 optimizer: method: "rmsprop" penalty: "l2" optimizer_params: - lr: 0.12 - alpha: 0.1 + lr: 0.15 + alpha: 0.01 batch_size: 240 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/lr_benchmark.yaml b/examples/benchmark_quality/lr/lr_benchmark.yaml index 0dd056766e..d7852909a2 100644 --- 
a/examples/benchmark_quality/lr/lr_benchmark.yaml +++ b/examples/benchmark_quality/lr/lr_benchmark.yaml @@ -179,24 +179,24 @@ data: table_name: vehicle_scale_hetero_host namespace: experiment role: host_0 -#hetero_lr-binary-0-breast: -# local: -# script: "./sklearn-lr-binary.py" -# conf: "./breast_lr_sklearn_config.yaml" -# FATE-hetero-lr: -# script: "./pipeline-lr-binary.py" -# conf: "./breast_config.yaml" -# compare_setting: -# relative_tol: 0.01 -#hetero_lr-binary-1-default-credit: -# local: -# script: "./sklearn-lr-binary.py" -# conf: "./default_credit_lr_sklearn_config.yaml" -# FATE-hetero-lr: -# script: "./pipeline-lr-binary.py" -# conf: "./default_credit_config.yaml" -# compare_setting: -# relative_tol: 0.01 +hetero_lr-binary-0-breast: + local: + script: "./sklearn-lr-binary.py" + conf: "./breast_lr_sklearn_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./breast_config.yaml" + compare_setting: + relative_tol: 0.01 +hetero_lr-binary-1-default-credit: + local: + script: "./sklearn-lr-binary.py" + conf: "./default_credit_lr_sklearn_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./default_credit_config.yaml" + compare_setting: + relative_tol: 0.01 hetero_lr-binary-2-epsilon-5k: local: script: "./sklearn-lr-binary.py" @@ -206,21 +206,21 @@ hetero_lr-binary-2-epsilon-5k: conf: "./epsilon_5k_config.yaml" compare_setting: relative_tol: 0.01 -#hetero_lr-binary-3-give-credit: -# local: -# script: "./sklearn-lr-binary.py" -# conf: "./give_credit_lr_sklearn_config.yaml" -# FATE-hetero-lr: -# script: "./pipeline-lr-binary.py" -# conf: "./give_credit_config.yaml" -# compare_setting: -# relative_tol: 0.01 -#multi-vehicle: -# local: -# script: "./sklearn-lr-multi.py" -# conf: "./vehicle_lr_sklearn_config.yaml" -# FATE-hetero-lr: -# script: "./pipeline-lr-multi.py" -# conf: "./vehicle_config.yaml" -# compare_setting: -# relative_tol: 0.01 +hetero_lr-binary-3-give-credit: + local: + script: "./sklearn-lr-binary.py" + 
conf: "./give_credit_lr_sklearn_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./give_credit_config.yaml" + compare_setting: + relative_tol: 0.01 +multi-vehicle: + local: + script: "./sklearn-lr-multi.py" + conf: "./vehicle_lr_sklearn_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-multi.py" + conf: "./vehicle_config.yaml" + compare_setting: + relative_tol: 0.01 diff --git a/examples/benchmark_quality/lr/pipeline-lr-binary.py b/examples/benchmark_quality/lr/pipeline-lr-binary.py index c10dd7fcb6..9b41bbe612 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-binary.py +++ b/examples/benchmark_quality/lr/pipeline-lr-binary.py @@ -17,7 +17,7 @@ import argparse from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import CoordinatedLR, PSI from fate_client.pipeline.components.fate import Evaluation from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils @@ -45,11 +45,11 @@ def main(config="../../config.yaml", param="./breast_config.yaml", namespace="") host_train_data = {"name": host_data_table, "namespace": f"experiment{namespace}"} pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) - intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], - namespace=guest_train_data["namespace"])) - intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], - namespace=host_train_data["namespace"])) + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], + namespace=guest_train_data["namespace"])) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], + namespace=host_train_data["namespace"])) 
lr_param = { } @@ -65,10 +65,10 @@ def main(config="../../config.yaml", param="./breast_config.yaml", namespace="") } lr_param.update(config_param) lr_0 = CoordinatedLR("lr_0", - train_data=intersect_0.outputs["output_data"], + train_data=psi_0.outputs["output_data"], **lr_param) lr_1 = CoordinatedLR("lr_1", - test_data=intersect_0.outputs["output_data"], + test_data=psi_0.outputs["output_data"], input_model=lr_0.outputs["output_model"]) evaluation_0 = Evaluation("evaluation_0", @@ -77,7 +77,7 @@ def main(config="../../config.yaml", param="./breast_config.yaml", namespace="") metrics=["auc", "binary_precision", "binary_accuracy", "binary_recall"], input_data=lr_0.outputs["train_output_data"]) - pipeline.add_task(intersect_0) + pipeline.add_task(psi_0) pipeline.add_task(lr_0) pipeline.add_task(lr_1) pipeline.add_task(evaluation_0) diff --git a/examples/benchmark_quality/lr/pipeline-lr-multi.py b/examples/benchmark_quality/lr/pipeline-lr-multi.py index 3868acbd60..a598403238 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-multi.py +++ b/examples/benchmark_quality/lr/pipeline-lr-multi.py @@ -17,7 +17,7 @@ import argparse from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import CoordinatedLR, PSI from fate_client.pipeline.components.fate import Evaluation from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils @@ -44,11 +44,11 @@ def main(config="../../config.yaml", param="./vehicle_config.yaml", namespace="" host_train_data = {"name": host_data_table, "namespace": f"experiment{namespace}"} pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) - intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], - namespace=guest_train_data["namespace"])) - 
intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], - namespace=host_train_data["namespace"])) + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], + namespace=guest_train_data["namespace"])) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], + namespace=host_train_data["namespace"])) lr_param = { } @@ -64,15 +64,15 @@ def main(config="../../config.yaml", param="./vehicle_config.yaml", namespace="" } lr_param.update(config_param) lr_0 = CoordinatedLR("lr_0", - train_data=intersect_0.outputs["output_data"], + train_data=psi_0.outputs["output_data"], **config_param) lr_1 = CoordinatedLR("lr_1", - test_data=intersect_0.outputs["output_data"], + test_data=psi_0.outputs["output_data"], input_model=lr_0.outputs["output_model"]) evaluation_0 = Evaluation('evaluation_0', metrics=['multi_recall', 'multi_accuracy', 'multi_precision']) - pipeline.add_task(intersect_0) + pipeline.add_task(psi_0) pipeline.add_task(lr_0) pipeline.add_task(lr_1) pipeline.add_task(evaluation_0) diff --git a/examples/pipeline/coordinated_lr/test_lr_sid.py b/examples/pipeline/coordinated_lr/test_lr_sid.py index 9b2323fb05..b13c24f8db 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid.py +++ b/examples/pipeline/coordinated_lr/test_lr_sid.py @@ -16,7 +16,7 @@ import argparse from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import CoordinatedLR, PSI from fate_client.pipeline.components.fate import Evaluation from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils @@ -36,19 +36,19 @@ def main(config="./config.yaml", namespace=""): if config.timeout: pipeline.conf.set("timeout", config.timeout) - intersect_0 = Intersection("intersect_0", method="raw") - 
intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace=f"experiment{namespace}")) - intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) lr_0 = CoordinatedLR("lr_0", - epochs=4, + epochs=10, batch_size=None, - optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, - init_param={"fit_intercept": True, "method": "zeros"}, - train_data=intersect_0.outputs["output_data"], - learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 1.0, - "total_iters": 100}}) + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.21}}, + init_param={"fit_intercept": True, "method": "random_uniform"}, + train_data=psi_0.outputs["output_data"], + learning_rate_scheduler={"method": "linear", "scheduler_params": {"start_factor": 0.7, + "total_iters": 100}}) evaluation_0 = Evaluation("evaluation_0", label_column_name="y", @@ -56,22 +56,23 @@ def main(config="./config.yaml", namespace=""): default_eval_setting="binary", input_data=lr_0.outputs["train_output_data"]) - pipeline.add_task(intersect_0) + pipeline.add_task(psi_0) pipeline.add_task(lr_0) + pipeline.add_task(evaluation_0) pipeline.compile() print(pipeline.get_dag()) pipeline.fit() - pipeline.deploy([intersect_0, lr_0]) + pipeline.deploy([psi_0, lr_0]) predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - deployed_pipeline.intersect_0.guest.component_setting( + deployed_pipeline.psi_0.guest.component_setting( input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - 
deployed_pipeline.intersect_0.hosts[0].component_setting( + deployed_pipeline.psi_0.hosts[0].component_setting( input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) diff --git a/examples/pipeline/coordinated_lr/test_lr_sid_cv.py b/examples/pipeline/coordinated_lr/test_lr_sid_cv.py index 5e5a3f40bc..8caffd245b 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid_cv.py +++ b/examples/pipeline/coordinated_lr/test_lr_sid_cv.py @@ -16,7 +16,7 @@ import argparse from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import CoordinatedLR, PSI from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils @@ -34,20 +34,20 @@ def main(config="./config.yaml", namespace=""): if config.timeout: pipeline.conf.set("timeout", config.timeout) - intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace=f"experiment{namespace}")) - intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) lr_0 = CoordinatedLR("lr_0", epochs=2, batch_size=100, optimizer={"method": "sgd", "optimizer_params": {"lr": 0.01}}, init_param={"fit_intercept": True}, - cv_data=intersect_0.outputs["output_data"], + cv_data=psi_0.outputs["output_data"], cv_param={"n_splits": 3}) - pipeline.add_task(intersect_0) + pipeline.add_task(psi_0) pipeline.add_task(lr_0) pipeline.compile() print(pipeline.get_dag()) diff --git 
a/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py b/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py index 0c33c952d6..25ba007959 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py +++ b/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py @@ -16,7 +16,7 @@ import argparse from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import CoordinatedLR, PSI from fate_client.pipeline.components.fate import Evaluation from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils @@ -35,20 +35,20 @@ def main(config="./config.yaml", namespace=""): if config.timeout: pipeline.conf.set("timeout", config.timeout) - intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace=f"experiment{namespace}")) - intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) lr_0 = CoordinatedLR("lr_0", epochs=4, batch_size=None, optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, init_param={"fit_intercept": True, "method": "zeros"}, - train_data=intersect_0.outputs["output_data"], + train_data=psi_0.outputs["output_data"], learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 1.0, "total_iters": 100}}) - lr_1 = CoordinatedLR("lr_1", train_data=intersect_0.outputs["output_data"], + lr_1 = CoordinatedLR("lr_1", train_data=psi_0.outputs["output_data"], 
warm_start_model=lr_0.outputs["output_model"], epochs=2, batch_size=None, @@ -59,7 +59,7 @@ def main(config="./config.yaml", namespace=""): batch_size=None, optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, init_param={"fit_intercept": True, "method": "zeros"}, - train_data=intersect_0.outputs["output_data"], + train_data=psi_0.outputs["output_data"], learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 1.0, "total_iters": 100}}) @@ -69,7 +69,7 @@ def main(config="./config.yaml", namespace=""): default_eval_setting="binary", input_data=[lr_1.outputs["train_output_data"], lr_2.outputs["train_output_data"]]) - pipeline.add_task(intersect_0) + pipeline.add_task(psi_0) pipeline.add_task(lr_0) pipeline.add_task(lr_1) pipeline.add_task(lr_2) diff --git a/python/fate/ml/glm/hetero/coordinated_linr/host.py b/python/fate/ml/glm/hetero/coordinated_linr/host.py index 1473094c9b..a33a7019cf 100644 --- a/python/fate/ml/glm/hetero/coordinated_linr/host.py +++ b/python/fate/ml/glm/hetero/coordinated_linr/host.py @@ -15,6 +15,7 @@ import logging import torch + from fate.arch import Context from fate.arch.dataframe import DataLoader from fate.ml.abc.module import HeteroModule @@ -121,15 +122,15 @@ def __init__(self, epochs=None, batch_size=None, optimizer=None, learning_rate_s def asynchronous_compute_gradient(self, batch_ctx, encryptor, w, X): h = X.shape[0] Xw_h = torch.matmul(X, w.detach()) - batch_ctx.guest.put("Xw_h", encryptor.encrypt(Xw_h)) + batch_ctx.guest.put("Xw_h", encryptor.encrypt_tensor(Xw_h)) half_g = torch.matmul(X.T, Xw_h) guest_half_d = batch_ctx.guest.get("half_d") guest_half_g = torch.matmul(X.T, guest_half_d) - batch_ctx.guest.put("Xw2_h", encryptor.encrypt(torch.matmul(Xw_h.T, Xw_h))) + batch_ctx.guest.put("Xw2_h", encryptor.encrypt_tensor(torch.matmul(Xw_h.T, Xw_h))) loss_norm = self.optimizer.loss_norm(w) if loss_norm is not None: - batch_ctx.guest.put("h_loss", encryptor.encrypt(loss_norm)) + 
batch_ctx.guest.put("h_loss", encryptor.encrypt_tensor(loss_norm)) else: batch_ctx.guest.put(h_loss=loss_norm) @@ -139,12 +140,12 @@ def asynchronous_compute_gradient(self, batch_ctx, encryptor, w, X): def centralized_compute_gradient(self, batch_ctx, encryptor, w, X): h = X.shape[0] Xw_h = torch.matmul(X, w.detach()) - batch_ctx.guest.put("Xw_h", encryptor.encrypt(Xw_h)) - batch_ctx.guest.put("Xw2_h", encryptor.encrypt(torch.matmul(Xw_h.T, Xw_h))) + batch_ctx.guest.put("Xw_h", encryptor.encrypt_tensor(Xw_h)) + batch_ctx.guest.put("Xw2_h", encryptor.encrypt_tensor(torch.matmul(Xw_h.T, Xw_h))) loss_norm = self.optimizer.loss_norm(w) if loss_norm is not None: - batch_ctx.guest.put("h_loss", encryptor.encrypt(loss_norm)) + batch_ctx.guest.put("h_loss", encryptor.encrypt_tensor(loss_norm)) else: batch_ctx.guest.put(h_loss=loss_norm) diff --git a/python/fate/ml/glm/hetero/coordinated_lr/guest.py b/python/fate/ml/glm/hetero/coordinated_lr/guest.py index ea4d6cb662..1969bba4d2 100644 --- a/python/fate/ml/glm/hetero/coordinated_lr/guest.py +++ b/python/fate/ml/glm/hetero/coordinated_lr/guest.py @@ -16,6 +16,7 @@ import logging import torch + from fate.arch import Context, dataframe from fate.ml.abc.module import HeteroModule from fate.ml.utils import predict_tools @@ -247,7 +248,7 @@ def asynchronous_compute_gradient(self, batch_ctx, encryptor, w, X, Y, weight): half_d = 0.25 * Xw - 0.5 * Y if weight: half_d = half_d * weight - batch_ctx.hosts.put("half_d", encryptor.encrypt(half_d)) + batch_ctx.hosts.put("half_d", encryptor.encrypt_tensor(half_d)) half_g = torch.matmul(X.T, half_d) Xw_h = batch_ctx.hosts.get("Xw_h")[0] diff --git a/python/fate/ml/glm/hetero/coordinated_lr/host.py b/python/fate/ml/glm/hetero/coordinated_lr/host.py index 5395f2ab36..d1957e69ac 100644 --- a/python/fate/ml/glm/hetero/coordinated_lr/host.py +++ b/python/fate/ml/glm/hetero/coordinated_lr/host.py @@ -15,6 +15,7 @@ import logging import torch + from fate.arch import Context from 
fate.arch.dataframe import DataLoader from fate.ml.abc.module import HeteroModule @@ -205,17 +206,17 @@ def __init__(self, epochs=None, batch_size=None, optimizer=None, learning_rate_s def asynchronous_compute_gradient(self, batch_ctx, encryptor, w, X): h = X.shape[0] Xw_h = 0.25 * torch.matmul(X, w.detach()) - batch_ctx.guest.put("Xw_h", encryptor.encrypt(Xw_h)) + batch_ctx.guest.put("Xw_h", encryptor.encrypt_tensor(Xw_h)) half_g = torch.matmul(X.T, Xw_h) guest_half_d = batch_ctx.guest.get("half_d") guest_half_g = torch.matmul(X.T, guest_half_d) - batch_ctx.guest.put("Xw2_h", encryptor.encrypt(torch.matmul(Xw_h.T, Xw_h))) + batch_ctx.guest.put("Xw2_h", encryptor.encrypt_tensor(torch.matmul(Xw_h.T, Xw_h))) loss_norm = self.optimizer.loss_norm(w) if loss_norm is not None: - batch_ctx.guest.put("h_loss", encryptor.encrypt(loss_norm)) + batch_ctx.guest.put("h_loss", encryptor.encrypt_tensor(loss_norm)) else: batch_ctx.guest.put("h_loss", loss_norm) @@ -225,12 +226,12 @@ def asynchronous_compute_gradient(self, batch_ctx, encryptor, w, X): def centralized_compute_gradient(self, batch_ctx, encryptor, w, X): h = X.shape[0] Xw_h = 0.25 * torch.matmul(X, w.detach()) - batch_ctx.guest.put("Xw_h", encryptor.encrypt(Xw_h)) - batch_ctx.guest.put("Xw2_h", encryptor.encrypt(torch.matmul(Xw_h.T, Xw_h))) + batch_ctx.guest.put("Xw_h", encryptor.encrypt_tensor(Xw_h)) + batch_ctx.guest.put("Xw2_h", encryptor.encrypt_tensor(torch.matmul(Xw_h.T, Xw_h))) loss_norm = self.optimizer.loss_norm(w) if loss_norm is not None: - batch_ctx.guest.put("h_loss", encryptor.encrypt(loss_norm)) + batch_ctx.guest.put("h_loss", encryptor.encrypt_tensor(loss_norm)) else: batch_ctx.guest.put(h_loss=loss_norm) From 9e2086939d25d8ea4b730798b363df186e104e83 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Thu, 10 Aug 2023 16:39:51 +0800 Subject: [PATCH 19/30] dataframe: add block_row_size to manager single block size Signed-off-by: mgqa34 --- python/fate/arch/dataframe/_frame_reader.py | 43 +++++++--- 
python/fate/arch/dataframe/conf/__init__.py | 0 .../arch/dataframe/conf/default_config.py | 17 ++++ .../arch/dataframe/manager/data_manager.py | 13 ++- python/fate/arch/dataframe/ops/_indexer.py | 79 ++++++++++++++----- 5 files changed, 119 insertions(+), 33 deletions(-) create mode 100644 python/fate/arch/dataframe/conf/__init__.py create mode 100644 python/fate/arch/dataframe/conf/default_config.py diff --git a/python/fate/arch/dataframe/_frame_reader.py b/python/fate/arch/dataframe/_frame_reader.py index 214c521773..5b6a3c9ecd 100644 --- a/python/fate/arch/dataframe/_frame_reader.py +++ b/python/fate/arch/dataframe/_frame_reader.py @@ -17,6 +17,7 @@ from typing import Union +from .conf.default_config import DATAFRAME_BLOCK_ROW_SIZE from .entity import types from ._dataframe import DataFrame from .manager import DataManager @@ -41,7 +42,8 @@ def __init__( na_values: Union[str, list, dict] = None, input_format: str = "dense", tag_with_value: bool = False, - tag_value_delimiter: str = ":" + tag_value_delimiter: str = ":", + block_row_size: int = None ): self._sample_id_name = sample_id_name self._match_id_name = match_id_name @@ -60,6 +62,7 @@ def __init__( self._input_format = input_format self._tag_with_value = tag_with_value self._tag_value_delimiter = tag_value_delimiter + self._block_row_size = block_row_size if block_row_size is not None else DATAFRAME_BLOCK_ROW_SIZE self.check_params() @@ -67,6 +70,9 @@ def check_params(self): if not self._sample_id_name: raise ValueError("Please provide sample_id_name") + if not isinstance(self._block_row_size, int) or self._block_row_size < 0: + raise ValueError("block_row_size should be positive integer") + def to_frame(self, ctx, table): if self._input_format != "dense": raise ValueError("Only support dense input format in this version.") @@ -74,7 +80,7 @@ def to_frame(self, ctx, table): return self._dense_format_to_frame(ctx, table) def _dense_format_to_frame(self, ctx, table): - data_manager = DataManager() + 
data_manager = DataManager(block_row_size=self._block_row_size) columns = self._header.split(self._delimiter, -1) columns.remove(self._sample_id_name) retrieval_index_dict = data_manager.init_from_local_file( @@ -84,7 +90,7 @@ def _dense_format_to_frame(self, ctx, table): dtype=self._dtype, default_type=types.DEFAULT_DATA_TYPE) from .ops._indexer import get_partition_order_by_raw_table - partition_order_mappings = get_partition_order_by_raw_table(table) + partition_order_mappings = get_partition_order_by_raw_table(table, data_manager.block_row_size) # partition_order_mappings = _get_partition_order(table) table = table.mapValues(lambda value: value.split(self._delimiter, -1)) to_block_func = functools.partial(_to_blocks, @@ -129,7 +135,8 @@ def __init__( weight_type: str = "float32", dtype: str = "float32", na_values: Union[None, str, list, dict] = None, - partition: int = 4 + partition: int = 4, + block_row_size: int = None ): self._sample_id_name = sample_id_name self._match_id_list = match_id_list @@ -142,6 +149,7 @@ def __init__( self._dtype = dtype self._na_values = na_values self._partition = partition + self._block_row_size = block_row_size if block_row_size is not None else DATAFRAME_BLOCK_ROW_SIZE def to_frame(self, ctx, path): # TODO: use table put data instead of read all data @@ -156,6 +164,7 @@ def to_frame(self, ctx, path): weight_name=self._weight_name, dtype=self._dtype, partition=self._partition, + block_row_size=self._block_row_size ).to_frame(ctx, df) @@ -194,6 +203,7 @@ def __init__( weight_type: str = "float32", dtype: str = "float32", partition: int = 4, + block_row_size: int = None, ): self._sample_id_name = sample_id_name self._match_id_list = match_id_list @@ -204,6 +214,7 @@ def __init__( self._weight_type = weight_type self._dtype = dtype self._partition = partition + self._block_row_size = block_row_size if block_row_size is not None else DATAFRAME_BLOCK_ROW_SIZE if self._sample_id_name and not self._match_id_name: raise ValueError(f"As 
sample_id {self._sample_id_name} is given, match_id should be given too") @@ -215,7 +226,7 @@ def to_frame(self, ctx, df: "pd.DataFrame"): else: df = df.set_index(self._sample_id_name) - data_manager = DataManager() + data_manager = DataManager(block_row_size=self._block_row_size) retrieval_index_dict = data_manager.init_from_local_file( sample_id_name=self._sample_id_name, columns=df.columns.tolist(), match_id_list=self._match_id_list, match_id_name=self._match_id_name, label_name=self._label_name, weight_name=self._weight_name, @@ -260,11 +271,11 @@ def _to_blocks(kvs, """ sample_id/match_id,label(maybe missing),weight(maybe missing),X """ - partition_id = None + block_id = None schema = data_manager.schema - splits = [[] for idx in range(data_manager.block_num)] + splits = [[] for _ in range(data_manager.block_num)] sample_id_block = data_manager.loc_block(schema.sample_id_name, with_offset=False) if schema.sample_id_name else None match_id_block = data_manager.loc_block(schema.match_id_name, with_offset=False)if schema.match_id_name else None @@ -287,9 +298,13 @@ def _to_blocks(kvs, column_blocks_mapping[bid].append(col_id) + block_row_size = data_manager.block_row_size + + lid = 0 for key, value in kvs: - if partition_id is None: - partition_id = partition_order_mappings[key]["block_id"] + if block_id is None: + block_id = partition_order_mappings[key]["start_block_id"] + lid += 1 # columns = value.split(",", -1) splits[sample_id_block].append(key) @@ -303,6 +318,12 @@ def _to_blocks(kvs, for bid, col_id_list in column_blocks_mapping.items(): splits[bid].append([value[col_id] for col_id in col_id_list]) - converted_blocks = data_manager.convert_to_blocks(splits) + if lid % block_row_size == 0: + converted_blocks = data_manager.convert_to_blocks(splits) + yield block_id, converted_blocks + block_id += 1 + splits = [[] for _ in range(data_manager.block_num)] - return [(partition_id, converted_blocks)] + if lid % block_row_size: + converted_blocks = 
data_manager.convert_to_blocks(splits) + yield block_id, converted_blocks diff --git a/python/fate/arch/dataframe/conf/__init__.py b/python/fate/arch/dataframe/conf/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/fate/arch/dataframe/conf/default_config.py b/python/fate/arch/dataframe/conf/default_config.py new file mode 100644 index 0000000000..41b439b1f0 --- /dev/null +++ b/python/fate/arch/dataframe/conf/default_config.py @@ -0,0 +1,17 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +DATAFRAME_BLOCK_ROW_SIZE = 2**6 + diff --git a/python/fate/arch/dataframe/manager/data_manager.py b/python/fate/arch/dataframe/manager/data_manager.py index 5d0b5c20ad..56fc5d427e 100644 --- a/python/fate/arch/dataframe/manager/data_manager.py +++ b/python/fate/arch/dataframe/manager/data_manager.py @@ -19,12 +19,19 @@ from .block_manager import BlockType from ..entity import types from typing import Union, List, Tuple +from ..conf.default_config import DATAFRAME_BLOCK_ROW_SIZE class DataManager(object): - def __init__(self, schema_manager: SchemaManager = None, block_manager: BlockManager = None): + def __init__( + self, + schema_manager: SchemaManager = None, + block_manager: BlockManager = None, + block_row_size: int = DATAFRAME_BLOCK_ROW_SIZE + ): self._schema_manager = schema_manager self._block_manager = block_manager + self._block_row_size = block_row_size @property def blocks(self): @@ -34,6 +41,10 @@ def blocks(self): def block_num(self): return len(self._block_manager.blocks) + @property + def block_row_size(self): + return self._block_row_size + @property def schema(self): return self._schema_manager.schema diff --git a/python/fate/arch/dataframe/ops/_indexer.py b/python/fate/arch/dataframe/ops/_indexer.py index a054007a66..e92f698be5 100644 --- a/python/fate/arch/dataframe/ops/_indexer.py +++ b/python/fate/arch/dataframe/ops/_indexer.py @@ -41,7 +41,6 @@ def _aggregate(kvs): return list(aggregate_ret.items()) agg_indexer = indexer.mapReducePartitions(_aggregate, lambda l1, l2: l1 + l2) - # agg_indexer = agg_indexer.mapValues(lambda v: sorted(v, key=lambda x: x[1])) return agg_indexer @@ -60,19 +59,38 @@ def _convert_to_order_index(kvs): use_previous_behavior=False) -def get_partition_order_mappings(block_table): - block_info = sorted(list(block_table.mapValues(lambda blocks: (blocks[0][0], len(blocks[0]))).collect())) +def get_partition_order_mappings_by_block_table(block_table, block_row_size): + def _block_counter(kvs): + partition_key = None + 
size = 0 + first_block_id = 0 + for k, v in kvs: + if partition_key is None: + partition_key = k + + size += len(v[0]) + + return first_block_id, (partition_key, size) + + block_info = sorted([summary[1] for summary in block_table.applyPartitions(_block_counter).collect()]) block_order_mappings = dict() start_index = 0 + acc_block_num = 0 for block_id, (block_key, block_size) in block_info: + block_num = (block_size + block_row_size - 1) // block_row_size block_order_mappings[block_key] = dict( - start_index=start_index, end_index=start_index + block_size - 1, block_id=block_id) + start_index=start_index, + end_index=start_index + block_size - 1, + start_block_id=acc_block_num, + end_block_id=acc_block_num + block_num - 1 + ) start_index += block_size + acc_block_num += block_num return block_order_mappings -def get_partition_order_by_raw_table(table): +def get_partition_order_by_raw_table(table, block_row_size): def _get_block_summary(kvs): try: key = next(kvs)[0] @@ -84,15 +102,19 @@ def _get_block_summary(kvs): block_summary = table.mapPartitions(_get_block_summary).reduce(lambda blk1, blk2: {**blk1, **blk2}) - start_index, block_id = 0, 0 + start_index, acc_block_num = 0, 0 block_order_mappings = dict() for blk_key, blk_size in block_summary.items(): + block_num = (blk_size + block_row_size - 1) // block_row_size block_order_mappings[blk_key] = dict( - start_index=start_index, end_index=start_index + blk_size - 1, block_id=block_id + start_index=start_index, + end_index=start_index + blk_size - 1, + start_block_id=acc_block_num, + end_block_id=acc_block_num + block_num - 1 ) start_index += blk_size - block_id += 1 + acc_block_num += block_num return block_order_mappings @@ -198,7 +220,7 @@ def _convert_to_block(kvs): block_table = block_table.mapValues(lambda values: [v[1] for v in values]) block_table = transform_list_block_to_frame_block(block_table, df.data_manager) - partition_order_mappings = get_partition_order_mappings(block_table) + 
partition_order_mappings = get_partition_order_mappings_by_block_table(block_table, df.data_manager.block_row_size) return DataFrame( df._ctx, block_table, @@ -249,18 +271,18 @@ def _retrieval_mapper(key, value): return retrieval_ret agg_indexer = indexer.mapReducePartitions(_agg_mapper, _agg_reducer) - raw_table = df.block_table.join(agg_indexer, lambda v1, v2: (v1, v2)).flatMap(_retrieval_mapper) - - partition_order_mappings = get_partition_order_by_raw_table(raw_table) + partition_order_mappings = get_partition_order_by_raw_table(raw_table, data_manager.block_row_size) def _convert_to_blocks(kvs): bid = None ret_blocks = [[] for _ in range(block_num)] - for offset, (sample_id, data) in enumerate(kvs): + lid = 0 + for sample_id, data in kvs: + lid += 1 if bid is None: - bid = partition_order_mappings[sample_id]["block_id"] + bid = partition_order_mappings[sample_id]["start_block_id"] if return_new_indexer: data = data[0] @@ -271,9 +293,15 @@ def _convert_to_blocks(kvs): else: ret_blocks[i].append(data[i]) - ret_blocks = [data_manager.blocks[i].convert_block(block) for i, block in enumerate(ret_blocks)] + if lid % data_manager.block_row_size == 0: + ret_blocks = [data_manager.blocks[i].convert_block(block) for i, block in enumerate(ret_blocks)] + yield bid, ret_blocks + bid += 1 + ret_blocks = [[] for _ in range(block_num)] - return [(bid, ret_blocks)] + if lid % data_manager.block_row_size: + ret_blocks = [data_manager.blocks[i].convert_block(block) for i, block in enumerate(ret_blocks)] + yield bid, ret_blocks block_table = raw_table.mapPartitions(_convert_to_blocks, use_previous_behavior=False) @@ -289,11 +317,17 @@ def _convert_to_blocks(kvs): else: def _mapper(kvs): bid = None - for offset, (sample_id, (_, k)) in enumerate(kvs): + offset = 0 + for sample_id, (_, k) in kvs: if bid is None: - bid = partition_order_mappings[sample_id]["block_id"] + bid = partition_order_mappings[sample_id]["start_block_id"] yield k, [(sample_id, bid, offset)] + offset += 1 + + 
if offset == data_manager.block_row_size: + bid += 1 + offset = 0 new_indexer = raw_table.mapReducePartitions(_mapper, lambda v1, v2: v1 + v2) @@ -304,7 +338,7 @@ def loc_with_sample_id_replacement(df: DataFrame, indexer): """ indexer: table, row: (key=random_key, - value=((src_partition_id, src_offset), [(sample_id, dst_partition_id, dst_offset) ...]) + value=((src_partition_id, src_offset), [(sample_id, dst_block_id, dst_offset) ...]) """ agg_indexer = aggregate_indexer(indexer) @@ -316,7 +350,6 @@ def _convert_to_block(kvs): """ block_indexer: row_id, [(sample_id, new_block_id, new_row_id)...] """ - for src_row_id, dst_indexer_list in block_indexer: for sample_id, dst_block_id, dst_row_id in dst_indexer_list: if dst_block_id not in ret_dict: @@ -343,7 +376,11 @@ def _convert_to_block(kvs): block_table = block_table.mapValues(lambda values: [v[1] for v in values]) block_table = transform_list_block_to_frame_block(block_table, df.data_manager) - partition_order_mappings = get_partition_order_mappings(block_table) + partition_order_mappings = get_partition_order_mappings_by_block_table( + block_table, + df.data_manager.block_row_size + ) + return DataFrame( df._ctx, block_table, From c557b2f7c4e08827ae55ae23218e2a85a1027383 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 10 Aug 2023 20:00:13 +0800 Subject: [PATCH 20/30] fix LR ovr predict(#4659) fix fate-test data cli(#5008) edit bq examples & add pipeline testsuite(#5008) Signed-off-by: Yu Wu --- doc/api/fate_test.md | 916 ++++++++++++++++++ doc/tutorial/fate_test_tutorial.md | 91 ++ .../coordinated_lr/test_lr_sid.py | 18 +- .../lr/default_credit_config.yaml | 4 +- .../lr/epsilon_5k_config.yaml | 2 +- .../lr/give_credit_config.yaml | 10 +- .../benchmark_quality/lr/lr_benchmark.yaml | 36 +- .../benchmark_quality/lr/pipeline-lr-multi.py | 1 + .../benchmark_quality/lr/sklearn-lr-multi.py | 4 +- .../pipeline/{coordinated_lr => }/config.yaml | 0 .../coordinated_linr_testsuite.yaml | 60 ++ 
.../pipeline/coordinated_linr/test_linr.py | 87 ++ .../pipeline/coordinated_linr/test_linr_cv.py | 64 ++ .../coordinated_linr/test_linr_multi_host.py | 93 ++ .../coordinated_linr/test_linr_warm_start.py | 95 ++ .../coordinated_lr_testsuite.yaml | 62 +- .../{test_lr_sid.py => test_lr.py} | 8 +- .../{test_lr_sid_cv.py => test_lr_cv.py} | 8 +- .../coordinated_lr/test_lr_multi_class.py | 94 ++ .../test_lr_multi_host.py} | 44 +- .../coordinated_lr/test_lr_validate.py | 80 ++ ...id_warm_start.py => test_lr_warm_start.py} | 7 +- .../data_split/data_split_lr_testsuite.yaml | 40 + .../pipeline/data_split/test_data_split.py | 91 ++ .../data_split/test_data_split_stratified.py | 94 ++ .../binning_testsuite.yaml | 42 + .../test_feature_binning_asymmetric.py | 92 ++ .../test_feature_binning_bucket.py | 96 ++ .../test_feature_binning_quantile.py | 91 ++ .../selection_testsuite.yaml | 44 + .../test_feature_selection_binning.py | 88 ++ .../test_feature_selection_manual.py | 80 ++ .../test_feature_selection_multi_model.py | 94 ++ .../test_feature_selection_statistics.py | 83 ++ examples/pipeline/multi_model/test_multi.py | 129 +++ .../pipeline/sample/sample_testsuite.yaml | 40 + examples/pipeline/sample/test_sample.py | 79 ++ .../pipeline/sample/test_sample_unilateral.py | 80 ++ examples/pipeline/scale/scale_testsuite.yaml | 42 + examples/pipeline/scale/test_scale_min_max.py | 99 ++ .../pipeline/scale/test_scale_standard.py | 94 ++ examples/pipeline/scale/test_scale_w_lr.py | 103 ++ .../statistics/statistics_testsuite.yaml | 38 + .../pipeline/statistics/test_statistics.py | 61 ++ examples/pipeline/test_data_split.py | 68 -- .../pipeline/test_data_split_stratified.py | 69 -- examples/pipeline/test_linr_sid_cv.py | 38 - examples/pipeline/test_linr_sid_warm_start.py | 89 -- examples/pipeline/test_sample.py | 62 -- examples/pipeline/test_scale.py | 72 -- examples/pipeline/test_single_linr.py | 72 -- examples/pipeline/test_single_lr.py | 71 -- examples/pipeline/test_single_lr_multi.py | 
73 -- examples/pipeline/union/test_union.py | 81 ++ examples/pipeline/union/union_testsuite.yaml | 38 + .../ml/glm/hetero/coordinated_lr/guest.py | 2 +- .../fate_test/fate_test/scripts/data_cli.py | 46 +- 57 files changed, 3559 insertions(+), 706 deletions(-) create mode 100644 doc/api/fate_test.md create mode 100644 doc/tutorial/fate_test_tutorial.md rename examples/pipeline/{coordinated_lr => }/config.yaml (100%) create mode 100644 examples/pipeline/coordinated_linr/coordinated_linr_testsuite.yaml create mode 100644 examples/pipeline/coordinated_linr/test_linr.py create mode 100644 examples/pipeline/coordinated_linr/test_linr_cv.py create mode 100644 examples/pipeline/coordinated_linr/test_linr_multi_host.py create mode 100644 examples/pipeline/coordinated_linr/test_linr_warm_start.py rename examples/pipeline/coordinated_lr/{test_lr_sid.py => test_lr.py} (94%) rename examples/pipeline/coordinated_lr/{test_lr_sid_cv.py => test_lr_cv.py} (91%) create mode 100644 examples/pipeline/coordinated_lr/test_lr_multi_class.py rename examples/pipeline/{test_single_lr_multi_host.py => coordinated_lr/test_lr_multi_host.py} (74%) create mode 100644 examples/pipeline/coordinated_lr/test_lr_validate.py rename examples/pipeline/coordinated_lr/{test_lr_sid_warm_start.py => test_lr_warm_start.py} (95%) create mode 100644 examples/pipeline/data_split/data_split_lr_testsuite.yaml create mode 100644 examples/pipeline/data_split/test_data_split.py create mode 100644 examples/pipeline/data_split/test_data_split_stratified.py create mode 100644 examples/pipeline/hetero_feature_binning/binning_testsuite.yaml create mode 100644 examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py create mode 100644 examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py create mode 100644 examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py create mode 100644 examples/pipeline/hetero_feature_selection/selection_testsuite.yaml create mode 100644 
examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py create mode 100644 examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py create mode 100644 examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py create mode 100644 examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py create mode 100644 examples/pipeline/multi_model/test_multi.py create mode 100644 examples/pipeline/sample/sample_testsuite.yaml create mode 100644 examples/pipeline/sample/test_sample.py create mode 100644 examples/pipeline/sample/test_sample_unilateral.py create mode 100644 examples/pipeline/scale/scale_testsuite.yaml create mode 100644 examples/pipeline/scale/test_scale_min_max.py create mode 100644 examples/pipeline/scale/test_scale_standard.py create mode 100644 examples/pipeline/scale/test_scale_w_lr.py create mode 100644 examples/pipeline/statistics/statistics_testsuite.yaml create mode 100644 examples/pipeline/statistics/test_statistics.py delete mode 100644 examples/pipeline/test_data_split.py delete mode 100644 examples/pipeline/test_data_split_stratified.py delete mode 100644 examples/pipeline/test_linr_sid_cv.py delete mode 100644 examples/pipeline/test_linr_sid_warm_start.py delete mode 100644 examples/pipeline/test_sample.py delete mode 100644 examples/pipeline/test_scale.py delete mode 100644 examples/pipeline/test_single_linr.py delete mode 100644 examples/pipeline/test_single_lr.py delete mode 100644 examples/pipeline/test_single_lr_multi.py create mode 100644 examples/pipeline/union/test_union.py create mode 100644 examples/pipeline/union/union_testsuite.yaml diff --git a/doc/api/fate_test.md b/doc/api/fate_test.md new file mode 100644 index 0000000000..c6bd4c4af4 --- /dev/null +++ b/doc/api/fate_test.md @@ -0,0 +1,916 @@ +# FATE Test + +A collection of useful tools to running FATE's test. + +## Testsuite + +Testsuite is used for running a collection of jobs in sequence. 
Data +used for jobs could be uploaded before jobs are submitted and, +optionally, be cleaned after jobs finish. This tool is useful for FATE's +release test. + +### command options + +```bash +fate_test suite --help +``` + +1. include: + + ```bash + fate_test suite -i + ``` + + will run testsuites in + *path1* + +2. exclude: + + ```bash + fate_test suite -i -e -e ... + ``` + + will run testsuites in *path1* but not in *path2* and *path3* + +3. glob: + + ```bash + fate_test suite -i -g "hetero*" + ``` + + will run testsuites in sub directory start with *hetero* of + *path1* + +4. timeout: + + ```bash + fate_test suite -i -m 3600 + ``` + + will run testsuites in *path1* and timeout when job does not finish + within 3600s; if tasks need more time, use a larger threshold + +5. task-cores + + ```bash + fate_test suite -i -p 4 + ``` + + will run testsuites in *path1* with EGGROLL "task-cores" set to 4; + only effective for DSL conf + +6. skip-data: + + ```bash + fate_test suite -i --skip-data + ``` + + will run testsuites in *path1* without uploading data specified in + *testsuite.yaml*. + +7. data-only: + + ```bash + fate_test suite -i --data-only + ``` + + will only upload data specified in *testsuite.yaml* without running + jobs + +8. disable-clean-data: + + ```bash + fate_test suite -i --disable-clean-data + ``` + + will run testsuites in *path1* without removing data from storage + after tasks + finish + +9. enable-clean-data: + + ```bash + fate_test suite -i --enable-clean-data + ``` + + will remove data from storage after finishing running testsuites + +10. yes: + + ```bash + fate_test suite -i --yes + ``` + + will run testsuites in *path1* directly, skipping double check + +### testsuite configuration + +Configuration of jobs should be specified in a testsuite whose file name +ends with "\*testsuite.yaml". For testsuite examples, please refer [pipeline +examples](../../examples/pipeline). 
+ +A testsuite includes the following elements: + +- data: list of local data to be uploaded before running FATE jobs + + - file: path to original data file to be uploaded, should be + relative to testsuite or FATE installation path + - meta: information regarding parsing input data, including + - delimiter + - dtype, + - label\_type + - weight\_type + - input format + - match\_id\_name + - sample\_id\_name + - partitions: number of partition for data storage + - head: whether table includes header + - extend_sid: whether automatically extend sample id + - table\_name: table name in storage + - namespace: table namespace in storage + - role: which role to upload the data, as specified in + fate\_test.config; naming format is: + "{role\_type}\_{role\_index}", index starts at 0 + + ```yaml + data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + ``` + +- tasks: includes arbitrary number of pipeline jobs with + paths to corresponding python script + + - job: name of job to be run, must be unique within each group + list + + - script: path to pipeline script, should be relative to + testsuite + + ```yaml + tasks: + normal-lr: + script: test_lr_sid.py + ``` + +## Benchmark Quality + +Benchmark-quality is used for comparing modeling quality between FATE +and other machine learning systems. Benchmark produces a metrics +comparison summary for each benchmark job group. + +Benchmark can also compare metrics of different models from the same +script/PipeLine job. Please refer to the [script writing +guide](#testing-script(quality)) below for +instructions. 
+ +```bash +fate_test benchmark-quality -i examples/benchmark_quality/hetero_linear_regression +``` + +```bash +|----------------------------------------------------------------------| +| Data Summary | +|-------+--------------------------------------------------------------| +| Data | Information | +|-------+--------------------------------------------------------------| +| train | {'guest': 'motor_hetero_guest', 'host': 'motor_hetero_host'} | +| test | {'guest': 'motor_hetero_guest', 'host': 'motor_hetero_host'} | +|-------+--------------------------------------------------------------| + + +|-------------------------------------------------------------------------------------------------------------------------------------| +| Metrics Summary | +|-------------------------------------------+-------------------------+--------------------+---------------------+--------------------| +| Model Name | root_mean_squared_error | r2_score | mean_squared_error | explained_variance | +|-------------------------------------------+-------------------------+--------------------+---------------------+--------------------| +| local-hetero_linear_regression-regression | 0.312552080517407 | 0.9040310440206087 | 0.09768880303575968 | 0.9040312584426697 | +| FATE-hetero_linear_regression-regression | 0.3139977881119483 | 0.9031411831961411 | 0.09859461093919598 | 0.903146386539082 | +|-------------------------------------------+-------------------------+--------------------+---------------------+--------------------| +|-------------------------------------| +| Match Results | +|-------------------------+-----------| +| Metric | All Match | +| root_mean_squared_error | True | +| r2_score | True | +| mean_squared_error | True | +| explained_variance | True | +|-------------------------+-----------| + + +|-------------------------------------------------------------------------------------| +| FATE Script Metrics Summary | 
+|--------------------+---------------------+--------------------+---------------------| +| Script Model Name | min | max | mean | +|--------------------+---------------------+--------------------+---------------------| +| linr_train-FATE | -1.5305666678748353 | 1.4968292506353484 | 0.03948016870496807 | +| linr_validate-FATE | -1.5305666678748353 | 1.4968292506353484 | 0.03948016870496807 | +|--------------------+---------------------+--------------------+---------------------| +|---------------------------------------| +| FATE Script Metrics Match Results | +|----------------+----------------------| +| Metric | All Match | +|----------------+----------------------| +| min | True | +| max | True | +| mean | True | +|----------------+----------------------| +``` + +### command options + +use the following command to show help message + +```bash +fate_test benchmark-quality --help +``` + +1. include: + + ```bash + fate_test benchmark-quality -i + ``` + + will run benchmark testsuites in + *path1* + +2. exclude: + + ```bash + fate_test benchmark-quality -i -e -e ... + ``` + + will run benchmark testsuites in *path1* but not in *path2* and + *path3* + +3. glob: + + ```bash + fate_test benchmark-quality -i -g "hetero*" + ``` + + will run benchmark testsuites in sub directory start with *hetero* + of + *path1* + +4. tol: + + ```bash + fate_test benchmark-quality -i -t 1e-3 + ``` + + will run benchmark testsuites in *path1* with absolute tolerance of + difference between metrics set to 0.001. If absolute difference + between metrics is smaller than *tol*, then metrics are considered + almost equal. Check benchmark testsuite [writing + guide](#benchmark-testsuite) on setting alternative tolerance. + +5. skip-data: + + ```bash + fate_test benchmark-quality -i --skip-data + ``` + + will run benchmark testsuites in *path1* without uploading data + specified in + *benchmark.yaml*. + +6. 
data-only: + + ```bash + fate_test benchmark-quality -i --data-only + ``` + + will only upload data specified in *testsuite.yaml* without running + jobs + +7. disable-clean-data: + + ```bash + fate_test benchmark-quality -i --disable-clean-data + ``` + + will run benchmark testsuites in *path1* without removing data from + storage after tasks + finish + +8. enable-clean-data: + + ```bash + fate_test benchmark-quality -i --enable-clean-data + ``` + + will remove data from storage after finishing running benchmark + testsuites + +9. yes: + ```bash + fate_test benchmark-quality -i --yes + ``` + + will run benchmark testsuites in *path1* directly, skipping double + check + +### benchmark quality job configuration + +Configuration of jobs should be specified in a benchmark quality testsuite whose +file name ends with "\*benchmark.yaml". For benchmark testsuite example, +please refer [here](../../examples/benchmark_quality). + +A benchmark testsuite includes the following elements: + +- data: list of local data to be uploaded before running FATE jobs + + - file: path to original data file to be uploaded, should be + relative to testsuite or FATE installation path + - meta: information regarding parsing input data, including + - delimiter + - dtype, + - label\_type + - weight\_type + - input format + - match\_id\_name + - sample\_id\_name + - partitions: number of partition for data storage + - head: whether table includes header + - extend_sid: whether automatically extend sample id + - table\_name: table name in storage + - namespace: table namespace in storage + - role: which role to upload the data, as specified in + fate\_test.config; naming format is: + "{role\_type}\_{role\_index}", index starts at 0 + + ```yaml + data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + 
weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + ``` + +- job group: each group includes arbitrary number of jobs with paths + to corresponding script and configuration + + - job: name of job to be run, must be unique within each group + list + + - script: path to [testing script](#testing-script(quality)), should be + relative to testsuite + - conf: path to job configuration file for script, should be + relative to testsuite + + ```yaml + "local": { + "script": "./local-linr.py", + "conf": "./linr_config.yaml" + } + ``` + + - compare\_setting: additional setting for quality metrics + comparison, currently only takes `relative_tol` + + If metrics *a* and *b* satisfy *abs(a-b) \<= max(relative\_tol + \* max(abs(a), abs(b)), absolute\_tol)* (from [math + module](https://docs.python.org/3/library/math.html#math.isclose)), + they are considered almost equal. In the below example, metrics + from "local" and "FATE" jobs are considered almost equal if + their relative difference is smaller than *0.01 \* + max(abs(local\_metric), abs(pipeline\_metric))*. + + ```yaml + "linear_regression-regression": { + "local": { + "script": "./local-linr.py", + "conf": "./linr_config.yaml" + }, + "FATE": { + "script": "./fate-linr.py", + "conf": "./linr_config.yaml" + }, + "compare_setting": { + "relative_tol": 0.01 + } + } + ``` + +### testing script(quality) + +All job scripts need to have `Main` function as an entry point for +executing jobs; scripts should return two dictionaries: first with data +information key-value pairs: {data\_type}: {data\_name\_dictionary}; the +second contains {metric\_name}: {metric\_value} key-value pairs for +metric comparison. + +By default, the final data summary shows the output from the job named +"FATE"; if no such job exists, data information returned by the first +job is shown. 
For clear presentation, we suggest that user follow this +general [guideline](../../examples/data/README.md#data-set-naming-rule) +for data set naming. In the case of multi-host task, consider numbering +host as such: + + {'guest': 'default_credit_homo_guest', + 'host_1': 'default_credit_homo_host_1', + 'host_2': 'default_credit_homo_host_2'} + +Returned quality metrics of the same key are to be compared. Note that +only **real-value** metrics can be compared. + +To compare metrics of different models from the same script, metrics of +each model need to be wrapped into dictionary in the same format as the +general metric output above. + +In the returned dictionary of script, use reserved key `script_metrics` +to indicate the collection of metrics to be compared. + +- FATE script: `Main` should have three inputs: + - config: job configuration, + [JobConfig](../../python/fate_client/pipeline/utils/fate_utils.py) + object loaded from "fate\_test\_config.yaml" + - param: job parameter setting, dictionary loaded from "conf" file + specified in benchmark testsuite + - namespace: namespace suffix, user-given *namespace* or generated + timestamp string when using *namespace-mangling* +- non-FATE script: `Main` should have one or two inputs: + - param: job parameter setting, dictionary loaded from "conf" file + specified in benchmark testsuite + - (optional) config: job configuration, + [JobConfig](../../python/fate_client/pipeline/utils/fate_utils.py) + object loaded from "fate\_test\_config.yaml" + +Note that `Main` in FATE & non-FATE scripts can also be set to take zero +input argument. + +## Benchmark Performance + +`Performance` sub-command is used to test +efficiency of designated FATE jobs. + +Example tests may be found [here](../../examples/benchmark_performance). + +### command options + +```bash +fate_test performance --help +``` + +1. 
job-type: + + ```bash + fate_test performance -t intersect + ``` + + will run testsuites from intersect subdirectory (set in config) in + the default performance directory; note that only one of `task` and + `include` is + needed + +2. include: + + ```bash + fate_test performance -i + ``` + + will run testsuites in *path1*. Note that only one of `task` and + `include` needs to be specified; when both are given, path from + `include` takes + priority. + +3. timeout: + + ```bash + fate_test performance -i -m 3600 + ``` + + will run testsuites in *path1* and timeout when job does not finish + within 3600s; if tasks need more time, use a larger threshold + +4. epochs: + + ```bash + fate_test performance -i -e 5 + ``` + + will run testsuites in *path1* with all values to key "max\_iter" + set to 5 + +5. max-depth + + ```bash + fate_test performance -i -d 4 + ``` + + will run testsuites in *path1* with all values to key "max\_depth" + set to 4 + +6. num-trees + + ```bash + fate_test performance -i -nt 5 + ``` + + will run testsuites in *path1* with all values to key "num\_trees" + set to 5 + +7. task-cores + + ```bash + fate_test performance -i -p 4 + ``` + + will run testsuites in *path1* with EGGROLL "task\_cores" set to 4 + +8. storage-tag + + ```bash + fate_test performance -i -s test + ``` + + will run testsuites in *path1* with performance time stored under + provided tag for future comparison; note that FATE-Test always + records the most recent run for each tag; if the same tag is used + more than once, only performance from the latest job is + kept + +9. history-tag + + ```bash + fate_test performance -i -v test1 -v test2 + ``` + + will run performance testsuites in *path1* with performance time compared to + history jobs under provided + tag(s) + +10. 
skip-data: + + ```bash + fate_test performance -i --skip-data + ``` + + will run performance testsuites in *path1* without uploading data specified in + *performance.yaml*. + +11. data-only: + + ```bash + fate_test performance -i --data-only + ``` + + will only upload data specified in *performance.yaml* without running + jobs + +12. disable-clean-data: + + ```bash + fate_test performance -i --disable-clean-data + ``` + + will run testsuites in *path1* without removing data from storage + after tasks finish + +13. yes: + + ```bash + fate_test performance -i --yes + ``` + + will run testsuites in *path1* directly, skipping double check + +Configuration of jobs should be specified in a benchmark performance testsuite whose +file name ends with "\*performance.yaml". For benchmark testsuite example, +please refer [here](../../examples/benchmark_performance). + +A benchmark testsuite includes the following elements: + +- data: list of local data to be uploaded before running FATE jobs + + - file: path to original data file to be uploaded, should be + relative to testsuite or FATE installation path + - meta: information regarding parsing input data, including + - delimiter + - dtype, + - label\_type + - weight\_type + - input format + - match\_id\_name + - sample\_id\_name + - partitions: number of partition for data storage + - head: whether table includes header + - extend_sid: whether automatically extend sample id + - table\_name: table name in storage + - namespace: table namespace in storage + - role: which role to upload the data, as specified in + fate\_test.config; naming format is: + "{role\_type}\_{role\_index}", index starts at 0 + + ```yaml + data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + 
table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + ``` +- tasks: includes arbitrary number of pipeline jobs with + paths to corresponding python script + + - job: name of job to be run, must be unique within each group + list + + - script: path to [testing script](#testing-script(performance))), should be + relative to testsuite + - conf: path to job configuration file for script, should be + relative to testsuite + + ```yaml + "local": { + "script": "./local-linr.py", + "conf": "./linr_config.yaml" + } + ``` + +### testing script(performance) + +All job scripts need to have `Main` function as an entry point for +executing jobs; scripts should obtain and return job id of pipeline job as follows: + +```python +from fate_client.pipeline import FateFlowPipeline + +pipeline = FateFlowPipeline() +... +pipeline.compile() +pipeline.fit() +job_id = pipeline.model_info.job_id +print(job_id) +``` + +Returned job id will be used to query job status and time usage details for each component in job. + +- FATE script: `Main` should have three inputs: + - config: job configuration, + [JobConfig](../../python/fate_client/pipeline/utils/fate_utils.py) + object loaded from "fate\_test\_config.yaml" + - param: job parameter setting, dictionary loaded from "conf" file + specified in benchmark performance testsuite + - namespace: namespace suffix, user-given *namespace* or generated + timestamp string when using *namespace-mangling* + +Note that `Main` in FATE scripts can also be set to take zero +input argument. + +## data + +`Data` sub-command is used for upload, +delete, and generate dataset. + +### data command options + +```bash +fate_test data --help +``` + +1. include: + + ```bash + fate_test data [upload|delete] -i + ``` + + will upload/delete dataset in testsuites in + *path1* + +2. exclude: + + ```bash + fate_test data [upload|delete] -i -e -e ... + ``` + + will upload/delete dataset in testsuites in *path1* but not in + *path2* and + *path3* + +3. 
glob: + + ```bash + fate_test data [upload|delete] -i -g "hetero*" + ``` + + will upload/delete dataset in testsuites in sub directory start with + *hetero* of + *path1* + +4. upload example data: + + ```bash + fate_test data upload -t [min_test|all_examples] + ``` + + will upload dataset for min_test or all examples of fate. Once command is executed successfully, + you are expected to see the following feedback which shows the table information for you: + + ```bash + [2020-06-12 14:19:39]uploading @examples/data/breast_hetero_guest.csv >> experiment.breast_hetero_guest + [2020-06-12 14:19:39]upload done @examples/data/breast_hetero_guest.csv >> experiment.breast_hetero_guest, job_id=2020061214193960279930 + [2020-06-12 14:19:42]2020061214193960279930 success, elapse: 0:00:02 + [2020-06-12 14:19:42] check_data_out {'data': {'count': 569, 'namespace': 'experiment', 'partition': 16, 'table_name': 'breast_hetero_guest'}, 'retcode': 0, 'retmsg': 'success'} + ``` + + Note: uploading configurations are [min_test_config](../../examples/data/upload_config/min_test_data_testsuite.yaml) + and [all_examples](../../examples/data/upload_config/all_examples_data_testsuite.yaml), + user can add more data by modifying them or check out the example data's name and namespace. + +5. download mnist data: + + ```bash + fate_test data download -t mnist -o ${mnist_data_dir} + ``` + + -t: if not specified, default is "mnist" + -o: directory of download data, default is "examples/data" + +### generate command options + +```bash +fate_test data generate --help +``` + +1. include: + + ```bash + fate_test data generate -i + ``` + + will generate dataset in testsuites in *path1*; note that only one + of `type` and `include` is + needed + +2. host-data-type: + + ```bash + fate_test data generate -i -ht {tag-value | dense | tag } + ``` + + will generate dataset in testsuites *path1* where host data are of + selected + format + +3. 
sparsity: + + ```bash + fate_test data generate -i -s 0.2 + ``` + + will generate dataset in testsuites in *path1* with sparsity at 0.2; + useful for tag-formatted + data + +4. encryption-type: + + ```bash + fate_test data generate -i -p {sha256 | md5} + ``` + + will generate dataset in testsuites in *path1* with hash id using + SHA256 + method + +5. match-rate: + + ```bash + fate_test data generate -i -m 1.0 + ``` + + will generate dataset in testsuites in *path1* where generated host + and guest data have intersection rate of + 1.0 + +6. guest-data-size: + + ```bash + fate_test data generate -i -ng 10000 + ``` + + will generate dataset in testsuites *path1* where guest data each + have 10000 + entries + +7. host-data-size: + + ```bash + fate_test data generate -i -nh 10000 + ``` + + will generate dataset in testsuites *path1* where host data have + 10000 + entries + +8. guest-feature-num: + + ```bash + fate_test data generate -i -fg 20 + ``` + + will generate dataset in testsuites *path1* where guest data have 20 + features + +9. host-feature-num: + + ```bash + fate_test data generate -i -fh 200 + ``` + + will generate dataset in testsuites *path1* where host data have 200 + features + +10. output-path: + + ```bash + fate_test data generate -i -o + ``` + + will generate dataset in testsuites *path1* and write file to + *path2* + +11. force: + + ```bash + fate_test data generate -i -o --force + ``` + + will generate dataset in testsuites *path1* and write file to + *path2*; will overwrite existing file(s) if designated file name + found under + *path2* + +12. split-host: + + ```bash + fate_test data generate -i -nh 10000 --split-host + ``` + + will generate dataset in testsuites *path1*; 10000 entries will be + divided equally among all host data + sets + +13. upload-data + + ```bash + fate_test data generate -i --upload-data + ``` + + will generate dataset in testsuites *path1* and upload generated + data for all parties to + FATE + +14. 
remove-data + + ```bash + fate_test data generate -i --upload-data --remove-data + ``` + + (effective with `upload-data` set to True) will delete generated + data after generate and upload dataset in testsuites + *path1* diff --git a/doc/tutorial/fate_test_tutorial.md b/doc/tutorial/fate_test_tutorial.md new file mode 100644 index 0000000000..29a08bbc71 --- /dev/null +++ b/doc/tutorial/fate_test_tutorial.md @@ -0,0 +1,91 @@ +# FATE Test Tutorial + +A collection of useful tools for running FATE tests and [:file_folder:examples](../../examples). + +## quick start + +1. install + + ```bash + pip install -e python/fate_test + ``` +2. edit default fate\_test\_config.yaml + + ```bash + # edit priority config file with system default editor + # filling some field according to comments + fate_test config edit + ``` + +3. configure FATE-Flow Commandline server setting + + ```bash + # configure FATE-Flow Commandline server setting + flow init --port 9380 --ip 127.0.0.1 + ``` + +4. run some fate\_test suite + + ```bash + fate_test suite -i + ``` + +5. run some fate\_test benchmark quality + + ```bash + fate_test benchmark-quality -i + ``` + +6. 
run some fate\_test benchmark performance + + ```bash + fate_test performance -i + ``` + +7. Useful logs or exceptions will be saved to logs dir with namespace +shown in last step + +## command types + +- [suite](../api/fate_test.md#testsuite): used for running [testsuites](../api/fate_test.md#testsuite-configuration), + collection of FATE jobs + + ```bash + fate_test suite -i + ``` + +- [data](../api/fate_test.md#data): used for upload, delete, and generate dataset + + - [upload/delete data](../api/fate_test.md#data-command-options) command: + + ```bash + fate_test data [upload|delete] -i + ``` + - [upload example data of min_test/all_examples](../api/fate_test.md#data-command-options) command: + + ```bash + fate_test data upload -t min_test + fate_test data upload -t all_examples + ``` + + - [generate data](../api/fate_test.md#generate-command-options) command: + + ```bash + fate_test data generate -i + ``` + +- [benchmark-quality](../api/fate_test.md#benchmark-quality): used for comparing modeling quality between FATE + and other machine learning systems, as specified + in [benchmark job configuration](../api/fate_test.md#benchmark-job-configuration) + + ```bash + fate_test bq -i + ``` + +- [benchmark-performance](../api/fate_test.md#benchmark-performance): used for checking FATE algorithm performance; user + should first generate and upload data before running performance testsuite + + ```bash + fate_test data generate -i -ng 10000 -fg 10 -fh 10 -m 1.0 --upload-data + fate_test performance -i --skip-data + ``` \ No newline at end of file diff --git a/examples/benchmark_performance/coordinated_lr/test_lr_sid.py b/examples/benchmark_performance/coordinated_lr/test_lr_sid.py index fc3f69209a..dd447362d8 100644 --- a/examples/benchmark_performance/coordinated_lr/test_lr_sid.py +++ b/examples/benchmark_performance/coordinated_lr/test_lr_sid.py @@ -17,7 +17,7 @@ import argparse from fate_client.pipeline import FateFlowPipeline -from
fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import CoordinatedLR, PSI from fate_client.pipeline.components.fate import Evaluation from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils @@ -48,11 +48,11 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): if config.timeout: pipeline.conf.set("timeout", config.timeout) - intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], - namespace=guest_train_data["namespace"])) - intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], - namespace=host_train_data["namespace"])) + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name=guest_train_data["name"], + namespace=guest_train_data["namespace"])) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name=host_train_data["name"], + namespace=host_train_data["namespace"])) lr_param = { } @@ -68,10 +68,10 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): } lr_param.update(config_param) lr_0 = CoordinatedLR("lr_0", - train_data=intersect_0.outputs["output_data"], + train_data=psi_0.outputs["output_data"], **lr_param) lr_1 = CoordinatedLR("lr_1", - test_data=intersect_0.outputs["output_data"], + test_data=psi_0.outputs["output_data"], input_model=lr_0.outputs["output_model"]) evaluation_0 = Evaluation("evaluation_0", @@ -80,7 +80,7 @@ def main(config="../../config.yaml", param="./lr_config.yaml", namespace=""): metrics=["auc", "binary_precision", "binary_accuracy", "binary_recall"], input_data=lr_0.outputs["train_output_data"]) - pipeline.add_task(intersect_0) + pipeline.add_task(psi_0) pipeline.add_task(lr_0) pipeline.add_task(lr_1) pipeline.add_task(evaluation_0) diff --git 
a/examples/benchmark_quality/lr/default_credit_config.yaml b/examples/benchmark_quality/lr/default_credit_config.yaml index 8033d8af0d..b547c333b9 100644 --- a/examples/benchmark_quality/lr/default_credit_config.yaml +++ b/examples/benchmark_quality/lr/default_credit_config.yaml @@ -2,7 +2,7 @@ data_guest: "default_credit_hetero_guest" data_host: "default_credit_hetero_host" idx: "id" label_name: "y" -epochs: 22 +epochs: 30 init_param: fit_intercept: True method: "zeros" @@ -17,6 +17,6 @@ optimizer: penalty: "L2" alpha: 0.001 optimizer_params: - lr: 0.15 + lr: 0.21 batch_size: 3200 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/epsilon_5k_config.yaml b/examples/benchmark_quality/lr/epsilon_5k_config.yaml index 39144f4fdb..034d61378c 100644 --- a/examples/benchmark_quality/lr/epsilon_5k_config.yaml +++ b/examples/benchmark_quality/lr/epsilon_5k_config.yaml @@ -3,7 +3,7 @@ data_host: "epsilon_5k_hetero_host" idx: "id" label_name: "y" epochs: 8 -batch_size: 2500 +batch_size: 2200 init_param: fit_intercept: True method: "random" diff --git a/examples/benchmark_quality/lr/give_credit_config.yaml b/examples/benchmark_quality/lr/give_credit_config.yaml index dc041b48fe..480077d4ec 100644 --- a/examples/benchmark_quality/lr/give_credit_config.yaml +++ b/examples/benchmark_quality/lr/give_credit_config.yaml @@ -2,20 +2,20 @@ data_guest: "give_credit_hetero_guest" data_host: "give_credit_hetero_host" idx: "id" label_name: "y" -epochs: 6 +epochs: 12 init_param: fit_intercept: True method: "zeros" learning_rate_scheduler: method: "linear" scheduler_params: - factor: 0.7 + start_factor: 0.71 total_iters: 1000 optimizer: - method: "adam" + method: "rmsprop" penalty: "L2" - alpha: 10 + alpha: 0.01 optimizer_params: - lr: 0.2 + lr: 0.29 batch_size: 5500 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/lr_benchmark.yaml b/examples/benchmark_quality/lr/lr_benchmark.yaml index d7852909a2..63cb2603bd 
100644 --- a/examples/benchmark_quality/lr/lr_benchmark.yaml +++ b/examples/benchmark_quality/lr/lr_benchmark.yaml @@ -179,15 +179,15 @@ data: table_name: vehicle_scale_hetero_host namespace: experiment role: host_0 -hetero_lr-binary-0-breast: - local: - script: "./sklearn-lr-binary.py" - conf: "./breast_lr_sklearn_config.yaml" - FATE-hetero-lr: - script: "./pipeline-lr-binary.py" - conf: "./breast_config.yaml" - compare_setting: - relative_tol: 0.01 +#hetero_lr-binary-0-breast: +# local: +# script: "./sklearn-lr-binary.py" +# conf: "./breast_lr_sklearn_config.yaml" +# FATE-hetero-lr: +# script: "./pipeline-lr-binary.py" +# conf: "./breast_config.yaml" +# compare_setting: +# relative_tol: 0.01 hetero_lr-binary-1-default-credit: local: script: "./sklearn-lr-binary.py" @@ -197,15 +197,15 @@ hetero_lr-binary-1-default-credit: conf: "./default_credit_config.yaml" compare_setting: relative_tol: 0.01 -hetero_lr-binary-2-epsilon-5k: - local: - script: "./sklearn-lr-binary.py" - conf: "./epsilon_5k_lr_sklearn_config.yaml" - FATE-hetero-lr: - script: "./pipeline-lr-binary.py" - conf: "./epsilon_5k_config.yaml" - compare_setting: - relative_tol: 0.01 +#hetero_lr-binary-2-epsilon-5k: +# local: +# script: "./sklearn-lr-binary.py" +# conf: "./epsilon_5k_lr_sklearn_config.yaml" +# FATE-hetero-lr: +# script: "./pipeline-lr-binary.py" +# conf: "./epsilon_5k_config.yaml" +# compare_setting: +# relative_tol: 0.01 hetero_lr-binary-3-give-credit: local: script: "./sklearn-lr-binary.py" diff --git a/examples/benchmark_quality/lr/pipeline-lr-multi.py b/examples/benchmark_quality/lr/pipeline-lr-multi.py index a598403238..ed3851e510 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-multi.py +++ b/examples/benchmark_quality/lr/pipeline-lr-multi.py @@ -71,6 +71,7 @@ def main(config="../../config.yaml", param="./vehicle_config.yaml", namespace="" input_model=lr_0.outputs["output_model"]) evaluation_0 = Evaluation('evaluation_0', + input_data=lr_0.outputs["train_output_data"], 
metrics=['multi_recall', 'multi_accuracy', 'multi_precision']) pipeline.add_task(psi_0) pipeline.add_task(lr_0) diff --git a/examples/benchmark_quality/lr/sklearn-lr-multi.py b/examples/benchmark_quality/lr/sklearn-lr-multi.py index ae931db9fb..b56fc80dce 100644 --- a/examples/benchmark_quality/lr/sklearn-lr-multi.py +++ b/examples/benchmark_quality/lr/sklearn-lr-multi.py @@ -42,10 +42,10 @@ def main(config="../../config.yaml", param="./vehicle_lr_sklearn_config.yaml"): config_param = { "penalty": param["penalty"], - "max_iter": param["max_iter"], + "max_iter": param["epochs"], "alpha": param["alpha"], "learning_rate": "optimal", - "eta0": param["learning_rate"], + "eta0": param["eta0"], "random_state": 105 } diff --git a/examples/pipeline/coordinated_lr/config.yaml b/examples/pipeline/config.yaml similarity index 100% rename from examples/pipeline/coordinated_lr/config.yaml rename to examples/pipeline/config.yaml diff --git a/examples/pipeline/coordinated_linr/coordinated_linr_testsuite.yaml b/examples/pipeline/coordinated_linr/coordinated_linr_testsuite.yaml new file mode 100644 index 0000000000..e3e319adb6 --- /dev/null +++ b/examples/pipeline/coordinated_linr/coordinated_linr_testsuite.yaml @@ -0,0 +1,60 @@ +data: + - file: examples/data/motor_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: motor_speed + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: motor_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/motor_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: motor_hetero_host + namespace: experiment + role: host_0 + - file: 
examples/data/motor_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: motor_hetero_host + namespace: experiment + role: host_1 +tasks: + normal-linr: + script: test_linr.py + linr-cv: + script: test_linr_cv.py + linr-warm-start: + script: test_linr_warm_start.py + linr-multi-host: + script: test_linr_multi_host.py diff --git a/examples/pipeline/coordinated_linr/test_linr.py b/examples/pipeline/coordinated_linr/test_linr.py new file mode 100644 index 0000000000..dbc47daeae --- /dev/null +++ b/examples/pipeline/coordinated_linr/test_linr.py @@ -0,0 +1,87 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLinR, PSI, Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="motor_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="motor_hetero_host", + namespace=f"experiment{namespace}")) + linr_0 = CoordinatedLinR("linr_0", + epochs=10, + batch_size=100, + optimizer={"method": "rmsprop", "optimizer_params": {"lr": 0.01}}, + init_param={"fit_intercept": True}, + train_data=psi_0.outputs["output_data"]) + evaluation_0 = Evaluation("evaluation_0", + label_column_name="motor_speed", + runtime_roles=["guest"], + default_eval_setting="regression", + input_data=linr_0.outputs["train_output_data"]) + + pipeline.add_task(psi_0) + pipeline.add_task(linr_0) + pipeline.add_task(evaluation_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + pipeline.deploy([psi_0, linr_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + deployed_pipeline.psi_0.guest.component_setting( + input_data=DataWarehouseChannel(name="motor_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting( + input_data=DataWarehouseChannel(name="motor_hetero_host", + 
namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_linr/test_linr_cv.py b/examples/pipeline/coordinated_linr/test_linr_cv.py new file mode 100644 index 0000000000..ed33e0556a --- /dev/null +++ b/examples/pipeline/coordinated_linr/test_linr_cv.py @@ -0,0 +1,64 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLinR, PSI +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="motor_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="motor_hetero_host", + namespace=f"experiment{namespace}")) + linr_0 = CoordinatedLinR("linr_0", + epochs=10, + batch_size=None, + optimizer={"method": "sgd", "optimizer_params": {"lr": 0.01}}, + init_param={"fit_intercept": True}, + cv_data=psi_0.outputs["output_data"], + cv_param={"n_splits": 3}) + + pipeline.add_task(psi_0) + pipeline.add_task(linr_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_linr/test_linr_multi_host.py b/examples/pipeline/coordinated_linr/test_linr_multi_host.py new file mode 100644 index 0000000000..cbf374b4e5 --- /dev/null +++ b/examples/pipeline/coordinated_linr/test_linr_multi_host.py 
@@ -0,0 +1,93 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, PSI +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host + arbiter = parties.arbiter[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="motor_hetero_guest", + namespace=f"{namespace}experiment")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="motor_hetero_host", + namespace=f"{namespace}experiment")) + psi_0.hosts[1].component_setting(input_data=DataWarehouseChannel(name="motor_hetero_host", + namespace=f"{namespace}experiment")) + lr_0 = CoordinatedLR("lr_0", + epochs=5, + batch_size=None, + early_stop="weight_diff", + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.1}}, + init_param={"fit_intercept": True, "method": "random_uniform"}, + train_data=psi_0.outputs["output_data"], + learning_rate_scheduler={"method": 
"constant", "scheduler_params": {"factor": 1.0, + "total_iters": 100}}) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="motor_speed", + runtime_roles=["guest"], + default_eval_setting="regression", + input_data=lr_0.outputs["train_output_data"]) + + pipeline.add_task(psi_0) + pipeline.add_task(lr_0) + pipeline.add_task(evaluation_0) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + pipeline.deploy([psi_0, lr_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + deployed_pipeline.psi_0.guest.component_setting( + input_data=DataWarehouseChannel(name="motor_hetero_guest", + namespace=f"{namespace}experiment")) + deployed_pipeline.psi_0.hosts[[0, 1]].component_setting( + input_data=DataWarehouseChannel(name="motor_hetero_host", + namespace=f"{namespace}experiment")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + # print(f"predict lr_0 data: {pipeline.get_task_info('lr_0').get_output_data()}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_linr/test_linr_warm_start.py b/examples/pipeline/coordinated_linr/test_linr_warm_start.py new file mode 100644 index 0000000000..30f887254c --- /dev/null +++ b/examples/pipeline/coordinated_linr/test_linr_warm_start.py @@ -0,0 +1,95 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLinR, PSI +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="motor_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="motor_hetero_host", + namespace=f"experiment{namespace}")) + linr_0 = CoordinatedLinR("linr_0", + epochs=4, + batch_size=None, + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, + init_param={"fit_intercept": True, "method": "zeros"}, + train_data=psi_0.outputs["output_data"], + learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 1.0, + "total_iters": 100}}) + linr_1 = CoordinatedLinR("linr_1", train_data=psi_0.outputs["output_data"], + warm_start_model=linr_0.outputs["output_model"], + epochs=2, + batch_size=None, + 
optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, + ) + + linr_2 = CoordinatedLinR("linr_2", epochs=6, + batch_size=None, + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, + init_param={"fit_intercept": True, "method": "zeros"}, + train_data=psi_0.outputs["output_data"], + learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 1.0, + "total_iters": 100}}) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="motor_speed", + runtime_roles=["guest"], + default_eval_setting="regression", + input_data=[linr_1.outputs["train_output_data"], linr_2.outputs["train_output_data"]]) + + pipeline.add_task(psi_0) + pipeline.add_task(linr_0) + pipeline.add_task(linr_1) + pipeline.add_task(linr_2) + pipeline.add_task(evaluation_0) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + print(f"linr_1 model: {pipeline.get_task_info('linr_1').get_output_model()}") + # print(f"train linr_1 data: {pipeline.get_task_info('linr_1').get_output_data()}") + + print(f"linr_2 model: {pipeline.get_task_info('linr_2').get_output_model()}") + # print(f"train linr_2 data: {pipeline.get_task_info('linr_2').get_output_data()}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml b/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml index 029d8c6dfc..70de986820 100644 --- a/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml +++ b/examples/pipeline/coordinated_lr/coordinated_lr_testsuite.yaml @@ -33,10 +33,66 @@ data: table_name: breast_hetero_host namespace: experiment role: host_0 + - file: 
examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_1 + - file: "../../data/vehicle_scale_hetero_guest.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: "id" + match_id_range: 0 + label_type: int64 + label_name: y + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + head: true + partitions: 4 + extend_sid: false + table_name: vehicle_scale_hetero_guest + namespace: experiment + role: guest_0 + - file: "../../data/vehicle_scale_hetero_host.csv" + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: "id" + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + head: true + partitions: 4 + extend_sid: false + table_name: vehicle_scale_hetero_host + namespace: experiment + role: host_0 tasks: normal-lr: - script: test_lr_sid.py + script: test_lr.py lr-cv: - script: test_lr_sid_cv.py + script: test_lr_cv.py + lr-validate: + script: test_lr_validate.py lr-warm-start: - script: test_lr_sid_warm_start.py + script: test_lr_warm_start.py + lr-multi-class: + script: test_lr_multi_class.py + lr-multi-host: + script: test_lr_multi_host.py diff --git a/examples/pipeline/coordinated_lr/test_lr_sid.py b/examples/pipeline/coordinated_lr/test_lr.py similarity index 94% rename from examples/pipeline/coordinated_lr/test_lr_sid.py rename to examples/pipeline/coordinated_lr/test_lr.py index b13c24f8db..88b73ad325 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid.py +++ b/examples/pipeline/coordinated_lr/test_lr.py @@ -22,7 +22,7 @@ from fate_client.pipeline.utils import test_utils -def main(config="./config.yaml", namespace=""): +def main(config="../config.yaml", namespace=""): if
isinstance(config, str): config = test_utils.load_job_config(config) parties = config.parties @@ -43,7 +43,7 @@ def main(config="./config.yaml", namespace=""): namespace=f"experiment{namespace}")) lr_0 = CoordinatedLR("lr_0", epochs=10, - batch_size=None, + batch_size=300, optimizer={"method": "SGD", "optimizer_params": {"lr": 0.21}}, init_param={"fit_intercept": True, "method": "random_uniform"}, train_data=psi_0.outputs["output_data"], @@ -86,9 +86,9 @@ def main(config="./config.yaml", namespace=""): if __name__ == "__main__": parser = argparse.ArgumentParser("PIPELINE DEMO") - parser.add_argument("-config", type=str, default="./config.yaml", + parser.add_argument("--config", type=str, default="../config.yaml", help="config file") - parser.add_argument("-namespace", type=str, default="", + parser.add_argument("--namespace", type=str, default="", help="namespace for data stored in FATE") args = parser.parse_args() main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_lr/test_lr_sid_cv.py b/examples/pipeline/coordinated_lr/test_lr_cv.py similarity index 91% rename from examples/pipeline/coordinated_lr/test_lr_sid_cv.py rename to examples/pipeline/coordinated_lr/test_lr_cv.py index 8caffd245b..b981f005e1 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid_cv.py +++ b/examples/pipeline/coordinated_lr/test_lr_cv.py @@ -21,7 +21,7 @@ from fate_client.pipeline.utils import test_utils -def main(config="./config.yaml", namespace=""): +def main(config="../config.yaml", namespace=""): if isinstance(config, str): config = test_utils.load_job_config(config) parties = config.parties @@ -41,7 +41,7 @@ def main(config="./config.yaml", namespace=""): namespace=f"experiment{namespace}")) lr_0 = CoordinatedLR("lr_0", epochs=2, - batch_size=100, + batch_size=None, optimizer={"method": "sgd", "optimizer_params": {"lr": 0.01}}, init_param={"fit_intercept": True}, cv_data=psi_0.outputs["output_data"], @@ -56,9 +56,9 @@ def 
main(config="./config.yaml", namespace=""): if __name__ == "__main__": parser = argparse.ArgumentParser("PIPELINE DEMO") - parser.add_argument("-config", type=str, default="./config.yaml", + parser.add_argument("--config", type=str, default="../config.yaml", help="config file") - parser.add_argument("-namespace", type=str, default="", + parser.add_argument("--namespace", type=str, default="", help="namespace for data stored in FATE") args = parser.parse_args() main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_lr/test_lr_multi_class.py b/examples/pipeline/coordinated_lr/test_lr_multi_class.py new file mode 100644 index 0000000000..9ce85fe3d9 --- /dev/null +++ b/examples/pipeline/coordinated_lr/test_lr_multi_class.py @@ -0,0 +1,94 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, PSI +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="vehicle_scale_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="vehicle_scale_hetero_host", + namespace=f"experiment{namespace}")) + lr_0 = CoordinatedLR("lr_0", + epochs=10, + batch_size=None, + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.21}}, + init_param={"fit_intercept": True, "method": "random_uniform"}, + train_data=psi_0.outputs["output_data"], + learning_rate_scheduler={"method": "linear", "scheduler_params": {"start_factor": 0.7, + "total_iters": 100}}) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="y", + runtime_roles=["guest"], + default_eval_setting="multi", + input_data=lr_0.outputs["train_output_data"]) + + pipeline.add_task(psi_0) + pipeline.add_task(lr_0) + pipeline.add_task(evaluation_0) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + pipeline.deploy([psi_0, lr_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + deployed_pipeline.psi_0.guest.component_setting( + 
input_data=DataWarehouseChannel(name="vehicle_scale_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting( + input_data=DataWarehouseChannel(name="vehicle_scale_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + # print(f"predict lr_0 data: {pipeline.get_task_info('lr_0').get_output_data()}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/test_single_lr_multi_host.py b/examples/pipeline/coordinated_lr/test_lr_multi_host.py similarity index 74% rename from examples/pipeline/test_single_lr_multi_host.py rename to examples/pipeline/coordinated_lr/test_lr_multi_host.py index cd332ad64e..a94ff8afcf 100644 --- a/examples/pipeline/test_single_lr_multi_host.py +++ b/examples/pipeline/coordinated_lr/test_lr_multi_host.py @@ -16,13 +16,13 @@ import argparse from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, Intersection +from fate_client.pipeline.components.fate import CoordinatedLR, PSI from fate_client.pipeline.components.fate import Evaluation from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils -def main(config="./config.yaml", namespace=""): +def main(config="../config.yaml", namespace=""): if isinstance(config, str): config = test_utils.load_job_config(config) parties = config.parties @@ -32,20 +32,20 @@ def main(config="./config.yaml", namespace=""): pipeline = 
FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) - intersect_0 = Intersection("intersect_0", method="raw") - intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace=f"{namespace}experiment_sid")) - intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"{namespace}experiment_sid")) - intersect_0.hosts[1].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"{namespace}experiment_sid")) + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"{namespace}experiment")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"{namespace}experiment")) + psi_0.hosts[1].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"{namespace}experiment")) lr_0 = CoordinatedLR("lr_0", - epochs=4, + epochs=5, batch_size=None, early_stop="weight_diff", - optimizer={"method": "SGD", "optimizer_params": {"lr": 0.01}}, - init_param={"fit_intercept": True, "method": "zeros"}, - train_data=intersect_0.outputs["output_data"], + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.1}}, + init_param={"fit_intercept": True, "method": "random_uniform"}, + train_data=psi_0.outputs["output_data"], learning_rate_scheduler={"method": "constant", "scheduler_params": {"factor": 1.0, "total_iters": 100}}) @@ -55,7 +55,7 @@ def main(config="./config.yaml", namespace=""): default_eval_setting="binary", input_data=lr_0.outputs["train_output_data"]) - pipeline.add_task(intersect_0) + pipeline.add_task(psi_0) pipeline.add_task(lr_0) pipeline.add_task(evaluation_0) @@ -63,17 +63,17 @@ def main(config="./config.yaml", namespace=""): print(pipeline.get_dag()) pipeline.fit() - pipeline.deploy([intersect_0, lr_0]) + pipeline.deploy([psi_0, lr_0]) predict_pipeline = FateFlowPipeline() 
deployed_pipeline = pipeline.get_deployed_pipeline() - deployed_pipeline.intersect_0.guest.component_setting( + deployed_pipeline.psi_0.guest.component_setting( input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace=f"{namespace}experiment_sid")) - deployed_pipeline.intersect_0.hosts[[0, 1]].component_setting( - input_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace=f"{namespace}experiment_sid")) + namespace=f"{namespace}experiment")) + deployed_pipeline.psi_0.hosts[[0, 1]].component_setting( + input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"{namespace}experiment")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() @@ -85,9 +85,9 @@ def main(config="./config.yaml", namespace=""): if __name__ == "__main__": parser = argparse.ArgumentParser("PIPELINE DEMO") - parser.add_argument("-config", type=str, default="./config.yaml", + parser.add_argument("--config", type=str, default="../config.yaml", help="config file") - parser.add_argument("-namespace", type=str, default="", + parser.add_argument("--namespace", type=str, default="", help="namespace for data stored in FATE") args = parser.parse_args() main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_lr/test_lr_validate.py b/examples/pipeline/coordinated_lr/test_lr_validate.py new file mode 100644 index 0000000000..19c44e3903 --- /dev/null +++ b/examples/pipeline/coordinated_lr/test_lr_validate.py @@ -0,0 +1,80 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, PSI, DataSplit +from fate_client.pipeline.components.fate import Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + data_split_0 = DataSplit("data_split_0", + train_size=0.8, + validate_size=0.2, + input_data=psi_0.outputs["output_data"]) + lr_0 = CoordinatedLR("lr_0", + epochs=10, + batch_size=300, + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.21}}, + init_param={"fit_intercept": True, "method": "random_uniform"}, + train_data=data_split_0.outputs["train_output_data"], + validate_data=data_split_0.outputs["validate_output_data"], + learning_rate_scheduler={"method": "linear", 
"scheduler_params": {"start_factor": 0.7, + "total_iters": 100}}) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="y", + runtime_roles=["guest"], + default_eval_setting="binary", + input_data=lr_0.outputs["train_output_data"]) + + pipeline.add_task(psi_0) + pipeline.add_task(lr_0) + pipeline.add_task(evaluation_0) + + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py b/examples/pipeline/coordinated_lr/test_lr_warm_start.py similarity index 95% rename from examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py rename to examples/pipeline/coordinated_lr/test_lr_warm_start.py index 25ba007959..5e554e837f 100644 --- a/examples/pipeline/coordinated_lr/test_lr_sid_warm_start.py +++ b/examples/pipeline/coordinated_lr/test_lr_warm_start.py @@ -22,7 +22,7 @@ from fate_client.pipeline.utils import test_utils -def main(config="./config.yaml", namespace=""): +def main(config="../config.yaml", namespace=""): if isinstance(config, str): config = test_utils.load_job_config(config) parties = config.parties @@ -73,6 +73,7 @@ def main(config="./config.yaml", namespace=""): pipeline.add_task(lr_0) pipeline.add_task(lr_1) pipeline.add_task(lr_2) + pipeline.add_task(evaluation_0) pipeline.compile() print(pipeline.get_dag()) @@ -86,9 +87,9 @@ def main(config="./config.yaml", namespace=""): if __name__ == "__main__": parser = argparse.ArgumentParser("PIPELINE DEMO") - parser.add_argument("-config", type=str, default="./config.yaml", + parser.add_argument("--config", type=str, default="../config.yaml", help="config file") - 
parser.add_argument("-namespace", type=str, default="", + parser.add_argument("--namespace", type=str, default="", help="namespace for data stored in FATE") args = parser.parse_args() main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/data_split/data_split_lr_testsuite.yaml b/examples/pipeline/data_split/data_split_lr_testsuite.yaml new file mode 100644 index 0000000000..468a41f510 --- /dev/null +++ b/examples/pipeline/data_split/data_split_lr_testsuite.yaml @@ -0,0 +1,40 @@ +data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_0 +tasks: + data-split: + script: test_data_split.py + data-split-stratified: + script: test_data_split_stratified.py diff --git a/examples/pipeline/data_split/test_data_split.py b/examples/pipeline/data_split/test_data_split.py new file mode 100644 index 0000000000..484d26fa93 --- /dev/null +++ b/examples/pipeline/data_split/test_data_split.py @@ -0,0 +1,91 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import DataSplit, PSI +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + data_split_0 = DataSplit("data_split_0", + train_size=0.6, + validate_size=0.1, + test_size=None, + input_data=psi_0.outputs["output_data"]) + + data_split_1 = DataSplit("data_split_1", + train_size=200, + test_size=50, + input_data=psi_0.outputs["output_data"] + ) + + pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + 
pipeline.add_task(data_split_0) + pipeline.add_task(data_split_1) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + # print(pipeline.get_task_info("data_split_0").get_output_data()) + """output_data = pipeline.get_task_info("data_split_0").get_output_data() + import pandas as pd + + print(f"data split 0 train size: {pd.DataFrame(output_data['train_output_data']).shape};" + f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" + f"test size: {pd.DataFrame(output_data['test_output_data']).shape}") + output_data = pipeline.get_task_info("data_split_1").get_output_data() + print(f"data split 1train size: {pd.DataFrame(output_data['train_output_data']).shape};" + f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" + f"test size: {pd.DataFrame(output_data['test_output_data']).shape}")""" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/data_split/test_data_split_stratified.py b/examples/pipeline/data_split/test_data_split_stratified.py new file mode 100644 index 0000000000..647d42ad63 --- /dev/null +++ b/examples/pipeline/data_split/test_data_split_stratified.py @@ -0,0 +1,94 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import DataSplit, PSI +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + + data_split_0 = DataSplit("data_split_0", + train_size=0.6, + validate_size=0.0, + test_size=0.4, + stratified=True, + input_data=psi_0.outputs["output_data"]) + + data_split_1 = DataSplit("data_split_1", + train_size=200, + test_size=50, + stratified=True, + input_data=psi_0.outputs["output_data"] + ) + + 
pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + pipeline.add_task(data_split_0) + pipeline.add_task(data_split_1) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + # print(pipeline.get_task_info("data_split_0").get_output_data()) + """output_data = pipeline.get_task_info("data_split_0").get_output_data() + import pandas as pd + + print(f"data split 0 train size: {pd.DataFrame(output_data['train_output_data']).shape};" + f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" + f"test size: {pd.DataFrame(output_data['test_output_data']).shape}") + output_data = pipeline.get_task_info("data_split_1").get_output_data() + print(f"data split 1train size: {pd.DataFrame(output_data['train_output_data']).shape};" + f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" + f"test size: {pd.DataFrame(output_data['test_output_data']).shape}")""" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/hetero_feature_binning/binning_testsuite.yaml b/examples/pipeline/hetero_feature_binning/binning_testsuite.yaml new file mode 100644 index 0000000000..2e9d95043e --- /dev/null +++ b/examples/pipeline/hetero_feature_binning/binning_testsuite.yaml @@ -0,0 +1,42 @@ +data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + 
role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_0 +tasks: + binning-bucket: + script: test_feature_binning_bucket.py + binning-quantile: + script: test_feature_binning_quantile.py + binning-asymmetric: + script: test_feature_binning_asymmetric.py diff --git a/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py b/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py new file mode 100644 index 0000000000..9b353527de --- /dev/null +++ b/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py @@ -0,0 +1,92 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import PSI, HeteroFeatureBinning +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + binning_0 = HeteroFeatureBinning("binning_0", + method="quantile", + n_bins=10, + train_data=psi_0.outputs["output_data"], + local_only=True + ) + binning_0.guest.component_setting(bin_col=["x0"], transform_method="bin_idx") + + binning_1 = HeteroFeatureBinning("binning_1", + transform_method="bin_idx", + method="quantile", + train_data=binning_0.outputs["train_output_data"]) + binning_1.guest.component_setting(category_col=["x0"], transform_method="woe") + + pipeline.add_task(psi_0) + pipeline.add_task(binning_0) + pipeline.add_task(binning_1) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + print(pipeline.get_task_info("binning_1").get_output_model()) + # print(pipeline.get_task_info("feature_scale_1").get_output_model()) + + pipeline.deploy([psi_0, binning_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + 
psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py b/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py new file mode 100644 index 0000000000..f40c443070 --- /dev/null +++ b/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py @@ -0,0 +1,96 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import PSI, HeteroFeatureBinning +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + binning_0 = HeteroFeatureBinning("binning_0", + method="bucket", + n_bins=10, + transform_method="bin_idx", + train_data=psi_0.outputs["output_data"] + ) + binning_1 = HeteroFeatureBinning("binning_1", + transform_method="bin_idx", + input_model=binning_0.outputs["output_model"], + test_data=psi_1.outputs["output_data"]) + + pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + pipeline.add_task(binning_0) + pipeline.add_task(binning_1) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + print(pipeline.get_task_info("binning_0").get_output_model()) + # print(pipeline.get_task_info("feature_scale_1").get_output_model()) + + pipeline.deploy([psi_0, 
binning_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py b/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py new file mode 100644 index 0000000000..8a0b9819a8 --- /dev/null +++ b/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py @@ -0,0 +1,91 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import PSI, HeteroFeatureBinning +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + binning_0 = HeteroFeatureBinning("binning_0", + method="quantile", + n_bins=10, + bin_col=["x0"], + transform_method="bin_idx", + train_data=psi_0.outputs["output_data"] + ) + binning_1 = HeteroFeatureBinning("binning_1", + transform_method="bin_idx", + method="quantile", + category_col=["x0"], + train_data=binning_0.outputs["train_output_data"]) + + pipeline.add_task(psi_0) + pipeline.add_task(binning_0) + pipeline.add_task(binning_1) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + print(pipeline.get_task_info("binning_1").get_output_model()) + # print(pipeline.get_task_info("feature_scale_1").get_output_model()) + + pipeline.deploy([psi_0, binning_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + 
psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/hetero_feature_selection/selection_testsuite.yaml b/examples/pipeline/hetero_feature_selection/selection_testsuite.yaml new file mode 100644 index 0000000000..050dc39a14 --- /dev/null +++ b/examples/pipeline/hetero_feature_selection/selection_testsuite.yaml @@ -0,0 +1,44 @@ +data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_0 +tasks: + selection-binning: + script: test_feature_selection_binning.py + selection-manual: + script: test_feature_selection_manual.py + binning-statistics: + script: test_feature_selection_statistics.py + binning-multi-model: + script: test_feature_selection_multi_model.py \ No newline 
at end of file diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py new file mode 100644 index 0000000000..95b06406a4 --- /dev/null +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py @@ -0,0 +1,88 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import PSI, HeteroFeatureSelection, HeteroFeatureBinning +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + binning_0 = 
HeteroFeatureBinning("binning_0", + method="quantile", + n_bins=10, + bin_col=["x0"], + transform_method="bin_idx", + train_data=psi_0.outputs["output_data"] + ) + selection_0 = HeteroFeatureSelection("selection_0", + method=["iv"], + train_data=psi_0.outputs["output_data"], + input_models=[binning_0.outputs["output_model"]], + iv_param={"metrics": "iv", "filter_type": "threshold", "threshold": 0.1}) + + pipeline.add_task(psi_0) + pipeline.add_task(binning_0) + pipeline.add_task(selection_0) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + # print(pipeline.get_task_info("feature_scale_1").get_output_model()) + + pipeline.deploy([psi_0, selection_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py new file mode 100644 index 0000000000..ab4a7729de --- /dev/null +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py @@ -0,0 +1,80 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import PSI, HeteroFeatureSelection +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + selection_0 = HeteroFeatureSelection("selection_0", + method=["statistics"], + train_data=psi_0.outputs["output_data"]) + selection_0.guest.component_setting(manual_param={"keep_col": ["x0", "x1"]}) + selection_0.hosts[0].component_setting(manual_param={"filter_out_col": ["x0", "x1"]}) + + pipeline.add_task(psi_0) + pipeline.add_task(selection_0) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit()
+ + # print(pipeline.get_task_info("feature_scale_1").get_output_model()) + + pipeline.deploy([psi_0, selection_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py new file mode 100644 index 0000000000..551c1d81e7 --- /dev/null +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py @@ -0,0 +1,94 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import PSI, HeteroFeatureSelection, HeteroFeatureBinning, Statistics +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + binning_0 = HeteroFeatureBinning("binning_0", + method="quantile", + n_bins=10, + bin_col=["x0"], + transform_method="bin_idx", + train_data=psi_0.outputs["output_data"] + ) + statistics_0 = Statistics("statistics_0", input_data=psi_0.outputs["output_data"]) + selection_0 = HeteroFeatureSelection("selection_0", + method=["iv", "statistics", "manual"], + train_data=psi_0.outputs["output_data"], + input_models=[binning_0.outputs["output_model"], + statistics_0.outputs["output_model"]], + iv_param={"metrics": "iv", "filter_type": "threshold", "threshold": 0.1}, + statistic_param={"metrics": ["max", "mean"], + "filter_type": "top_k", "threshold": 5}, + manual_param={"keep_col": ["x0", "x1"]} + ) + + pipeline.add_task(psi_0) + pipeline.add_task(binning_0) + pipeline.add_task(selection_0) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + # print(pipeline.get_task_info("feature_scale_1").get_output_model()) 
+ + pipeline.deploy([psi_0, selection_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py new file mode 100644 index 0000000000..c614a89e93 --- /dev/null +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py @@ -0,0 +1,83 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import PSI, HeteroFeatureSelection, Statistics +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + statistics_0 = Statistics("statistics_0", input_data=psi_0.outputs["output_data"]) + selection_0 = HeteroFeatureSelection("selection_0", + method=["statistics"], + train_data=psi_0.outputs["output_data"], + input_models=[statistics_0.outputs["output_model"]], + statistic_param={"metrics": ["max", "mean"], + "filter_type": "top_k", "threshold": 5}) + + pipeline.add_task(psi_0) + pipeline.add_task(statistics_0) + pipeline.add_task(selection_0) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + # print(pipeline.get_task_info("feature_scale_1").get_output_model()) + + pipeline.deploy([psi_0, selection_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + 
namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/multi_model/test_multi.py b/examples/pipeline/multi_model/test_multi.py new file mode 100644 index 0000000000..3ea0424ef7 --- /dev/null +++ b/examples/pipeline/multi_model/test_multi.py @@ -0,0 +1,129 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import PSI, HeteroFeatureSelection, HeteroFeatureBinning, \ + FeatureScale, Union, DataSplit, CoordinatedLR, CoordinatedLinR, Statistics, Sample, Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + data_split_0 = DataSplit("data_split_0", input_data=psi_0.outputs["output_data"], + train_size=0.8, test_size=0.2, random_state=42) + union_0 = Union("union_0", input_data_list=[data_split_0.outputs["train_output_data"], + data_split_0.outputs["test_output_data"]]) + sample_0 = Sample("sample_0", input_data=data_split_0.outputs["train_output_data"], + n=800, replace=True, hetero_sync=True) + + binning_0 = HeteroFeatureBinning("binning_0", + method="quantile", + n_bins=10, + train_data=sample_0.outputs["output_data"] + ) + statistics_0 = Statistics("statistics_0", + input_data=psi_0.outputs["output_data"]) + selection_0 = HeteroFeatureSelection("selection_0", + method=["iv", "statistics"], + train_data=sample_0.outputs["output_data"], + input_models=[binning_0.outputs["output_model"], + statistics_0.outputs["output_model"]], + 
iv_param={"metrics": "iv", "filter_type": "threshold", "value": 0.1}, + statistic_param={"metrics": ["max", "min"], "filter_type": "top_k", + "threshold": 5}) + + selection_1 = HeteroFeatureSelection("selection_1", + input_model=selection_0.outputs["train_output_model"], + test_data=data_split_0.outputs["test_output_data"]) + + scale_0 = FeatureScale("scale_0", method="min_max", + train_data=selection_0.outputs["train_output_data"], ) + + lr_0 = CoordinatedLR("lr_0", train_data=selection_0.outputs["train_output_data"], + validate_data=selection_1.outputs["test_output_data"], epochs=3) + linr_0 = CoordinatedLinR("linr_0", train_data=selection_0.outputs["train_output_data"], + validate_data=selection_1.outputs["test_output_data"], epochs=3) + + evaluation_0 = Evaluation("evaluation_0", input_data=lr_0.outputs["train_output_data"], + label_column_name="y", + runtime_roles=["guest"]) + evaluation_1 = Evaluation("evaluation_1", input_data=linr_0.outputs["train_output_data"], + default_eval_setting="regression", + label_column_name="y", + runtime_roles=["guest"]) + pipeline.add_task(psi_0) + pipeline.add_task(data_split_0) + pipeline.add_task(union_0) + pipeline.add_task(sample_0) + pipeline.add_task(binning_0) + pipeline.add_task(statistics_0) + pipeline.add_task(selection_0) + pipeline.add_task(scale_0) + pipeline.add_task(selection_1) + pipeline.add_task(lr_0) + pipeline.add_task(linr_0) + pipeline.add_task(evaluation_0) + pipeline.add_task(evaluation_1) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + # print(pipeline.get_task_info("feature_scale_1").get_output_model()) + + pipeline.deploy([psi_0, selection_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + 
psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/sample/sample_testsuite.yaml b/examples/pipeline/sample/sample_testsuite.yaml new file mode 100644 index 0000000000..3df4d44118 --- /dev/null +++ b/examples/pipeline/sample/sample_testsuite.yaml @@ -0,0 +1,40 @@ +data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_0 +tasks: + sample: + script: test_sample.py + sample-unilateral: + script: test_sample_unilateral.py diff --git a/examples/pipeline/sample/test_sample.py b/examples/pipeline/sample/test_sample.py new file mode 100644 index 0000000000..86fbf04a97 --- /dev/null +++ b/examples/pipeline/sample/test_sample.py @@ -0,0 +1,79 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import Sample, PSI +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + sample_0 = Sample("sample_0", + frac={0: 0.5}, + replace=False, + hetero_sync=True, + input_data=psi_0.outputs["output_data"]) + + sample_1 = Sample("sample_1", + n=100, + replace=False, + 
hetero_sync=True, + input_data=psi_0.outputs["output_data"] + ) + + pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + pipeline.add_task(sample_0) + pipeline.add_task(sample_1) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/sample/test_sample_unilateral.py b/examples/pipeline/sample/test_sample_unilateral.py new file mode 100644 index 0000000000..8bdc9b3bef --- /dev/null +++ b/examples/pipeline/sample/test_sample_unilateral.py @@ -0,0 +1,80 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import Sample, PSI +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + sample_0 = Sample("sample_0", + runtime_roles=["guest"], + frac={0: 0.5}, + replace=False, + hetero_sync=False, + input_data=psi_0.outputs["output_data"]) + + sample_1 = Sample("sample_1", + runtime_roles=["host"], + n=1000, + replace=True, + hetero_sync=False, + input_data=psi_0.outputs["output_data"] + ) + + pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + pipeline.add_task(sample_0) + pipeline.add_task(sample_1) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + 
parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/scale/scale_testsuite.yaml b/examples/pipeline/scale/scale_testsuite.yaml new file mode 100644 index 0000000000..dfb9771821 --- /dev/null +++ b/examples/pipeline/scale/scale_testsuite.yaml @@ -0,0 +1,42 @@ +data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_0 +tasks: + scale-min-max: + script: test_scale_min_max.py + scale-standard: + script: test_scale_standard.py + scale-with-lr: + script: test_scale_w_lr.py \ No newline at end of file diff --git a/examples/pipeline/scale/test_scale_min_max.py b/examples/pipeline/scale/test_scale_min_max.py new file mode 100644 index 0000000000..2ceb11bc70 --- /dev/null +++ b/examples/pipeline/scale/test_scale_min_max.py @@ -0,0 +1,99 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import PSI, FeatureScale, Statistics +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + feature_scale_0 = FeatureScale("feature_scale_0", + method="min_max", + feature_range={"x0": [-1, 1]}, + scale_col=["x0", "x1", "x3"], + train_data=psi_0.outputs["output_data"]) + + feature_scale_1 = FeatureScale("feature_scale_1", + test_data=psi_1.outputs["output_data"], + 
input_model=feature_scale_0.outputs["output_model"]) + + statistics_0 = Statistics("statistics_0", + metrics=["max", "min", "mean", "std"], + input_data=feature_scale_1.outputs["train_output_data"]) + + pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + pipeline.add_task(feature_scale_0) + pipeline.add_task(feature_scale_1) + pipeline.add_task(statistics_0) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + print(pipeline.get_task_info("statistics_0").get_output_model()) + + pipeline.deploy([psi_0, feature_scale_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/scale/test_scale_standard.py b/examples/pipeline/scale/test_scale_standard.py new file mode 100644 index 0000000000..8bc7625334 --- /dev/null +++ b/examples/pipeline/scale/test_scale_standard.py @@ -0,0 +1,94 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, PSI, FeatureScale, Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + feature_scale_0 = FeatureScale("feature_scale_0", + method="standard", + train_data=psi_0.outputs["output_data"]) + + feature_scale_1 = FeatureScale("feature_scale_1", + test_data=psi_1.outputs["output_data"], + 
input_model=feature_scale_0.outputs["output_model"]) + + pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + pipeline.add_task(feature_scale_0) + pipeline.add_task(feature_scale_1) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + print(pipeline.get_task_info("feature_scale_0").get_output_model()) + # print(pipeline.get_task_info("feature_scale_1").get_output_model()) + + pipeline.deploy([psi_0, feature_scale_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/scale/test_scale_w_lr.py b/examples/pipeline/scale/test_scale_w_lr.py new file mode 100644 index 0000000000..03390a95d4 --- /dev/null +++ b/examples/pipeline/scale/test_scale_w_lr.py @@ -0,0 +1,103 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import CoordinatedLR, PSI, FeatureScale, Evaluation +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + arbiter = parties.arbiter[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host, arbiter=arbiter) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + feature_scale_0 = FeatureScale("feature_scale_0", + method="standard", + train_data=psi_0.outputs["output_data"]) + + lr_0 = CoordinatedLR("lr_0", + epochs=10, + batch_size=None, + optimizer={"method": "SGD", "optimizer_params": {"lr": 0.21}}, + 
init_param={"fit_intercept": True, "method": "random_uniform"}, + train_data=feature_scale_0.outputs["train_output_data"], + learning_rate_scheduler={"method": "linear", "scheduler_params": {"start_factor": 0.7, + "total_iters": 100}}) + + evaluation_0 = Evaluation("evaluation_0", + label_column_name="y", + runtime_roles=["guest"], + default_eval_setting="binary", + input_data=lr_0.outputs["train_output_data"]) + + pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + pipeline.add_task(feature_scale_0) + pipeline.add_task(lr_0) + pipeline.add_task(evaluation_0) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + pipeline.deploy([psi_0, feature_scale_0, lr_0]) + + predict_pipeline = FateFlowPipeline() + + deployed_pipeline = pipeline.get_deployed_pipeline() + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + predict_pipeline.add_task(deployed_pipeline) + predict_pipeline.compile() + # print("\n\n\n") + # print(predict_pipeline.compile().get_dag()) + predict_pipeline.predict() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/statistics/statistics_testsuite.yaml b/examples/pipeline/statistics/statistics_testsuite.yaml new file mode 100644 index 0000000000..c3d80416fe --- /dev/null +++ b/examples/pipeline/statistics/statistics_testsuite.yaml @@ -0,0 +1,38 @@ +data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + 
input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_0 +tasks: + statistics: + script: test_statistics.py diff --git a/examples/pipeline/statistics/test_statistics.py b/examples/pipeline/statistics/test_statistics.py new file mode 100644 index 0000000000..9a17395f2e --- /dev/null +++ b/examples/pipeline/statistics/test_statistics.py @@ -0,0 +1,61 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse
+
+from fate_client.pipeline import FateFlowPipeline
+from fate_client.pipeline.components.fate import PSI, Statistics
+from fate_client.pipeline.interface import DataWarehouseChannel
+from fate_client.pipeline.utils import test_utils
+
+
+def main(config="../config.yaml", namespace=""):
+    if isinstance(config, str):
+        config = test_utils.load_job_config(config)
+    parties = config.parties
+    guest = parties.guest[0]
+    host = parties.host[0]
+
+    pipeline = FateFlowPipeline().set_roles(guest=guest, host=host)
+    if config.task_cores:
+        pipeline.conf.set("task_cores", config.task_cores)
+    if config.timeout:
+        pipeline.conf.set("timeout", config.timeout)
+
+    psi_0 = PSI("psi_0")
+    psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest",
+                                                                  namespace=f"experiment{namespace}"))
+    psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host",
+                                                                     namespace=f"experiment{namespace}"))
+
+    statistics_0 = Statistics("statistics_0", input_data=psi_0.outputs["output_data"],
+                              metrics=["mean", "std", "min", "max"])
+
+    pipeline.add_task(psi_0)
+    pipeline.add_task(statistics_0)
+
+    # pipeline.add_task(hetero_feature_binning_0)
+    pipeline.compile()
+    print(pipeline.get_dag())
+    pipeline.fit()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("PIPELINE DEMO")
+    parser.add_argument("--config", type=str, default="../config.yaml",
+                        help="config file")
+    parser.add_argument("--namespace", type=str, default="",
+                        help="namespace for data stored in FATE")
+    args = parser.parse_args()
+    main(config=args.config, namespace=args.namespace)
diff --git a/examples/pipeline/test_data_split.py b/examples/pipeline/test_data_split.py
deleted file mode 100644
index a84dd4a3a5..0000000000
--- a/examples/pipeline/test_data_split.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#
-# Copyright 2019 The FATE Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import DataSplit -from fate_client.pipeline.components.fate import Intersection -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -intersection_0 = Intersection("intersection_0", - method="raw") -intersection_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", - namespace="experiment")) -intersection_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace="experiment")) - -intersection_1 = Intersection("intersection_1", - method="raw") -intersection_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest_sid", - namespace="experiment")) -intersection_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace="experiment")) - -data_split_0 = DataSplit("data_split_0", - train_size=0.6, - validate_size=0.1, - test_size=None, - input_data=intersection_0.outputs["output_data"]) - -data_split_1 = DataSplit("data_split_1", - train_size=200, - test_size=50, - input_data=intersection_0.outputs["output_data"] - ) - -pipeline.add_task(intersection_0) -pipeline.add_task(intersection_1) -pipeline.add_task(data_split_0) -pipeline.add_task(data_split_1) - -# 
pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() - -# print(pipeline.get_task_info("data_split_0").get_output_data()) -output_data = pipeline.get_task_info("data_split_0").get_output_data() -import pandas as pd - -print(f"data split 0 train size: {pd.DataFrame(output_data['train_output_data']).shape};" - f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" - f"test size: {pd.DataFrame(output_data['test_output_data']).shape}") -output_data = pipeline.get_task_info("data_split_1").get_output_data() -print(f"data split 1train size: {pd.DataFrame(output_data['train_output_data']).shape};" - f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" - f"test size: {pd.DataFrame(output_data['test_output_data']).shape}") diff --git a/examples/pipeline/test_data_split_stratified.py b/examples/pipeline/test_data_split_stratified.py deleted file mode 100644 index 75fb4b9652..0000000000 --- a/examples/pipeline/test_data_split_stratified.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import DataSplit -from fate_client.pipeline.components.fate import Intersection -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -intersection_0 = Intersection("intersection_0", - method="raw") -intersection_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -intersection_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) - -intersection_1 = Intersection("intersection_1", - method="raw") -intersection_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -intersection_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) - -data_split_0 = DataSplit("data_split_0", - train_size=0.6, - validate_size=0.0, - test_size=0.4, - stratified=True, - input_data=intersection_0.outputs["output_data"]) - -data_split_1 = DataSplit("data_split_1", - train_size=200, - test_size=50, - input_data=intersection_0.outputs["output_data"] - ) - -pipeline.add_task(intersection_0) -pipeline.add_task(intersection_1) -pipeline.add_task(data_split_0) -pipeline.add_task(data_split_1) - -# pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() - -# print(pipeline.get_task_info("data_split_0").get_output_data()) -output_data = pipeline.get_task_info("data_split_0").get_output_data() -import pandas as pd - -print(f"data split 0 train size: {pd.DataFrame(output_data['train_output_data']).shape};" - f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" - f"test size: {pd.DataFrame(output_data['test_output_data']).shape}") -output_data = 
pipeline.get_task_info("data_split_1").get_output_data() -print(f"data split 1train size: {pd.DataFrame(output_data['train_output_data']).shape};" - f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" - f"test size: {pd.DataFrame(output_data['test_output_data']).shape}") diff --git a/examples/pipeline/test_linr_sid_cv.py b/examples/pipeline/test_linr_sid_cv.py deleted file mode 100644 index a7e7d3a1e2..0000000000 --- a/examples/pipeline/test_linr_sid_cv.py +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLinR, Intersection -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -intersect_0 = Intersection("intersect_0", method="raw") -intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="motor_hetero_guest", - namespace="experiment_sid")) -intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="motor_hetero_host", - namespace="experiment_sid")) -linr_0 = CoordinatedLinR("linr_0", - epochs=2, - batch_size=100, - optimizer={"method": "sgd", "optimizer_params": {"lr": 0.2}}, - init_param={"fit_intercept": True}, - cv_data=intersect_0.outputs["output_data"], - cv_param={"n_splits": 3}) - -pipeline.add_task(intersect_0) -pipeline.add_task(linr_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() diff --git a/examples/pipeline/test_linr_sid_warm_start.py b/examples/pipeline/test_linr_sid_warm_start.py deleted file mode 100644 index 0fe2bdea06..0000000000 --- a/examples/pipeline/test_linr_sid_warm_start.py +++ /dev/null @@ -1,89 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLinR, Intersection -from fate_client.pipeline.components.fate import Evaluation -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -intersect_0 = Intersection("intersect_0", method="raw") -intersect_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment")) -intersect_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment")) -linr_0 = CoordinatedLinR("linr_0", - epochs=3, - batch_size=None, - optimizer={"method": "sgd", "optimizer_params": {"lr": 0.15}, "alpha": 0.1}, - init_param={"fit_intercept": True, "method": "zeros"}, - train_data=intersect_0.outputs["output_data"], - shuffle=False) -linr_1 = CoordinatedLinR("linr_1", train_data=intersect_0.outputs["output_data"], - warm_start_model=linr_0.outputs["output_model"], - epochs=2, - batch_size=None) -linr_2 = CoordinatedLinR("linr_2", - epochs=5, - batch_size=None, - optimizer={"method": "sgd", "optimizer_params": {"lr": 0.15}, "alpha": 0.1}, - init_param={"fit_intercept": True, "method": "zeros"}, - train_data=intersect_0.outputs["output_data"], - shuffle=False) - -"""linr_0.guest.component_setting(train_data=DataWarehouseChannel(name="breast_hetero_guest_sid", - namespace="experiment")) -linr_0.hosts[0].component_setting(train_data=DataWarehouseChannel(name="breast_hetero_host_sid", - namespace="experiment"))""" - -evaluation_0 = Evaluation("evaluation_0", - runtime_roles=["guest"], - metrics=["r2_score", "mse"], - label_column_name="y", - input_data=[linr_1.outputs["train_output_data"], linr_2.outputs["train_output_data"]]) - -# pipeline.add_task(feature_scale_0) -# pipeline.add_task(feature_scale_1) -pipeline.add_task(intersect_0) -pipeline.add_task(linr_0) -pipeline.add_task(linr_1) 
-pipeline.add_task(linr_2) -pipeline.add_task(evaluation_0) -# pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() -import numpy as np - -linr_0_coef = np.array( - pipeline.get_task_info('linr_0').get_output_model()["output_model"]["data"]['estimator']["param"]["coef_"]) -linr_0_intercept = np.array( - pipeline.get_task_info('linr_0').get_output_model()["output_model"]["data"]['estimator']["param"]["intercept_"]) - -linr_1_coef = np.array( - pipeline.get_task_info('linr_1').get_output_model()["output_model"]["data"]['estimator']["param"]["coef_"]) -linr_1_intercept = np.array( - pipeline.get_task_info('linr_1').get_output_model()["output_model"]["data"]['estimator']["param"]["intercept_"]) -# print(f"linr_1 data: {pipeline.get_task_info('linr_0').get_output_data()}") -linr_2_coef = np.array( - pipeline.get_task_info('linr_2').get_output_model()["output_model"]["data"]['estimator']["param"]["coef_"]) -linr_2_intercept = np.array( - pipeline.get_task_info('linr_2').get_output_model()["output_model"]["data"]['estimator']["param"]["intercept_"]) - -print(f"linr_1 coef: {linr_1_coef}, intercept: {linr_1_intercept}") -print(f"linr_2 coef: {linr_2_coef}, intercept: {linr_2_intercept}") -print(f"linr_1 vs l2_1 coef diff: {linr_1_coef - linr_2_coef}, intercept diff: {linr_1_intercept - linr_2_intercept}") - -print(f"\n evaluation result: {pipeline.get_task_info('evaluation_0').get_output_metric()[0]['data']}") diff --git a/examples/pipeline/test_sample.py b/examples/pipeline/test_sample.py deleted file mode 100644 index a6d54c08f2..0000000000 --- a/examples/pipeline/test_sample.py +++ /dev/null @@ -1,62 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import Intersection -from fate_client.pipeline.components.fate import Sample -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998") - -intersection_0 = Intersection("intersection_0", - method="raw") -intersection_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -intersection_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) - -intersection_1 = Intersection("intersection_1", - method="raw") -intersection_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_sid")) -intersection_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_sid")) - -sample_0 = Sample("sample_0", - frac={0: 0.5}, - replace=False, - hetero_sync=True, - input_data=intersection_0.outputs["output_data"]) - -sample_1 = Sample("sample_1", - runtime_roles=["guest"], - n=1000, - replace=True, - hetero_sync=False, - input_data=intersection_0.outputs["output_data"] - ) - -pipeline.add_task(intersection_0) -pipeline.add_task(intersection_1) -pipeline.add_task(sample_0) -pipeline.add_task(sample_1) - -# pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() -output_data_0 = pipeline.get_task_info("sample_0").get_output_data() 
-output_data_1 = pipeline.get_task_info("sample_1").get_output_data() -print(f"sample 0: {output_data_0};" - f"sample 1: {output_data_1}") diff --git a/examples/pipeline/test_scale.py b/examples/pipeline/test_scale.py deleted file mode 100644 index 1b00541dd3..0000000000 --- a/examples/pipeline/test_scale.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import FeatureScale -from fate_client.pipeline.components.fate import Intersection -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -intersection_0 = Intersection("intersection_0", - method="raw") -intersection_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment")) -intersection_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment")) - -intersection_1 = Intersection("intersection_1", - method="raw") -intersection_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment")) -intersection_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment")) - -feature_scale_0 = FeatureScale("feature_scale_0", - 
method="standard", - train_data=intersection_0.outputs["output_data"]) - -feature_scale_1 = FeatureScale("feature_scale_1", - test_data=intersection_1.outputs["output_data"], - input_model=feature_scale_0.outputs["output_model"]) - -pipeline.add_task(intersection_0) -pipeline.add_task(intersection_1) -pipeline.add_task(feature_scale_0) -pipeline.add_task(feature_scale_1) - -# pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() - -print(pipeline.get_task_info("feature_scale_0").get_output_model()) -# print(pipeline.get_task_info("feature_scale_1").get_output_model()) - -pipeline.deploy([intersection_0, feature_scale_0]) - -predict_pipeline = FateFlowPipeline() - -deployed_pipeline = pipeline.get_deployed_pipeline() -intersection_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment")) -intersection_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment")) - -predict_pipeline.add_task(deployed_pipeline) -predict_pipeline.compile() -# print("\n\n\n") -# print(predict_pipeline.compile().get_dag()) -predict_pipeline.predict() -print(predict_pipeline.get_task_info("feature_scale_0").get_output_model()) diff --git a/examples/pipeline/test_single_linr.py b/examples/pipeline/test_single_linr.py deleted file mode 100644 index ec58f83a78..0000000000 --- a/examples/pipeline/test_single_linr.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLinR -from fate_client.pipeline.components.fate import Evaluation -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -"""feature_scale_0 = FeatureScale(name="feature_scale_0", - method="min_max", - train_data=intersection_0.outputs["output_data"]) - -feature_scale_1 = FeatureScale(name="feature_scale_1", - test_data=intersection_1.outputs["output_data"], - input_model=feature_scale_0.outputs["output_model"])""" - -linr_0 = CoordinatedLinR("linr_0", - epochs=10, - batch_size=None, - init_param={"fit_intercept": False}) - -linr_0.guest.component_setting(train_data=DataWarehouseChannel(name="motor_hetero_guest", - namespace="experiment")) -linr_0.hosts[0].component_setting(train_data=DataWarehouseChannel(name="motor_hetero_host", - namespace="experiment")) - -evaluation_0 = Evaluation("evaluation_0", - runtime_roles=["guest"], - input_data=linr_0.outputs["train_output_data"]) - -# pipeline.add_task(feature_scale_0) -# pipeline.add_task(feature_scale_1) -pipeline.add_task(linr_0) -pipeline.add_task(evaluation_0) -# pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() - -# print(pipeline.get_task_info("statistics_0").get_output_model()) -print(pipeline.get_task_info("linr_0").get_output_model()) -print(pipeline.get_task_info("linr_0").get_output_data()) 
-print(pipeline.get_task_info("evaluation_0").get_output_metrics()) - -pipeline.deploy([linr_0]) - -predict_pipeline = FateFlowPipeline() - -deployed_pipeline = pipeline.get_deployed_pipeline() -deployed_pipeline.linr_0.guest.component_setting(test_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment")) -deployed_pipeline.linr_0.hosts[0].component_setting(test_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment")) - -predict_pipeline.add_task(deployed_pipeline) -predict_pipeline.compile() -# print("\n\n\n") -# print(predict_pipeline.compile().get_dag()) -predict_pipeline.predict() diff --git a/examples/pipeline/test_single_lr.py b/examples/pipeline/test_single_lr.py deleted file mode 100644 index fb23747d3f..0000000000 --- a/examples/pipeline/test_single_lr.py +++ /dev/null @@ -1,71 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR -from fate_client.pipeline.components.fate import Evaluation -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -lr_0 = CoordinatedLR("lr_0", - epochs=10, - batch_size=100, - optimizer={"method": "sgd", "optimizer_params": {"lr": 0.1}, "alpha": 0.5}, - init_param={"fit_intercept": True}) -lr_1 = CoordinatedLR("lr_1", input_model=lr_0.outputs["output_model"], - test_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_64") - ) - -lr_0.guest.component_setting(train_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_64")) -lr_0.hosts[0].component_setting(train_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_64")) - -evaluation_0 = Evaluation("evaluation_0", - runtime_roles=["guest"], - input_data=lr_0.outputs["train_output_data"]) - -# pipeline.add_task(feature_scale_0) -# pipeline.add_task(feature_scale_1) -pipeline.add_task(lr_0) -pipeline.add_task(lr_1) -pipeline.add_task(evaluation_0) -# pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() - -# print(pipeline.get_task_info("statistics_0").get_output_model()) -print(pipeline.get_task_info("lr_0").get_output_model()) -print(pipeline.get_task_info("lr_0").get_output_metric()) -print(f"evaluation metric: ") -print(pipeline.get_task_info("evaluation_0").get_output_metric()) - -pipeline.deploy([lr_0]) - -predict_pipeline = FateFlowPipeline() - -deployed_pipeline = pipeline.get_deployed_pipeline() -deployed_pipeline.lr_0.guest.component_setting(test_data=DataWarehouseChannel(name="breast_hetero_guest_data", - namespace="experiment")) -deployed_pipeline.lr_0.hosts[0].component_setting(test_data=DataWarehouseChannel(name="breast_hetero_guest_data", - namespace="experiment")) - 
-predict_pipeline.add_task(deployed_pipeline) -predict_pipeline.compile() -# print("\n\n\n") -# print(predict_pipeline.compile().get_dag()) -predict_pipeline.predict() diff --git a/examples/pipeline/test_single_lr_multi.py b/examples/pipeline/test_single_lr_multi.py deleted file mode 100644 index 3dc1a6e41c..0000000000 --- a/examples/pipeline/test_single_lr_multi.py +++ /dev/null @@ -1,73 +0,0 @@ -# -# Copyright 2019 The FATE Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR -from fate_client.pipeline.components.fate import Evaluation -from fate_client.pipeline.interface import DataWarehouseChannel - -pipeline = FateFlowPipeline().set_roles(guest="9999", host="9998", arbiter="9998") - -"""feature_scale_0 = FeatureScale(name="feature_scale_0", - method="min_max", - train_data=intersection_0.outputs["output_data"]) - -feature_scale_1 = FeatureScale(name="feature_scale_1", - test_data=intersection_1.outputs["output_data"], - input_model=feature_scale_0.outputs["output_model"])""" - -lr_0 = CoordinatedLR("lr_0", - epochs=10, - batch_size=None, - init_param={"fit_intercept": False}) - -lr_0.guest.component_setting(train_data=DataWarehouseChannel(name="vehicle_scale_hetero_guest", - namespace="experiment_64")) -lr_0.hosts[0].component_setting(train_data=DataWarehouseChannel(name="vehicle_scale_hetero_guest", - namespace="experiment_64")) - -evaluation_0 = Evaluation("evaluation_0", - default_eval_metrics="multi", - runtime_roles=["guest"], - input_data=lr_0.outputs["train_output_data"]) - -# pipeline.add_task(feature_scale_0) -# pipeline.add_task(feature_scale_1) -pipeline.add_task(lr_0) -pipeline.add_task(evaluation_0) -# pipeline.add_task(hetero_feature_binning_0) -pipeline.compile() -print(pipeline.get_dag()) -pipeline.fit() - -# print(pipeline.get_task_info("statistics_0").get_output_model()) -print(pipeline.get_task_info("lr_0").get_output_model()) -print(pipeline.get_task_info("lr_0").get_output_data()) -print(pipeline.get_task_info("evaluation_0").get_output_metrics()) - -pipeline.deploy([lr_0]) - -predict_pipeline = FateFlowPipeline() - -deployed_pipeline = pipeline.get_deployed_pipeline() -deployed_pipeline.lr_0.guest.component_setting(test_data=DataWarehouseChannel(name="breast_hetero_guest", - namespace="experiment_64")) 
-deployed_pipeline.lr_0.hosts[0].component_setting(test_data=DataWarehouseChannel(name="breast_hetero_host", - namespace="experiment_64")) - -predict_pipeline.add_task(deployed_pipeline) -predict_pipeline.compile() -# print("\n\n\n") -# print(predict_pipeline.compile().get_dag()) -predict_pipeline.predict() diff --git a/examples/pipeline/union/test_union.py b/examples/pipeline/union/test_union.py new file mode 100644 index 0000000000..a1138117e1 --- /dev/null +++ b/examples/pipeline/union/test_union.py @@ -0,0 +1,81 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import DataSplit, PSI, Union +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + data_split_0 = DataSplit("data_split_0", + train_size=0.6, + validate_size=0.1, + input_data=psi_0.outputs["output_data"]) + + data_split_1 = DataSplit("data_split_1", + train_size=200, + test_size=50, + input_data=psi_0.outputs["output_data"] + ) + + union_0 = Union("union_0", input_data_list=[data_split_0.outputs["train_output_data"], + data_split_0.outputs["test_output_data"]]) + pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + pipeline.add_task(data_split_0) + pipeline.add_task(data_split_1) + pipeline.add_task(union_0) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + print(pipeline.get_dag()) + pipeline.fit() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") 
+ parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/union/union_testsuite.yaml b/examples/pipeline/union/union_testsuite.yaml new file mode 100644 index 0000000000..b5eab53a5b --- /dev/null +++ b/examples/pipeline/union/union_testsuite.yaml @@ -0,0 +1,38 @@ +data: + - file: examples/data/breast_hetero_guest.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + label_type: int64 + label_name: y + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_guest + namespace: experiment + role: guest_0 + - file: examples/data/breast_hetero_host.csv + meta: + delimiter: "," + dtype: float64 + input_format: dense + match_id_name: id + match_id_range: 0 + tag_value_delimiter: ":" + tag_with_value: false + weight_type: float64 + partitions: 4 + head: true + extend_sid: true + table_name: breast_hetero_host + namespace: experiment + role: host_0 +tasks: + union: + script: test_union.py diff --git a/python/fate/ml/glm/hetero/coordinated_lr/guest.py b/python/fate/ml/glm/hetero/coordinated_lr/guest.py index 1969bba4d2..d849aabeb6 100644 --- a/python/fate/ml/glm/hetero/coordinated_lr/guest.py +++ b/python/fate/ml/glm/hetero/coordinated_lr/guest.py @@ -157,7 +157,7 @@ def predict(self, ctx, test_data): for i, class_ctx in ctx.sub_ctx("class").ctxs_range(len(self.labels)): estimator = self.estimator[i] pred = estimator.predict(class_ctx, test_data) - pred_score[self.labels[i]] = pred + pred_score[str(self.labels[i])] = pred pred_df[predict_tools.PREDICT_SCORE] = pred_score.apply_row(lambda v: [list(v)]) predict_result = predict_tools.compute_predict_details( pred_df, 
task_type=predict_tools.MULTI, classes=self.labels diff --git a/python/fate_test/fate_test/scripts/data_cli.py b/python/fate_test/fate_test/scripts/data_cli.py index 7a09980dd2..0c5c58e455 100644 --- a/python/fate_test/fate_test/scripts/data_cli.py +++ b/python/fate_test/fate_test/scripts/data_cli.py @@ -8,11 +8,11 @@ from pathlib import Path import click -# from fate_test._client import Clients +from fate_test._client import Clients from fate_test._config import Config from fate_test._io import LOGGER, echo from fate_test.scripts._options import SharedOptions -from fate_test.scripts._utils import _load_testsuites, _delete_data, _big_data_task +from fate_test.scripts._utils import _load_testsuites, _delete_data, _big_data_task, _upload_data from ruamel import yaml from fate_test import _config @@ -28,14 +28,15 @@ def data_group(): @data_group.command("upload") @click.option('-i', '--include', required=False, type=click.Path(exists=True), multiple=True, metavar="", - help="include *benchmark.json under these paths") + help="include *benchmark.yaml under these paths") @click.option('-e', '--exclude', type=click.Path(exists=True), multiple=True, help="exclude *benchmark.json under these paths") @click.option("-t", "--config-type", type=click.Choice(["min_test", "all_examples"]), default="min_test", help="config file") @click.option('-g', '--glob', type=str, help="glob string to filter sub-directory of path specified by ") -@click.option('-s', '--suite-type', required=False, type=click.Choice(["testsuite", "benchmark"]), default="testsuite", +@click.option('-s', '--suite-type', required=False, type=click.Choice(["testsuite", "benchmark", "performance"]), + default="testsuite", help="suite type") @click.option('-r', '--role', type=str, default='all', help="role to process, default to `all`. 
" "use option likes: `guest_0`, `host_0`, `host`") @@ -56,9 +57,17 @@ def upload(ctx, include, exclude, glob, suite_type, role, config_type, **kwargs) yes = ctx.obj["yes"] echo.welcome() echo.echo(f"testsuite namespace: {namespace}", fg='red') + client = Clients(config_inst) if len(include) != 0: echo.echo("loading testsuites:") - suffix = "benchmark.json" if suite_type == "benchmark" else "testsuite.json" + if suite_type == "benchmark": + suffix = "benchmark.yaml" + elif suite_type == "testsuite": + suite_type = "testsuite.yaml" + elif suite_type == "performance": + suffix = "performance.yaml" + else: + raise ValueError(f"unknown suite type: {suite_type}") suites = _load_testsuites(includes=include, excludes=exclude, glob=glob, suffix=suffix, suite_type=suite_type) for suite in suites: @@ -67,8 +76,9 @@ def upload(ctx, include, exclude, glob, suite_type, role, config_type, **kwargs) echo.echo(f"\tdataset({len(suite.dataset)}) {suite.path}") if not yes and not click.confirm("running?"): return - # client_upload(suites=suites, config_inst=config_inst, namespace=namespace) - # todo: upload with pipeline + + for suite in suites: + _upload_data(client, suite, config_inst) else: config = get_config(config_inst) if config_type == 'min_test': @@ -77,14 +87,12 @@ def upload(ctx, include, exclude, glob, suite_type, role, config_type, **kwargs) config_file = config.all_examples_data_config with open(config_file, 'r', encoding='utf-8') as f: - upload_data = json.loads(f.read()) + upload_data = yaml.safe_load(f.read()) echo.echo(f"\tdataset({len(upload_data['data'])}) {config_file}") if not yes and not click.confirm("running?"): return - """with Clients(config_inst) as client: - data_upload(client, config_inst, upload_data)""" - # @todo: upload data with pipeline + _upload_data(client, upload_data, config_inst) echo.farewell() echo.echo(f"testsuite namespace: {namespace}", fg='red') @@ -121,9 +129,9 @@ def delete(ctx, include, exclude, glob, yes, suite_type, **kwargs): 
echo.echo(f"\tdataset({len(suite.dataset)}) {suite.path}") if not yes and not click.confirm("running?"): return - with Clients(config_inst) as client: - for i, suite in enumerate(suites): - _delete_data(client, suite) + client = Clients(config_inst) + for i, suite in enumerate(suites): + _delete_data(client, suite) echo.farewell() echo.echo(f"testsuite namespace: {namespace}", fg='red') @@ -200,11 +208,12 @@ def generate(ctx, include, host_data_type, encryption_type, match_rate, sparsity _big_data_task(include, guest_data_size, host_data_size, guest_feature_num, host_feature_num, host_data_type, config_inst, encryption_type, match_rate, sparsity, force, split_host, output_path, parallelize) if upload_data: - if use_local_data: + """if use_local_data: _config.use_local_data = 0 - _config.data_switch = remove_data - # client_upload(suites=suites, config_inst=config_inst, namespace=namespace, output_path=output_path) - # todo: upload with pipeline + _config.data_switch = remove_data""" + client = Clients(config_inst) + for suite in suites: + _upload_data(client, upload_data, config_inst) @data_group.command("download") @@ -265,6 +274,7 @@ def query_schema(ctx, component_name, job_id, role, party_id, **kwargs): if not yes and not click.confirm("running?"): return + client = Clients(config_inst) # todo: upload data with pipeline """with Clients(config_inst) as client: query_component_output_data(client, config_inst, component_name, job_id, role, party_id)""" From af8417c1e1c74ba9f84d93a86ea3a1a1d756d28c Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 10 Aug 2023 20:09:43 +0800 Subject: [PATCH 21/30] edit bq examples(#5008) Signed-off-by: Yu Wu --- examples/benchmark_quality/lr/pipeline-lr-multi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/benchmark_quality/lr/pipeline-lr-multi.py b/examples/benchmark_quality/lr/pipeline-lr-multi.py index ed3851e510..8088df2917 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-multi.py +++ 
b/examples/benchmark_quality/lr/pipeline-lr-multi.py @@ -71,6 +71,7 @@ def main(config="../../config.yaml", param="./vehicle_config.yaml", namespace="" input_model=lr_0.outputs["output_model"]) evaluation_0 = Evaluation('evaluation_0', + runtime_roles=['guest'], input_data=lr_0.outputs["train_output_data"], metrics=['multi_recall', 'multi_accuracy', 'multi_precision']) pipeline.add_task(psi_0) From d31f580f31e6c785e344c393121ad2c9931f5cb4 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Thu, 10 Aug 2023 20:19:53 +0800 Subject: [PATCH 22/30] rm unittest cli(#5008) Signed-off-by: Yu Wu --- python/fate_test/fate_test/scripts/cli.py | 4 +- .../fate_test/scripts/quick_test_cli.py | 95 ------------------- 2 files changed, 2 insertions(+), 97 deletions(-) delete mode 100644 python/fate_test/fate_test/scripts/quick_test_cli.py diff --git a/python/fate_test/fate_test/scripts/cli.py b/python/fate_test/fate_test/scripts/cli.py index 8dc444c7d8..f59bd6c4d4 100644 --- a/python/fate_test/fate_test/scripts/cli.py +++ b/python/fate_test/fate_test/scripts/cli.py @@ -22,7 +22,7 @@ from fate_test.scripts.data_cli import data_group # from fate_test.scripts.flow_test_cli import flow_group from fate_test.scripts.performance_cli import run_task -from fate_test.scripts.quick_test_cli import unittest_group +# from fate_test.scripts.quick_test_cli import unittest_group # from fate_test.scripts.secure_protocol_cli import secure_protocol_group from fate_test.scripts.testsuite_cli import run_suite @@ -32,7 +32,7 @@ "performance": run_task, "benchmark-quality": run_benchmark, "data": data_group, - "unittest": unittest_group + # "unittest": unittest_group } commands_alias = { diff --git a/python/fate_test/fate_test/scripts/quick_test_cli.py b/python/fate_test/fate_test/scripts/quick_test_cli.py deleted file mode 100644 index 08f95e9964..0000000000 --- a/python/fate_test/fate_test/scripts/quick_test_cli.py +++ /dev/null @@ -1,95 +0,0 @@ -import os -import subprocess - -import click -from 
fate_test._config import Config -from fate_test._io import echo -from fate_test.scripts._options import SharedOptions - - -@click.group(name="unittest") -def unittest_group(): - """ - unit test - """ - ... - - -@unittest_group.command("federatedml") -@click.option('-i', '--include', type=click.Path(exists=True), multiple=True, metavar="", - help="Specify federatedml test units for testing") -@SharedOptions.get_shared_options(hidden=True) -@click.pass_context -def unit_test(ctx, include, **kwargs): - """ - federatedml unit test - """ - ctx.obj.update(**kwargs) - ctx.obj.post_process() - namespace = ctx.obj["namespace"] - config_inst = ctx.obj["config"] - yes = ctx.obj["yes"] - echo.echo(f"testsuite namespace: {namespace}", fg='red') - - if not yes and not click.confirm("running?"): - return - - error_log_file = f"./logs/{namespace}/error_test.log" - os.makedirs(os.path.dirname(error_log_file), exist_ok=True) - run_test(includes=include, conf=config_inst, error_log_file=error_log_file) - - -def run_test(includes, conf: Config, error_log_file): - def error_log(stdout): - if stdout is None: - return os.path.abspath(error_log_file) - with open(error_log_file, "a") as f: - f.write(stdout) - - def run_test(file): - global failed_count - echo.echo("start to run test {}".format(file)) - try: - subp = subprocess.Popen(["python", file], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - stdout, stderr = subp.communicate() - stdout = stdout.decode("utf-8") - echo.echo(stdout) - if "FAILED" in stdout: - failed_count += 1 - error_log(stdout=f"error sequence {failed_count}: {file}") - error_log(stdout=stdout) - except Exception: - return - - def traverse_folder(file_fullname): - if os.path.isfile(file_fullname): - if "_test.py" in file_fullname and "ftl" not in file_fullname: - run_test(file_fullname) - else: - for file in os.listdir(file_fullname): - file_fullname_new = os.path.join(file_fullname, file) - if os.path.isdir(file_fullname_new): - 
traverse_folder(file_fullname_new) - if "_test.py" in file and ("/test" in file_fullname or "tests" in file_fullname): - if "ftl" in file_fullname_new: - continue - else: - run_test(file_fullname_new) - - global failed_count - failed_count = 0 - fate_base = conf.fate_base - ml_dir = os.path.join(fate_base, "python/federatedml") - PYTHONPATH = os.environ.get('PYTHONPATH') + ":" + os.path.join(fate_base, "python") - os.environ['PYTHONPATH'] = PYTHONPATH - if len(includes) == 0: - traverse_folder(ml_dir) - else: - ml_dir = includes - for v in ml_dir: - traverse_folder(os.path.abspath(v)) - - echo.echo(f"there are {failed_count} failed test") - if failed_count > 0: - print('Please check the error content: {}'.format(error_log(None))) From 8dd7c033bf7f42305ba222a9450e5c2f86c620f6 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 11 Aug 2023 10:11:19 +0800 Subject: [PATCH 23/30] fix multi lr bq pipeline script(#5008) Signed-off-by: Yu Wu --- examples/benchmark_quality/lr/default_credit_config.yaml | 4 ++-- examples/benchmark_quality/lr/give_credit_config.yaml | 6 +++--- examples/benchmark_quality/lr/pipeline-lr-multi.py | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/benchmark_quality/lr/default_credit_config.yaml b/examples/benchmark_quality/lr/default_credit_config.yaml index b547c333b9..dacc80dcd2 100644 --- a/examples/benchmark_quality/lr/default_credit_config.yaml +++ b/examples/benchmark_quality/lr/default_credit_config.yaml @@ -2,7 +2,7 @@ data_guest: "default_credit_hetero_guest" data_host: "default_credit_hetero_host" idx: "id" label_name: "y" -epochs: 30 +epochs: 20 init_param: fit_intercept: True method: "zeros" @@ -17,6 +17,6 @@ optimizer: penalty: "L2" alpha: 0.001 optimizer_params: - lr: 0.21 + lr: 0.17 batch_size: 3200 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/give_credit_config.yaml b/examples/benchmark_quality/lr/give_credit_config.yaml index 480077d4ec..f5e47fcc76 100644 --- 
a/examples/benchmark_quality/lr/give_credit_config.yaml +++ b/examples/benchmark_quality/lr/give_credit_config.yaml @@ -2,7 +2,7 @@ data_guest: "give_credit_hetero_guest" data_host: "give_credit_hetero_host" idx: "id" label_name: "y" -epochs: 12 +epochs: 16 init_param: fit_intercept: True method: "zeros" @@ -13,9 +13,9 @@ learning_rate_scheduler: total_iters: 1000 optimizer: method: "rmsprop" - penalty: "L2" + penalty: "L1" alpha: 0.01 optimizer_params: - lr: 0.29 + lr: 0.25 batch_size: 5500 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/pipeline-lr-multi.py b/examples/benchmark_quality/lr/pipeline-lr-multi.py index 8088df2917..463d3cc91a 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-multi.py +++ b/examples/benchmark_quality/lr/pipeline-lr-multi.py @@ -72,6 +72,7 @@ def main(config="../../config.yaml", param="./vehicle_config.yaml", namespace="" evaluation_0 = Evaluation('evaluation_0', runtime_roles=['guest'], + label_column_name=param.get("label_name"), input_data=lr_0.outputs["train_output_data"], metrics=['multi_recall', 'multi_accuracy', 'multi_precision']) pipeline.add_task(psi_0) From a57015133139294f3652a1853ffe200423b8f3bb Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Fri, 11 Aug 2023 13:33:09 +0800 Subject: [PATCH 24/30] dataframe: fix where op, _sample_util bug Signed-off-by: mgqa34 --- .../arch/dataframe/ops/_dimension_scaling.py | 24 ++++++++++++------- python/fate/arch/dataframe/ops/_where.py | 18 ++++++++++---- python/fate/arch/dataframe/utils/_sample.py | 2 +- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/python/fate/arch/dataframe/ops/_dimension_scaling.py b/python/fate/arch/dataframe/ops/_dimension_scaling.py index 8f75901874..81822752aa 100644 --- a/python/fate/arch/dataframe/ops/_dimension_scaling.py +++ b/python/fate/arch/dataframe/ops/_dimension_scaling.py @@ -148,7 +148,7 @@ def _align_blocks(blocks, align_fields_loc=None, full_block_migrate_set=None, ds r_flatten = 
r_block_table.mapPartitions(r_flatten_func, use_previous_behavior=False) l_flatten = l_flatten.union(r_flatten) - partition_order_mappings = get_partition_order_by_raw_table(l_flatten) + partition_order_mappings = get_partition_order_by_raw_table(l_flatten, data_manager.block_row_size) _convert_to_block_func = functools.partial(to_blocks, dm=data_manager, partition_mappings=partition_order_mappings) block_table = l_flatten.mapPartitions(_convert_to_block_func, use_previous_behavior=False) block_table, data_manager = compress_blocks(block_table, data_manager) @@ -187,7 +187,9 @@ def drop(df: "DataFrame", index: "DataFrame" = None) -> "DataFrame": r_flatten_table = index.block_table.mapPartitions(r_flatten_func, use_previous_behavior=False) drop_flatten = l_flatten_table.subtractByKey(r_flatten_table) - partition_order_mappings = get_partition_order_by_raw_table(drop_flatten) if drop_flatten.count() else dict() + partition_order_mappings = get_partition_order_by_raw_table( + drop_flatten, data_manager.block_row_size + ) if drop_flatten.count() else dict() _convert_to_block_func = functools.partial(to_blocks, dm=data_manager, @@ -286,16 +288,20 @@ def _flatten_partition(kvs, block_num=0): def to_blocks(kvs, dm: DataManager = None, partition_mappings: dict = None): - ret_blocks = [[] for i in range(dm.block_num)] + ret_blocks = [[] for _ in range(dm.block_num)] - partition_id = None - for sample_id, value in kvs: - if partition_id is None: - partition_id = partition_mappings[sample_id]["block_id"] + block_id = None + for lid, (sample_id, value) in enumerate(kvs): + if block_id is None: + block_id = partition_mappings[sample_id]["start_block_id"] ret_blocks[0].append(sample_id) for bid, buf in enumerate(value): ret_blocks[bid + 1].append(buf) - ret_blocks = dm.convert_to_blocks(ret_blocks) + if (lid + 1) % dm.block_row_size == 0: + yield block_id, dm.convert_to_blocks(ret_blocks) + ret_blocks = [[] for i in range(dm.block_num)] + block_id += 1 - return [(partition_id, 
ret_blocks)] + if ret_blocks[0]: + yield block_id, dm.convert_to_blocks(ret_blocks) diff --git a/python/fate/arch/dataframe/ops/_where.py b/python/fate/arch/dataframe/ops/_where.py index 96a195bcbd..b04aaea9d2 100644 --- a/python/fate/arch/dataframe/ops/_where.py +++ b/python/fate/arch/dataframe/ops/_where.py @@ -22,6 +22,16 @@ def where(df: DataFrame, other: DataFrame): + """ + df[mask]触发该操作 + a. mask的列可能于df不一致,这个时候,df在mask中不出现的列均为nan + (1) columns完全对等 + (2) columns一致,但顺序不一致 + (3) mask columns数少于df columns数 + b. 当mask中某一列有false的时候,需要考虑类型问题:如果原类型为int/bool等,需要上升为float32,如果为float32,保持不变 + (1) mask 计算哪些列出现False,提前做列类型对齐 + c. 要求df与mask的key是一致的 + """ if df.shape[0] != other.shape[0]: raise ValueError("Row numbers should be identical.") @@ -106,7 +116,7 @@ def _where_float_type(l_block_table, r_block_table, def __convert_na(l_blocks, r_blocks): ret_blocks = [] - for block in ret_blocks: + for block in l_blocks: if isinstance(block, torch.Tensor): ret_blocks.append(block.clone()) elif isinstance(block, np.ndarray): @@ -115,10 +125,10 @@ def __convert_na(l_blocks, r_blocks): ret_blocks.append(block) for (l_bid, l_offset), (r_bid, r_offset) in zip(l_loc_info, r_loc_info): - if isinstance(ret_blocks[l_blocks], torch.Tensor): - ret_blocks[l_bid][:, l_offset][~r_blocks[r_bid][: r_offset]] = torch.nan + if isinstance(ret_blocks[l_bid], torch.Tensor): + ret_blocks[l_bid][:, l_offset][~r_blocks[r_bid][:, r_offset]] = torch.nan else: - ret_blocks[l_bid][:, l_offset][~r_blocks[r_bid][: r_offset]] = np.nan + ret_blocks[l_bid][:, l_offset][~r_blocks[r_bid][:, r_offset]] = np.nan return ret_blocks diff --git a/python/fate/arch/dataframe/utils/_sample.py b/python/fate/arch/dataframe/utils/_sample.py index 1887b45de6..9d4ca78d70 100644 --- a/python/fate/arch/dataframe/utils/_sample.py +++ b/python/fate/arch/dataframe/utils/_sample.py @@ -186,7 +186,7 @@ def _convert_raw_table_to_df( ): from ..ops._indexer import get_partition_order_by_raw_table from ..ops._dimension_scaling import 
to_blocks - partition_order_mapping = get_partition_order_by_raw_table(table) + partition_order_mapping = get_partition_order_by_raw_table(table, data_manager.block_row_size) to_block_func = functools.partial(to_blocks, dm=data_manager, partition_mappings=partition_order_mapping) block_table = table.mapPartitions(to_block_func, use_previous_behavior=False) From 5f6ee9d19668fffb15044462754a3d3102ce4de5 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 11 Aug 2023 14:10:41 +0800 Subject: [PATCH 25/30] make input models optional for hetero feature selection(#4661) edit pipeline examples(#5008) Signed-off-by: Yu Wu --- .../test_feature_binning_asymmetric.py | 10 +++---- .../test_feature_binning_bucket.py | 12 ++++---- .../test_feature_binning_quantile.py | 10 +++---- .../selection_testsuite.yaml | 4 +-- .../test_feature_selection_binning.py | 11 ++++--- .../test_feature_selection_manual.py | 12 ++++---- .../test_feature_selection_multi_model.py | 11 ++++--- .../test_feature_selection_statistics.py | 10 +++---- examples/pipeline/multi_model/test_multi.py | 6 ++-- examples/pipeline/scale/test_scale_min_max.py | 14 ++++----- .../pipeline/scale/test_scale_standard.py | 14 ++++----- examples/pipeline/scale/test_scale_w_lr.py | 12 ++++---- .../pipeline/statistics/test_statistics.py | 2 +- .../components/hetero_feature_selection.py | 30 +++++++++---------- 14 files changed, 78 insertions(+), 80 deletions(-) diff --git a/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py b/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py index 9b353527de..2662615184 100644 --- a/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py +++ b/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py @@ -36,7 +36,7 @@ def main(config="../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) 
- psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) binning_0 = HeteroFeatureBinning("binning_0", @@ -70,10 +70,10 @@ def main(config="../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py b/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py index f40c443070..fae56d4dc4 100644 --- a/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py +++ b/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py @@ -36,11 +36,11 @@ def main(config="../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) psi_1 = PSI("psi_1") - psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + 
psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) @@ -74,10 +74,10 @@ def main(config="../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py b/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py index 8a0b9819a8..727f622089 100644 --- a/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py +++ b/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py @@ -36,7 +36,7 @@ def main(config="../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) binning_0 = HeteroFeatureBinning("binning_0", @@ -69,10 +69,10 @@ def main(config="../config.yaml", namespace=""): predict_pipeline = 
FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/hetero_feature_selection/selection_testsuite.yaml b/examples/pipeline/hetero_feature_selection/selection_testsuite.yaml index 050dc39a14..5d0778dcb1 100644 --- a/examples/pipeline/hetero_feature_selection/selection_testsuite.yaml +++ b/examples/pipeline/hetero_feature_selection/selection_testsuite.yaml @@ -38,7 +38,7 @@ tasks: script: test_feature_selection_binning.py selection-manual: script: test_feature_selection_manual.py - binning-statistics: + selection-statistics: script: test_feature_selection_statistics.py - binning-multi-model: + selection-multi-model: script: test_feature_selection_multi_model.py \ No newline at end of file diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py index 95b06406a4..0e969544f9 100644 --- a/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py @@ -36,13 +36,12 @@ def main(config=".../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - 
psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) binning_0 = HeteroFeatureBinning("binning_0", method="quantile", n_bins=10, - bin_col=["x0"], transform_method="bin_idx", train_data=psi_0.outputs["output_data"] ) @@ -68,10 +67,10 @@ def main(config=".../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py index ab4a7729de..722bb36c18 100644 --- a/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py @@ -36,11 +36,11 @@ def main(config=".../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) 
selection_0 = HeteroFeatureSelection("selection_0", - method=["statistics"], + method=["manual"], train_data=psi_0.outputs["output_data"]) selection_0.guest.component_setting(manual_param={"keep_col": ["x0", "x1"]}) selection_0.hosts[0].component_setting(manual_param={"filter_out_col": ["x0", "x1"]}) @@ -60,10 +60,10 @@ def main(config=".../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py index 551c1d81e7..b0dc8440ea 100644 --- a/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py @@ -36,13 +36,12 @@ def main(config=".../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) binning_0 = HeteroFeatureBinning("binning_0", method="quantile", 
n_bins=10, - bin_col=["x0"], transform_method="bin_idx", train_data=psi_0.outputs["output_data"] ) @@ -74,10 +73,10 @@ def main(config=".../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py index c614a89e93..bb3a3c9839 100644 --- a/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py @@ -36,7 +36,7 @@ def main(config=".../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) statistics_0 = Statistics("statistics_0", input_data=psi_0.outputs["output_data"]) @@ -63,10 +63,10 @@ def main(config=".../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - 
psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/multi_model/test_multi.py b/examples/pipeline/multi_model/test_multi.py index 3ea0424ef7..25f2a4b9d2 100644 --- a/examples/pipeline/multi_model/test_multi.py +++ b/examples/pipeline/multi_model/test_multi.py @@ -16,7 +16,7 @@ from fate_client.pipeline import FateFlowPipeline from fate_client.pipeline.components.fate import PSI, HeteroFeatureSelection, HeteroFeatureBinning, \ - FeatureScale, Union, DataSplit, CoordinatedLR, CoordinatedLinR, Statistics, Sample, Evaluation + FeatureScale, Union, DataSplit, CoordinatedLR, Statistics, Sample, Evaluation from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils @@ -38,7 +38,7 @@ def main(config="../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) data_split_0 = DataSplit("data_split_0", input_data=psi_0.outputs["output_data"], @@ -109,7 +109,7 @@ def main(config="../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = 
pipeline.get_deployed_pipeline() - psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) diff --git a/examples/pipeline/scale/test_scale_min_max.py b/examples/pipeline/scale/test_scale_min_max.py index 2ceb11bc70..71f12abab5 100644 --- a/examples/pipeline/scale/test_scale_min_max.py +++ b/examples/pipeline/scale/test_scale_min_max.py @@ -36,11 +36,11 @@ def main(config="../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) psi_1 = PSI("psi_1") - psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) @@ -57,7 +57,7 @@ def main(config="../config.yaml", namespace=""): statistics_0 = Statistics("statistics_0", metrics=["max", "min", "mean", "std"], - input_data=feature_scale_1.outputs["train_output_data"]) + input_data=feature_scale_1.outputs["test_output_data"]) pipeline.add_task(psi_0) pipeline.add_task(psi_1) @@ -77,10 +77,10 @@ def main(config="../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - 
psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/scale/test_scale_standard.py b/examples/pipeline/scale/test_scale_standard.py index 8bc7625334..008e7c2a75 100644 --- a/examples/pipeline/scale/test_scale_standard.py +++ b/examples/pipeline/scale/test_scale_standard.py @@ -15,7 +15,7 @@ import argparse from fate_client.pipeline import FateFlowPipeline -from fate_client.pipeline.components.fate import CoordinatedLR, PSI, FeatureScale, Evaluation +from fate_client.pipeline.components.fate import PSI, FeatureScale from fate_client.pipeline.interface import DataWarehouseChannel from fate_client.pipeline.utils import test_utils @@ -37,11 +37,11 @@ def main(config="../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) psi_1 = PSI("psi_1") - psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) 
psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) @@ -72,10 +72,10 @@ def main(config="../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/scale/test_scale_w_lr.py b/examples/pipeline/scale/test_scale_w_lr.py index 03390a95d4..2a06ed5e00 100644 --- a/examples/pipeline/scale/test_scale_w_lr.py +++ b/examples/pipeline/scale/test_scale_w_lr.py @@ -37,11 +37,11 @@ def main(config="../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) psi_1 = PSI("psi_1") - psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) @@ -81,10 +81,10 @@ def 
main(config="../config.yaml", namespace=""): predict_pipeline = FateFlowPipeline() deployed_pipeline = pipeline.get_deployed_pipeline() - psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", - namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + deployed_pipeline.psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) predict_pipeline.add_task(deployed_pipeline) predict_pipeline.compile() diff --git a/examples/pipeline/statistics/test_statistics.py b/examples/pipeline/statistics/test_statistics.py index 9a17395f2e..e5e7605856 100644 --- a/examples/pipeline/statistics/test_statistics.py +++ b/examples/pipeline/statistics/test_statistics.py @@ -36,7 +36,7 @@ def main(config=".../config.yaml", namespace=""): psi_0 = PSI("psi_0") psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", namespace=f"experiment{namespace}")) - psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) statistics_0 = Statistics("statistics_0", input_data=psi_0.outputs["output_data"], diff --git a/python/fate/components/components/hetero_feature_selection.py b/python/fate/components/components/hetero_feature_selection.py index 9291965205..8e23a91089 100644 --- a/python/fate/components/components/hetero_feature_selection.py +++ b/python/fate/components/components/hetero_feature_selection.py @@ -29,20 +29,20 @@ def hetero_feature_selection(ctx, role): @hetero_feature_selection.train() def train( - ctx: Context, - role: Role, - 
train_data: cpn.dataframe_input(roles=[GUEST, HOST]), - input_models: cpn.json_model_inputs(roles=[GUEST, HOST]), - method: cpn.parameter( - type=List[params.string_choice(["manual", "iv", "statistics"])], - default=["manual"], - optional=False, - desc="selection method, options: {manual, binning, statistics}", - ), - select_col: cpn.parameter( - type=List[str], - default=None, - desc="list of column names to be selected, if None, all columns will be considered", + ctx: Context, + role: Role, + train_data: cpn.dataframe_input(roles=[GUEST, HOST]), + input_models: cpn.json_model_inputs(roles=[GUEST, HOST], optional=True), + method: cpn.parameter( + type=List[params.string_choice(["manual", "iv", "statistics"])], + default=["manual"], + optional=False, + desc="selection method, options: {manual, binning, statistics}", + ), + select_col: cpn.parameter( + type=List[str], + default=None, + desc="list of column names to be selected, if None, all columns will be considered", ), iv_param: cpn.parameter( type=params.iv_filter_param(), @@ -105,7 +105,7 @@ def train( # temp code end # logger.info(f"input_models: {input_models}, len: {len(input_models)}") - input_iso_models = [model.read() for model in input_models] + input_iso_models = [model.read() for model in input_models] if input_models is not None else None # logger.info(f"read in input_models len: {len(input_iso_models)}; \n read in input models: {input_iso_models}") if role.is_guest: selection = HeteroSelectionModuleGuest( From be08926892985cb73d3eb881ed05d096d8714df7 Mon Sep 17 00:00:00 2001 From: mgqa34 Date: Fri, 11 Aug 2023 16:09:08 +0800 Subject: [PATCH 26/30] dataframe: fix retrieval row api by passing block_row_size Signed-off-by: mgqa34 --- python/fate/arch/dataframe/ops/_dimension_scaling.py | 2 +- python/fate/arch/dataframe/ops/_where.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/python/fate/arch/dataframe/ops/_dimension_scaling.py 
b/python/fate/arch/dataframe/ops/_dimension_scaling.py index 81822752aa..d4c4d39ea0 100644 --- a/python/fate/arch/dataframe/ops/_dimension_scaling.py +++ b/python/fate/arch/dataframe/ops/_dimension_scaling.py @@ -256,7 +256,7 @@ def _retrieval(blocks, t: torch.Tensor): if retrieval_raw_table.count() == 0: return df.empty_frame() - partition_order_mappings = get_partition_order_by_raw_table(retrieval_raw_table) + partition_order_mappings = get_partition_order_by_raw_table(retrieval_raw_table, df.data_manager.block_row_size) to_blocks_func = functools.partial(to_blocks, dm=df.data_manager, partition_mappings=partition_order_mappings) block_table = retrieval_raw_table.mapPartitions(to_blocks_func, use_previous_behavior=False) diff --git a/python/fate/arch/dataframe/ops/_where.py b/python/fate/arch/dataframe/ops/_where.py index b04aaea9d2..e3e3308c51 100644 --- a/python/fate/arch/dataframe/ops/_where.py +++ b/python/fate/arch/dataframe/ops/_where.py @@ -22,16 +22,6 @@ def where(df: DataFrame, other: DataFrame): - """ - df[mask]触发该操作 - a. mask的列可能于df不一致,这个时候,df在mask中不出现的列均为nan - (1) columns完全对等 - (2) columns一致,但顺序不一致 - (3) mask columns数少于df columns数 - b. 当mask中某一列有false的时候,需要考虑类型问题:如果原类型为int/bool等,需要上升为float32,如果为float32,保持不变 - (1) mask 计算哪些列出现False,提前做列类型对齐 - c. 
要求df与mask的key是一致的 - """ if df.shape[0] != other.shape[0]: raise ValueError("Row numbers should be identical.") From b643c22b9dfc2a501f6307197c2c60f2dfdd2fe1 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Fri, 11 Aug 2023 18:03:44 +0800 Subject: [PATCH 27/30] edit pipeline examples(#5008) Signed-off-by: Yu Wu --- .../lr/default_credit_config.yaml | 2 +- .../pipeline/coordinated_linr/test_linr_cv.py | 2 +- .../coordinated_linr/test_linr_warm_start.py | 6 +- .../pipeline/coordinated_lr/test_lr_cv.py | 2 +- .../coordinated_lr/test_lr_multi_class.py | 2 +- .../coordinated_lr/test_lr_multi_host.py | 2 +- .../coordinated_lr/test_lr_validate.py | 3 +- .../coordinated_lr/test_lr_warm_start.py | 6 +- .../pipeline/data_split/test_data_split.py | 2 +- .../data_split/test_data_split_stratified.py | 3 +- .../scale_testsuite.yaml | 0 .../test_scale_min_max.py | 0 .../test_scale_standard.py | 0 .../test_scale_w_lr.py | 0 .../test_feature_binning_asymmetric.py | 4 +- .../test_feature_binning_bucket.py | 4 +- .../test_feature_binning_quantile.py | 4 +- .../test_feature_selection_binning.py | 2 +- .../test_feature_selection_manual.py | 2 +- .../test_feature_selection_multi_model.py | 3 +- .../test_feature_selection_statistics.py | 2 +- examples/pipeline/multi_model/test_multi.py | 2 +- .../multi_model/test_multi_preprocessing.py | 113 ++++++++++++++++++ examples/pipeline/sample/test_sample.py | 2 +- .../pipeline/sample/test_sample_unilateral.py | 2 +- 25 files changed, 142 insertions(+), 28 deletions(-) rename examples/pipeline/{scale => feature_scale}/scale_testsuite.yaml (100%) rename examples/pipeline/{scale => feature_scale}/test_scale_min_max.py (100%) rename examples/pipeline/{scale => feature_scale}/test_scale_standard.py (100%) rename examples/pipeline/{scale => feature_scale}/test_scale_w_lr.py (100%) create mode 100644 examples/pipeline/multi_model/test_multi_preprocessing.py diff --git a/examples/benchmark_quality/lr/default_credit_config.yaml 
b/examples/benchmark_quality/lr/default_credit_config.yaml index dacc80dcd2..07144a2426 100644 --- a/examples/benchmark_quality/lr/default_credit_config.yaml +++ b/examples/benchmark_quality/lr/default_credit_config.yaml @@ -2,7 +2,7 @@ data_guest: "default_credit_hetero_guest" data_host: "default_credit_hetero_host" idx: "id" label_name: "y" -epochs: 20 +epochs: 30 init_param: fit_intercept: True method: "zeros" diff --git a/examples/pipeline/coordinated_linr/test_linr_cv.py b/examples/pipeline/coordinated_linr/test_linr_cv.py index ed33e0556a..082c516ab8 100644 --- a/examples/pipeline/coordinated_linr/test_linr_cv.py +++ b/examples/pipeline/coordinated_linr/test_linr_cv.py @@ -50,7 +50,7 @@ def main(config="../config.yaml", namespace=""): pipeline.add_task(psi_0) pipeline.add_task(linr_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() diff --git a/examples/pipeline/coordinated_linr/test_linr_warm_start.py b/examples/pipeline/coordinated_linr/test_linr_warm_start.py index 30f887254c..4caf3a2c20 100644 --- a/examples/pipeline/coordinated_linr/test_linr_warm_start.py +++ b/examples/pipeline/coordinated_linr/test_linr_warm_start.py @@ -76,12 +76,12 @@ def main(config="../config.yaml", namespace=""): pipeline.add_task(evaluation_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() - print(f"linr_1 model: {pipeline.get_task_info('linr_1').get_output_model()}") + # print(f"linr_1 model: {pipeline.get_task_info('linr_1').get_output_model()}") # print(f"train linr_1 data: {pipeline.get_task_info('linr_1').get_output_data()}") - print(f"linr_2 model: {pipeline.get_task_info('linr_2').get_output_model()}") + # print(f"linr_2 model: {pipeline.get_task_info('linr_2').get_output_model()}") # print(f"train linr_2 data: {pipeline.get_task_info('linr_2').get_output_data()}") diff --git a/examples/pipeline/coordinated_lr/test_lr_cv.py b/examples/pipeline/coordinated_lr/test_lr_cv.py index 
b981f005e1..bcd23a9b44 100644 --- a/examples/pipeline/coordinated_lr/test_lr_cv.py +++ b/examples/pipeline/coordinated_lr/test_lr_cv.py @@ -50,7 +50,7 @@ def main(config="../config.yaml", namespace=""): pipeline.add_task(psi_0) pipeline.add_task(lr_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() diff --git a/examples/pipeline/coordinated_lr/test_lr_multi_class.py b/examples/pipeline/coordinated_lr/test_lr_multi_class.py index 9ce85fe3d9..7709532ee1 100644 --- a/examples/pipeline/coordinated_lr/test_lr_multi_class.py +++ b/examples/pipeline/coordinated_lr/test_lr_multi_class.py @@ -61,7 +61,7 @@ def main(config="../config.yaml", namespace=""): pipeline.add_task(evaluation_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() pipeline.deploy([psi_0, lr_0]) diff --git a/examples/pipeline/coordinated_lr/test_lr_multi_host.py b/examples/pipeline/coordinated_lr/test_lr_multi_host.py index a94ff8afcf..1470fed40e 100644 --- a/examples/pipeline/coordinated_lr/test_lr_multi_host.py +++ b/examples/pipeline/coordinated_lr/test_lr_multi_host.py @@ -60,7 +60,7 @@ def main(config="../config.yaml", namespace=""): pipeline.add_task(evaluation_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() pipeline.deploy([psi_0, lr_0]) diff --git a/examples/pipeline/coordinated_lr/test_lr_validate.py b/examples/pipeline/coordinated_lr/test_lr_validate.py index 19c44e3903..a0d3b90179 100644 --- a/examples/pipeline/coordinated_lr/test_lr_validate.py +++ b/examples/pipeline/coordinated_lr/test_lr_validate.py @@ -62,11 +62,12 @@ def main(config="../config.yaml", namespace=""): input_data=lr_0.outputs["train_output_data"]) pipeline.add_task(psi_0) + pipeline.add_task(data_split_0) pipeline.add_task(lr_0) pipeline.add_task(evaluation_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() diff --git 
a/examples/pipeline/coordinated_lr/test_lr_warm_start.py b/examples/pipeline/coordinated_lr/test_lr_warm_start.py index 5e554e837f..8f12b5967a 100644 --- a/examples/pipeline/coordinated_lr/test_lr_warm_start.py +++ b/examples/pipeline/coordinated_lr/test_lr_warm_start.py @@ -76,12 +76,12 @@ def main(config="../config.yaml", namespace=""): pipeline.add_task(evaluation_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() - print(f"lr_1 model: {pipeline.get_task_info('lr_1').get_output_model()}") + # print(f"lr_1 model: {pipeline.get_task_info('lr_1').get_output_model()}") # print(f"train lr_1 data: {pipeline.get_task_info('lr_1').get_output_data()}") - print(f"lr_2 model: {pipeline.get_task_info('lr_2').get_output_model()}") + # print(f"lr_2 model: {pipeline.get_task_info('lr_2').get_output_model()}") # print(f"train lr_2 data: {pipeline.get_task_info('lr_2').get_output_data()}") diff --git a/examples/pipeline/data_split/test_data_split.py b/examples/pipeline/data_split/test_data_split.py index 484d26fa93..ee3357fb92 100644 --- a/examples/pipeline/data_split/test_data_split.py +++ b/examples/pipeline/data_split/test_data_split.py @@ -65,7 +65,7 @@ def main(config="../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() # print(pipeline.get_task_info("data_split_0").get_output_data()) diff --git a/examples/pipeline/data_split/test_data_split_stratified.py b/examples/pipeline/data_split/test_data_split_stratified.py index 647d42ad63..f01aa488dc 100644 --- a/examples/pipeline/data_split/test_data_split_stratified.py +++ b/examples/pipeline/data_split/test_data_split_stratified.py @@ -46,7 +46,6 @@ def main(config="../config.yaml", namespace=""): psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", namespace=f"experiment{namespace}")) - Linear: 0.7 data_split_0 = 
DataSplit("data_split_0", train_size=0.6, validate_size=0.0, @@ -68,7 +67,7 @@ def main(config="../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() # print(pipeline.get_task_info("data_split_0").get_output_data()) diff --git a/examples/pipeline/scale/scale_testsuite.yaml b/examples/pipeline/feature_scale/scale_testsuite.yaml similarity index 100% rename from examples/pipeline/scale/scale_testsuite.yaml rename to examples/pipeline/feature_scale/scale_testsuite.yaml diff --git a/examples/pipeline/scale/test_scale_min_max.py b/examples/pipeline/feature_scale/test_scale_min_max.py similarity index 100% rename from examples/pipeline/scale/test_scale_min_max.py rename to examples/pipeline/feature_scale/test_scale_min_max.py diff --git a/examples/pipeline/scale/test_scale_standard.py b/examples/pipeline/feature_scale/test_scale_standard.py similarity index 100% rename from examples/pipeline/scale/test_scale_standard.py rename to examples/pipeline/feature_scale/test_scale_standard.py diff --git a/examples/pipeline/scale/test_scale_w_lr.py b/examples/pipeline/feature_scale/test_scale_w_lr.py similarity index 100% rename from examples/pipeline/scale/test_scale_w_lr.py rename to examples/pipeline/feature_scale/test_scale_w_lr.py diff --git a/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py b/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py index 2662615184..bd48a35729 100644 --- a/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py +++ b/examples/pipeline/hetero_feature_binning/test_feature_binning_asymmetric.py @@ -59,10 +59,10 @@ def main(config="../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() - print(pipeline.get_task_info("binning_1").get_output_model()) + # 
print(pipeline.get_task_info("binning_1").get_output_model()) # print(pipeline.get_task_info("feature_scale_1").get_output_model()) pipeline.deploy([psi_0, binning_0]) diff --git a/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py b/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py index fae56d4dc4..34223dcc94 100644 --- a/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py +++ b/examples/pipeline/hetero_feature_binning/test_feature_binning_bucket.py @@ -63,10 +63,10 @@ def main(config="../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() - print(pipeline.get_task_info("binning_0").get_output_model()) + # print(pipeline.get_task_info("binning_0").get_output_model()) # print(pipeline.get_task_info("feature_scale_1").get_output_model()) pipeline.deploy([psi_0, binning_0]) diff --git a/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py b/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py index 727f622089..e1dc37525d 100644 --- a/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py +++ b/examples/pipeline/hetero_feature_binning/test_feature_binning_quantile.py @@ -58,10 +58,10 @@ def main(config="../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() - print(pipeline.get_task_info("binning_1").get_output_model()) + # print(pipeline.get_task_info("binning_1").get_output_model()) # print(pipeline.get_task_info("feature_scale_1").get_output_model()) pipeline.deploy([psi_0, binning_0]) diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py index 0e969544f9..d639fc63eb 100644 --- 
a/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_binning.py @@ -57,7 +57,7 @@ def main(config=".../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() # print(pipeline.get_task_info("feature_scale_1").get_output_model()) diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py index 722bb36c18..a278387dca 100644 --- a/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_manual.py @@ -50,7 +50,7 @@ def main(config=".../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() # print(pipeline.get_task_info("feature_scale_1").get_output_model()) diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py b/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py index b0dc8440ea..48186c182e 100644 --- a/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_multi_model.py @@ -59,11 +59,12 @@ def main(config=".../config.yaml", namespace=""): pipeline.add_task(psi_0) pipeline.add_task(binning_0) + pipeline.add_task(statistics_0) pipeline.add_task(selection_0) # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() # print(pipeline.get_task_info("feature_scale_1").get_output_model()) diff --git a/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py 
b/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py index bb3a3c9839..48ffe32a5c 100644 --- a/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py +++ b/examples/pipeline/hetero_feature_selection/test_feature_selection_statistics.py @@ -53,7 +53,7 @@ def main(config=".../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() # print(pipeline.get_task_info("feature_scale_1").get_output_model()) diff --git a/examples/pipeline/multi_model/test_multi.py b/examples/pipeline/multi_model/test_multi.py index 25f2a4b9d2..c069212d17 100644 --- a/examples/pipeline/multi_model/test_multi.py +++ b/examples/pipeline/multi_model/test_multi.py @@ -99,7 +99,7 @@ def main(config="../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() # print(pipeline.get_task_info("feature_scale_1").get_output_model()) diff --git a/examples/pipeline/multi_model/test_multi_preprocessing.py b/examples/pipeline/multi_model/test_multi_preprocessing.py new file mode 100644 index 0000000000..c7a9e77711 --- /dev/null +++ b/examples/pipeline/multi_model/test_multi_preprocessing.py @@ -0,0 +1,113 @@ +# +# Copyright 2019 The FATE Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +from fate_client.pipeline import FateFlowPipeline +from fate_client.pipeline.components.fate import DataSplit, PSI, Sample, FeatureScale +from fate_client.pipeline.interface import DataWarehouseChannel +from fate_client.pipeline.utils import test_utils + + +def main(config="../config.yaml", namespace=""): + if isinstance(config, str): + config = test_utils.load_job_config(config) + parties = config.parties + guest = parties.guest[0] + host = parties.host[0] + + pipeline = FateFlowPipeline().set_roles(guest=guest, host=host) + if config.task_cores: + pipeline.conf.set("task_cores", config.task_cores) + if config.timeout: + pipeline.conf.set("timeout", config.timeout) + + psi_0 = PSI("psi_0") + psi_0.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_0.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + psi_1 = PSI("psi_1") + psi_1.guest.component_setting(input_data=DataWarehouseChannel(name="breast_hetero_guest", + namespace=f"experiment{namespace}")) + psi_1.hosts[0].component_setting(input_data=DataWarehouseChannel(name="breast_hetero_host", + namespace=f"experiment{namespace}")) + + data_split_0 = DataSplit("data_split_0", + train_size=0.6, + validate_size=0.0, + test_size=0.4, + stratified=True, + input_data=psi_0.outputs["output_data"]) + + data_split_1 = DataSplit("data_split_1", + train_size=200, + test_size=50, + stratified=True, + input_data=psi_0.outputs["output_data"] + ) + + sample_0 = Sample("sample_0", + frac={0: 0.5}, + replace=False, + hetero_sync=True, + input_data=psi_0.outputs["output_data"]) + + sample_1 = Sample("sample_1", + n=100, + replace=False, + hetero_sync=True, + input_data=psi_0.outputs["output_data"] + ) + feature_scale_0 = FeatureScale("feature_scale_0", + method="min_max", + feature_range={"x0": [-1, 1]}, + scale_col=["x0", "x1", "x3"], + 
train_data=psi_0.outputs["output_data"]) + pipeline.add_task(psi_0) + pipeline.add_task(psi_1) + pipeline.add_task(data_split_0) + pipeline.add_task(data_split_1) + pipeline.add_task(sample_0) + pipeline.add_task(sample_1) + pipeline.add_task(feature_scale_0) + + # pipeline.add_task(hetero_feature_binning_0) + pipeline.compile() + # print(pipeline.get_dag()) + pipeline.fit() + + # print(pipeline.get_task_info("data_split_0").get_output_data()) + """output_data = pipeline.get_task_info("data_split_0").get_output_data() + import pandas as pd + + print(f"data split 0 train size: {pd.DataFrame(output_data['train_output_data']).shape};" + f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" + f"test size: {pd.DataFrame(output_data['test_output_data']).shape}") + output_data = pipeline.get_task_info("data_split_1").get_output_data() + print(f"data split 1train size: {pd.DataFrame(output_data['train_output_data']).shape};" + f"validate size: {pd.DataFrame(output_data['validate_output_data']).shape}" + f"test size: {pd.DataFrame(output_data['test_output_data']).shape}")""" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("PIPELINE DEMO") + parser.add_argument("--config", type=str, default="../config.yaml", + help="config file") + parser.add_argument("--namespace", type=str, default="", + help="namespace for data stored in FATE") + args = parser.parse_args() + main(config=args.config, namespace=args.namespace) diff --git a/examples/pipeline/sample/test_sample.py b/examples/pipeline/sample/test_sample.py index 86fbf04a97..0cbea77bbe 100644 --- a/examples/pipeline/sample/test_sample.py +++ b/examples/pipeline/sample/test_sample.py @@ -65,7 +65,7 @@ def main(config="../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() diff --git a/examples/pipeline/sample/test_sample_unilateral.py 
b/examples/pipeline/sample/test_sample_unilateral.py index 8bdc9b3bef..643a14e60f 100644 --- a/examples/pipeline/sample/test_sample_unilateral.py +++ b/examples/pipeline/sample/test_sample_unilateral.py @@ -66,7 +66,7 @@ def main(config="../config.yaml", namespace=""): # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) + # print(pipeline.get_dag()) pipeline.fit() From 8afbae498f6d8c9d3bf467a234629bd5c41427cf Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 14 Aug 2023 15:52:10 +0800 Subject: [PATCH 28/30] fix median & allow quantile in statistics(#4663) edit lr bq examples(#5008) Signed-off-by: Yu Wu --- .../lr/default_credit_config.yaml | 8 ++--- .../lr/default_credit_lr_sklearn_config.yaml | 2 +- .../lr/give_credit_config.yaml | 2 +- .../benchmark_quality/lr/lr_benchmark.yaml | 36 +++++++++---------- .../lr/pipeline-lr-binary.py | 1 - .../benchmark_quality/lr/pipeline-lr-multi.py | 1 - .../benchmark_quality/lr/sklearn-lr-binary.py | 2 +- .../pipeline/statistics/test_statistics.py | 4 +-- .../fate/components/components/statistics.py | 7 ++-- .../fate/components/core/params/__init__.py | 2 +- .../fate/components/core/params/_metrics.py | 24 ++++++++++++- python/fate/ml/statistics/statistics.py | 23 +++++++++--- 12 files changed, 75 insertions(+), 37 deletions(-) diff --git a/examples/benchmark_quality/lr/default_credit_config.yaml b/examples/benchmark_quality/lr/default_credit_config.yaml index 07144a2426..97d2f7c563 100644 --- a/examples/benchmark_quality/lr/default_credit_config.yaml +++ b/examples/benchmark_quality/lr/default_credit_config.yaml @@ -2,7 +2,7 @@ data_guest: "default_credit_hetero_guest" data_host: "default_credit_hetero_host" idx: "id" label_name: "y" -epochs: 30 +epochs: 16 init_param: fit_intercept: True method: "zeros" @@ -15,8 +15,8 @@ learning_rate_scheduler: optimizer: method: "rmsprop" penalty: "L2" - alpha: 0.001 + alpha: 0.01 optimizer_params: - lr: 0.17 -batch_size: 3200 + lr: 0.22 +batch_size: 
2000 early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/default_credit_lr_sklearn_config.yaml b/examples/benchmark_quality/lr/default_credit_lr_sklearn_config.yaml index e1dd4f6932..73ce767d18 100644 --- a/examples/benchmark_quality/lr/default_credit_lr_sklearn_config.yaml +++ b/examples/benchmark_quality/lr/default_credit_lr_sklearn_config.yaml @@ -7,5 +7,5 @@ fit_intercept: True method: "rmsprop" penalty: "L2" eta0: 0.1 -alpha: 0.5 +alpha: 0.05 batch_size: 5000 \ No newline at end of file diff --git a/examples/benchmark_quality/lr/give_credit_config.yaml b/examples/benchmark_quality/lr/give_credit_config.yaml index f5e47fcc76..6f8656132b 100644 --- a/examples/benchmark_quality/lr/give_credit_config.yaml +++ b/examples/benchmark_quality/lr/give_credit_config.yaml @@ -17,5 +17,5 @@ optimizer: alpha: 0.01 optimizer_params: lr: 0.25 -batch_size: 5500 +batch_size: null early_stop: "diff" \ No newline at end of file diff --git a/examples/benchmark_quality/lr/lr_benchmark.yaml b/examples/benchmark_quality/lr/lr_benchmark.yaml index 63cb2603bd..1dc428bbdc 100644 --- a/examples/benchmark_quality/lr/lr_benchmark.yaml +++ b/examples/benchmark_quality/lr/lr_benchmark.yaml @@ -206,21 +206,21 @@ hetero_lr-binary-1-default-credit: # conf: "./epsilon_5k_config.yaml" # compare_setting: # relative_tol: 0.01 -hetero_lr-binary-3-give-credit: - local: - script: "./sklearn-lr-binary.py" - conf: "./give_credit_lr_sklearn_config.yaml" - FATE-hetero-lr: - script: "./pipeline-lr-binary.py" - conf: "./give_credit_config.yaml" - compare_setting: - relative_tol: 0.01 -multi-vehicle: - local: - script: "./sklearn-lr-multi.py" - conf: "./vehicle_lr_sklearn_config.yaml" - FATE-hetero-lr: - script: "./pipeline-lr-multi.py" - conf: "./vehicle_config.yaml" - compare_setting: - relative_tol: 0.01 +#hetero_lr-binary-3-give-credit: +# local: +# script: "./sklearn-lr-binary.py" +# conf: "./give_credit_lr_sklearn_config.yaml" +# FATE-hetero-lr: +# script: 
"./pipeline-lr-binary.py" +# conf: "./give_credit_config.yaml" +# compare_setting: +# relative_tol: 0.01 +#multi-vehicle: +# local: +# script: "./sklearn-lr-multi.py" +# conf: "./vehicle_lr_sklearn_config.yaml" +# FATE-hetero-lr: +# script: "./pipeline-lr-multi.py" +# conf: "./vehicle_config.yaml" +# compare_setting: +# relative_tol: 0.01 diff --git a/examples/benchmark_quality/lr/pipeline-lr-binary.py b/examples/benchmark_quality/lr/pipeline-lr-binary.py index 9b41bbe612..fceacd020f 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-binary.py +++ b/examples/benchmark_quality/lr/pipeline-lr-binary.py @@ -87,7 +87,6 @@ def main(config="../../config.yaml", param="./breast_config.yaml", namespace="") if config.timeout: pipeline.conf.set("timeout", config.timeout) pipeline.compile() - print(pipeline.get_dag()) pipeline.fit() lr_0_data = pipeline.get_task_info("lr_0").get_output_data()["train_output_data"] diff --git a/examples/benchmark_quality/lr/pipeline-lr-multi.py b/examples/benchmark_quality/lr/pipeline-lr-multi.py index 463d3cc91a..aff7c32a36 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-multi.py +++ b/examples/benchmark_quality/lr/pipeline-lr-multi.py @@ -85,7 +85,6 @@ def main(config="../../config.yaml", param="./vehicle_config.yaml", namespace="" pipeline.conf.set("timeout", config.timeout) pipeline.compile() - print(pipeline.get_dag()) pipeline.fit() lr_0_data = pipeline.get_component("lr_0").get_output_data()["train_output_data"] diff --git a/examples/benchmark_quality/lr/sklearn-lr-binary.py b/examples/benchmark_quality/lr/sklearn-lr-binary.py index 2a2710be2f..058b2d79fc 100644 --- a/examples/benchmark_quality/lr/sklearn-lr-binary.py +++ b/examples/benchmark_quality/lr/sklearn-lr-binary.py @@ -76,7 +76,7 @@ def main(config="../../config.yaml", param="./breast_lr_sklearn_config.yaml"): fpr, tpr, thresholds = roc_curve(y_test, y_prob) ks = max(tpr - fpr) - result = {"auc": auc_score, "recall": recall, "binary_precision": pr, "accuracy": acc} + 
result = {"auc": auc_score, "recall": recall, "precision": pr, "accuracy": acc} print(result) print(f"coef_: {lm_fit.coef_}, intercept_: {lm_fit.intercept_}, n_iter: {lm_fit.n_iter_}") return {}, result diff --git a/examples/pipeline/statistics/test_statistics.py b/examples/pipeline/statistics/test_statistics.py index e5e7605856..2f7bb18051 100644 --- a/examples/pipeline/statistics/test_statistics.py +++ b/examples/pipeline/statistics/test_statistics.py @@ -40,15 +40,15 @@ def main(config=".../config.yaml", namespace=""): namespace=f"experiment{namespace}")) statistics_0 = Statistics("statistics_0", input_data=psi_0.outputs["output_data"], - metrics=["mean", "std", "min", "max"]) + metrics=["mean", "std", "min", "max", "25%", "median", "75%"]) pipeline.add_task(psi_0) pipeline.add_task(statistics_0) # pipeline.add_task(hetero_feature_binning_0) pipeline.compile() - print(pipeline.get_dag()) pipeline.fit() + # print(f"statistics_0 output model: {pipeline.get_task_info('statistics_0').get_output_model()}") if __name__ == "__main__": diff --git a/python/fate/components/components/statistics.py b/python/fate/components/components/statistics.py index 2bf3661a75..5224b01609 100644 --- a/python/fate/components/components/statistics.py +++ b/python/fate/components/components/statistics.py @@ -25,7 +25,8 @@ def statistics( role: Role, input_data: cpn.dataframe_input(roles=[GUEST, HOST]), metrics: cpn.parameter( - type=Union[List[params.statistic_metrics_param()], params.statistic_metrics_param()], + type=Union[List[Union[params.statistic_metrics_param(), params.legal_percentile()]], + params.statistic_metrics_param(), params.legal_percentile()], default=["mean", "std", "min", "max"], desc="metrics to be computed, default ['count', 'mean', 'std', 'min', 'max']", ), @@ -37,6 +38,8 @@ def statistics( default=True, desc="If False, the calculations of skewness and kurtosis are corrected for statistical bias.", ), + relative_error: cpn.parameter(type=params.confloat(gt=0, le=1), 
default=1e-3, + desc="float, error rate for quantile"), skip_col: cpn.parameter( type=List[str], default=None, @@ -60,7 +63,7 @@ def statistics( for metric in metrics: if metric == "describe": raise ValueError(f"'describe' should not be combined with additional metric names.") - stat_computer = FeatureStatistics(list(set(metrics)), ddof, bias) + stat_computer = FeatureStatistics(list(set(metrics)), ddof, bias, relative_error) input_data = input_data[select_cols] stat_computer.fit(sub_ctx, input_data) diff --git a/python/fate/components/core/params/__init__.py b/python/fate/components/core/params/__init__.py index 40b0d629ad..4e9fdf8bfc 100644 --- a/python/fate/components/core/params/__init__.py +++ b/python/fate/components/core/params/__init__.py @@ -27,6 +27,6 @@ ) from ._init_param import InitParam, init_param from ._learning_rate import LRSchedulerParam, lr_scheduler_param -from ._metrics import metrics_param, statistic_metrics_param +from ._metrics import metrics_param, statistic_metrics_param, legal_percentile from ._optimizer import OptimizerParam, optimizer_param from ._penalty import penalty_param diff --git a/python/fate/components/core/params/_metrics.py b/python/fate/components/core/params/_metrics.py index c911fc0707..e336e1da2c 100644 --- a/python/fate/components/core/params/_metrics.py +++ b/python/fate/components/core/params/_metrics.py @@ -13,9 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import re from typing import Type -from ._fields import StringChoice +from ._fields import StringChoice, Parameter class Metrics(StringChoice): @@ -68,3 +69,24 @@ def metrics_param(auc=True, ks=True, accuracy=True, mse=True) -> Type[str]: choice={k for k, v in choice.items() if v}, ) return type("Metrics", (Metrics,), namespace) + + +class LegalPercentile(str, Parameter): + legal_percentile = r"^(100)|(?:[05]|[0-9]?[05])0*%$" + + @classmethod + def __get_validators__(cls): + yield cls.percentile_validator + + @classmethod + def percentile_validator(cls, v): + if re.match(cls.legal_percentile, v): + return v + raise ValueError(f"provided `{v}` not in legal percentile format") + + +def legal_percentile() -> Type[str]: + namespace = dict( + legal_percentile=LegalPercentile.legal_percentile, + ) + return type("LegalPercentile", (LegalPercentile,), namespace) diff --git a/python/fate/ml/statistics/statistics.py b/python/fate/ml/statistics/statistics.py index 7015756dd6..232d8d6ba2 100644 --- a/python/fate/ml/statistics/statistics.py +++ b/python/fate/ml/statistics/statistics.py @@ -14,6 +14,7 @@ # limitations under the License. 
import logging +import re from typing import List import pandas as pd @@ -25,9 +26,9 @@ class FeatureStatistics(Module): - def __init__(self, metrics: List[str] = None, ddof=1, bias=True): + def __init__(self, metrics: List[str] = None, ddof=1, bias=True, relative_error=1e-3): self.metrics = metrics - self.summary = StatisticsSummary(ddof, bias) + self.summary = StatisticsSummary(ddof, bias, relative_error) def fit(self, ctx: Context, input_data, validate_data=None) -> None: self.summary.compute_metrics(input_data, self.metrics) @@ -49,7 +50,7 @@ def from_model(cls, model) -> "FeatureStatistics": class StatisticsSummary(Module): - def __init__(self, ddof=1, bias=True): + def __init__(self, ddof=1, bias=True, relative_error=1e-3): """if metrics is not None: if len(metrics) == 1 and metrics[0] == "describe": self.inner_metric_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] @@ -57,20 +58,31 @@ def __init__(self, ddof=1, bias=True): self.inner_metric_names = metrics""" self.ddof = ddof self.bias = bias + self.relative_error = relative_error self.inner_metric_names = [] self.metrics_summary = None self._count = None self._nan_count = None self._mean = None self._describe = None + self._quantile = None + self._q_pts = None def get_from_describe(self, data, metric): if self._describe is None: self._describe = data.describe(ddof=self.ddof, unbiased=~self.bias) return self._describe[metric] + def get_from_quantile_summary(self, data, metric): + query_q = int(metric[:-1]) / 100 + if self._quantile is None: + self._quantile = data.quantile(q=self._q_pts, relative_error=self.relative_error) + return self._quantile.loc[query_q] + def compute_metrics(self, data, metrics): res = pd.DataFrame(columns=data.schema.columns) + q_metrics = [metric for metric in metrics if re.match(r"^(100|\d{1,2})%$", metric)] + self._q_pts = [int(metric[:-1]) / 100 for metric in q_metrics] for metric in metrics: metric_val = None """if metric == "describe": @@ -80,12 +92,15 @@ def 
compute_metrics(self, data, metrics): return""" if metric in ["sum", "min", "max", "mean", "std", "var"]: metric_val = self.get_from_describe(data, metric) + if metric in q_metrics: + metric_val = self.get_from_quantile_summary(data, metric) elif metric == "count": if self._count is None: self._count = data.count() metric_val = self._count elif metric == "median": - metric_val = data.median() + metric_val = data.quantile(q=0.5, relative_error=self.relative_error) + metric_val = metric_val.loc[0.5] elif metric == "coefficient_of_variation": metric_val = self.get_from_describe(data, "variation") elif metric == "missing_count": From c1f4b498bec1bd7bd3b98abf0b5c5697c997f127 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 14 Aug 2023 17:55:20 +0800 Subject: [PATCH 29/30] batch loader use sorted indexer for default edit bq examples(#5008) Signed-off-by: Yu Wu --- .../lr/breast_lr_sklearn_config.yaml | 2 +- .../benchmark_quality/lr/lr_benchmark.yaml | 54 +++++++++---------- .../fate/arch/dataframe/utils/_dataloader.py | 2 +- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml b/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml index 2993795c78..e7fc0c17d4 100644 --- a/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml +++ b/examples/benchmark_quality/lr/breast_lr_sklearn_config.yaml @@ -7,5 +7,5 @@ fit_intercept: True method: "rmsprop" penalty: "L2" eta0: 0.1 -alpha: 0.5 +alpha: 0.05 batch_size: 5000 \ No newline at end of file diff --git a/examples/benchmark_quality/lr/lr_benchmark.yaml b/examples/benchmark_quality/lr/lr_benchmark.yaml index 1dc428bbdc..a26fa9a757 100644 --- a/examples/benchmark_quality/lr/lr_benchmark.yaml +++ b/examples/benchmark_quality/lr/lr_benchmark.yaml @@ -179,15 +179,15 @@ data: table_name: vehicle_scale_hetero_host namespace: experiment role: host_0 -#hetero_lr-binary-0-breast: -# local: -# script: "./sklearn-lr-binary.py" -# conf: 
"./breast_lr_sklearn_config.yaml" -# FATE-hetero-lr: -# script: "./pipeline-lr-binary.py" -# conf: "./breast_config.yaml" -# compare_setting: -# relative_tol: 0.01 +hetero_lr-binary-0-breast: + local: + script: "./sklearn-lr-binary.py" + conf: "./breast_lr_sklearn_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./breast_config.yaml" + compare_setting: + relative_tol: 0.01 hetero_lr-binary-1-default-credit: local: script: "./sklearn-lr-binary.py" @@ -197,24 +197,24 @@ hetero_lr-binary-1-default-credit: conf: "./default_credit_config.yaml" compare_setting: relative_tol: 0.01 -#hetero_lr-binary-2-epsilon-5k: -# local: -# script: "./sklearn-lr-binary.py" -# conf: "./epsilon_5k_lr_sklearn_config.yaml" -# FATE-hetero-lr: -# script: "./pipeline-lr-binary.py" -# conf: "./epsilon_5k_config.yaml" -# compare_setting: -# relative_tol: 0.01 -#hetero_lr-binary-3-give-credit: -# local: -# script: "./sklearn-lr-binary.py" -# conf: "./give_credit_lr_sklearn_config.yaml" -# FATE-hetero-lr: -# script: "./pipeline-lr-binary.py" -# conf: "./give_credit_config.yaml" -# compare_setting: -# relative_tol: 0.01 +hetero_lr-binary-2-epsilon-5k: + local: + script: "./sklearn-lr-binary.py" + conf: "./epsilon_5k_lr_sklearn_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./epsilon_5k_config.yaml" + compare_setting: + relative_tol: 0.01 +hetero_lr-binary-3-give-credit: + local: + script: "./sklearn-lr-binary.py" + conf: "./give_credit_lr_sklearn_config.yaml" + FATE-hetero-lr: + script: "./pipeline-lr-binary.py" + conf: "./give_credit_config.yaml" + compare_setting: + relative_tol: 0.01 #multi-vehicle: # local: # script: "./sklearn-lr-multi.py" diff --git a/python/fate/arch/dataframe/utils/_dataloader.py b/python/fate/arch/dataframe/utils/_dataloader.py index d984dcf92f..f22fd3893a 100644 --- a/python/fate/arch/dataframe/utils/_dataloader.py +++ b/python/fate/arch/dataframe/utils/_dataloader.py @@ -124,7 +124,7 @@ def _prepare(self): indexer = 
sorted(list(self._dataset.get_indexer(target="sample_id").collect())) if self._shuffle: random.seed = self._random_state - random.shuffle(indexer) + random.shuffle(indexer) for i, iter_ctx in self._ctx.sub_ctx("dataloader_batch").ctxs_range(self._batch_num): batch_indexer = indexer[self._batch_size * i: self._batch_size * (i + 1)] From 65e8a859ae22ec649bc088b58180eb3a189f5263 Mon Sep 17 00:00:00 2001 From: Yu Wu Date: Mon, 14 Aug 2023 18:25:05 +0800 Subject: [PATCH 30/30] edit bq examples(#5008) Signed-off-by: Yu Wu --- examples/benchmark_quality/lr/pipeline-lr-multi.py | 1 + examples/benchmark_quality/lr/sklearn-lr-binary.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/benchmark_quality/lr/pipeline-lr-multi.py b/examples/benchmark_quality/lr/pipeline-lr-multi.py index aff7c32a36..b5401c1122 100644 --- a/examples/benchmark_quality/lr/pipeline-lr-multi.py +++ b/examples/benchmark_quality/lr/pipeline-lr-multi.py @@ -74,6 +74,7 @@ def main(config="../../config.yaml", param="./vehicle_config.yaml", namespace="" runtime_roles=['guest'], label_column_name=param.get("label_name"), input_data=lr_0.outputs["train_output_data"], + predict_column_name='predict_result', metrics=['multi_recall', 'multi_accuracy', 'multi_precision']) pipeline.add_task(psi_0) pipeline.add_task(lr_0) diff --git a/examples/benchmark_quality/lr/sklearn-lr-binary.py b/examples/benchmark_quality/lr/sklearn-lr-binary.py index 058b2d79fc..51e463df94 100644 --- a/examples/benchmark_quality/lr/sklearn-lr-binary.py +++ b/examples/benchmark_quality/lr/sklearn-lr-binary.py @@ -78,7 +78,7 @@ def main(config="../../config.yaml", param="./breast_lr_sklearn_config.yaml"): ks = max(tpr - fpr) result = {"auc": auc_score, "recall": recall, "precision": pr, "accuracy": acc} print(result) - print(f"coef_: {lm_fit.coef_}, intercept_: {lm_fit.intercept_}, n_iter: {lm_fit.n_iter_}") + # print(f"coef_: {lm_fit.coef_}, intercept_: {lm_fit.intercept_}, n_iter: {lm_fit.n_iter_}") return {}, 
result