From 468e0b66f558086f6a5c1843bf66e0b12375ff4b Mon Sep 17 00:00:00 2001 From: Holger Roth <6304754+holgerroth@users.noreply.github.com> Date: Thu, 16 Feb 2023 15:41:19 -0500 Subject: [PATCH] Cifar10 split learning (#1168) run training with higher timeout value rename classes add training accuracy remove printouts add figure update printouts update readme vertical split data run psi add notebook update notebook update notebook take intersection.txt as input for split-learning configure overlap add todo refactor to use FCI update requirements formatting add validation unify gitignore revert network; remove unnecessary check; use stats pool for computation time introduce cifar10 data utils move splitnn example to vertical_federated_learning move more files deleted moved files move to tutorials address comments --- .../{cifar10-real-world => }/.gitignore | 0 examples/cifar10/cifar10-sim/.gitignore | 23 - .../cifar10/pt/utils/cifar10_data_splitter.py | 32 +- .../cifar10/pt/utils/cifar10_data_utils.py | 62 + .../vertical_federated_learning/.gitignore | 11 + .../vertical_federated_learning/README.md | 7 + .../cifar10-splitnn/README.md | 43 + .../cifar10_split_data_vertical.py | 46 + .../cifar10_split_learning.ipynb | 1493 +++++++++++++++++ .../cifar10-splitnn/figs/split_learning.svg | 1 + .../job_configs/cifar10_psi/meta.json | 9 + .../server/config/config_fed_server.json | 11 + .../site-1/config/config_fed_client.json | 34 + .../site-2/config/config_fed_client.json | 34 + .../job_configs/cifar10_splitnn/meta.json | 9 + .../server/config/config_fed_server.json | 47 + .../site-1/config/config_fed_client.json | 40 + .../site-2/config/config_fed_client.json | 40 + .../cifar10-splitnn/set_intersection_file.py | 46 + .../src/psi/cifar10_local_psi.py | 36 + .../src/splitnn/cifar10_learner_splitnn.py | 614 +++++++ .../src/splitnn/cifar10_splitnn_dataset.py | 90 + .../splitnn/cifar10_vertical_data_splitter.py | 92 + .../cifar10-splitnn/src/splitnn/split_nn.py | 38 + 
.../virtualenv/min-requirements.txt | 7 + .../cifar10-splitnn/virtualenv/set_env.sh | 6 + .../executors/splitnn_learner_executor.py | 92 + .../app_common/workflows/splitnn_workflow.py | 286 ++++ nvflare/app_opt/pt/decomposers.py | 38 + 29 files changed, 3238 insertions(+), 49 deletions(-) rename examples/cifar10/{cifar10-real-world => }/.gitignore (100%) delete mode 100644 examples/cifar10/cifar10-sim/.gitignore create mode 100644 examples/cifar10/pt/utils/cifar10_data_utils.py create mode 100644 examples/tutorial/vertical_federated_learning/.gitignore create mode 100644 examples/tutorial/vertical_federated_learning/README.md create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/README.md create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/cifar10_split_data_vertical.py create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/cifar10_split_learning.ipynb create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/figs/split_learning.svg create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/meta.json create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/server/config/config_fed_server.json create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/site-1/config/config_fed_client.json create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/site-2/config/config_fed_client.json create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/meta.json create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/server/config/config_fed_server.json create mode 100644 
examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/site-1/config/config_fed_client.json create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/site-2/config/config_fed_client.json create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/set_intersection_file.py create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/psi/cifar10_local_psi.py create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/splitnn/cifar10_learner_splitnn.py create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/splitnn/cifar10_splitnn_dataset.py create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/splitnn/cifar10_vertical_data_splitter.py create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/splitnn/split_nn.py create mode 100644 examples/tutorial/vertical_federated_learning/cifar10-splitnn/virtualenv/min-requirements.txt create mode 100755 examples/tutorial/vertical_federated_learning/cifar10-splitnn/virtualenv/set_env.sh create mode 100644 nvflare/app_common/executors/splitnn_learner_executor.py create mode 100644 nvflare/app_common/workflows/splitnn_workflow.py create mode 100644 nvflare/app_opt/pt/decomposers.py diff --git a/examples/cifar10/cifar10-real-world/.gitignore b/examples/cifar10/.gitignore similarity index 100% rename from examples/cifar10/cifar10-real-world/.gitignore rename to examples/cifar10/.gitignore diff --git a/examples/cifar10/cifar10-sim/.gitignore b/examples/cifar10/cifar10-sim/.gitignore deleted file mode 100644 index 10c10bf11b..0000000000 --- a/examples/cifar10/cifar10-sim/.gitignore +++ /dev/null @@ -1,23 +0,0 @@ -# ide -.idea/ -.ipynb_checkpoints/ - -# nvflare artifacts -log.txt -client_token.txt -*.fl -audit.log -transfer -workspaces - -# python -__pycache__ -.pyc - -# virtual environments 
-nvflare_cifar10 - -# data -dataset -dataset* -*results* diff --git a/examples/cifar10/pt/utils/cifar10_data_splitter.py b/examples/cifar10/pt/utils/cifar10_data_splitter.py index 1ca8f19171..4fc3bbc7ce 100644 --- a/examples/cifar10/pt/utils/cifar10_data_splitter.py +++ b/examples/cifar10/pt/utils/cifar10_data_splitter.py @@ -41,24 +41,12 @@ import os import numpy as np -import torchvision.datasets as datasets +from cifar10_data_utils import get_site_class_summary, load_cifar10_data from nvflare.apis.event_type import EventType from nvflare.apis.fl_component import FLComponent from nvflare.apis.fl_context import FLContext -CIFAR10_ROOT = "/tmp/cifar10" # will be used for all CIFAR-10 experiments - - -def _get_site_class_summary(train_label, site_idx): - class_sum = {} - - for site, data_idx in site_idx.items(): - unq, unq_cnt = np.unique(train_label[data_idx], return_counts=True) - tmp = {int(unq[i]): int(unq_cnt[i]) for i in range(len(unq))} - class_sum[site] = tmp - return class_sum - class Cifar10DataSplitter(FLComponent): def __init__(self, split_dir: str = None, num_sites: int = 8, alpha: float = 0.5, seed: int = 0): @@ -68,8 +56,10 @@ def __init__(self, split_dir: str = None, num_sites: int = 8, alpha: float = 0.5 self.alpha = alpha self.seed = seed + if self.split_dir is None: + raise ValueError("You need to define a valid `split_dir` when splitting the data.") if alpha < 0.0: - raise ValueError(f"Alpha should be larger 0.0 but was {alpha}!") + raise ValueError(f"Alpha should be larger or equal 0.0 but was" f" {alpha}!") def handle_event(self, event_type: str, fl_ctx: FLContext): if event_type == EventType.START_RUN: @@ -85,8 +75,6 @@ def split(self, fl_ctx: FLContext): site_idx, class_sum = self._partition_data() # write to files - if self.split_dir is None: - raise ValueError("You need to define a valid `split_dir` when splitting the data.") if not os.path.isdir(self.split_dir): os.makedirs(self.split_dir) sum_file_name = os.path.join(self.split_dir, 
"summary.txt") @@ -101,16 +89,8 @@ def split(self, fl_ctx: FLContext): site_file_name = site_file_path + str(site + 1) + ".npy" np.save(site_file_name, np.array(site_idx[site])) - def load_cifar10_data(self): - # download data - train_dataset = datasets.CIFAR10(root=CIFAR10_ROOT, train=True, download=True) - - # only training label is needed for doing split - train_label = np.array(train_dataset.targets) - return train_label - def _partition_data(self): - train_label = self.load_cifar10_data() + train_label = load_cifar10_data() min_size = 0 K = 10 @@ -140,6 +120,6 @@ def _partition_data(self): site_idx[j] = idx_batch[j] # collect class summary - class_sum = _get_site_class_summary(train_label, site_idx) + class_sum = get_site_class_summary(train_label, site_idx) return site_idx, class_sum diff --git a/examples/cifar10/pt/utils/cifar10_data_utils.py b/examples/cifar10/pt/utils/cifar10_data_utils.py new file mode 100644 index 0000000000..16e8201516 --- /dev/null +++ b/examples/cifar10/pt/utils/cifar10_data_utils.py @@ -0,0 +1,62 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This Dirichlet sampling strategy for creating a heterogeneous partition is adopted +# from FedMA (https://github.com/IBM/FedMA). 
+ +# MIT License + +# Copyright (c) 2020 International Business Machines + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +import numpy as np +import torchvision.datasets as datasets + +CIFAR10_ROOT = "/tmp/cifar10" # will be used for all CIFAR-10 experiments + + +def load_cifar10_data(): + # download data + train_dataset = datasets.CIFAR10(root=CIFAR10_ROOT, train=True, download=True) + + # only training label is needed for doing split + train_label = np.array(train_dataset.targets) + return train_label + + +def get_site_class_summary(train_label, site_idx): + class_sum = {} + + for site, data_idx in site_idx.items(): + unq, unq_cnt = np.unique(train_label[data_idx], return_counts=True) + tmp = {int(unq[i]): int(unq_cnt[i]) for i in range(len(unq))} + class_sum[site] = tmp + return class_sum diff --git a/examples/tutorial/vertical_federated_learning/.gitignore b/examples/tutorial/vertical_federated_learning/.gitignore new file mode 100644 index 0000000000..53bb793f2d --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/.gitignore @@ -0,0 +1,11 @@ +# ide +.idea/ +.ipynb_checkpoints/ + +# python +__pycache__ +.pyc + +# virtual environments +nvflare_cifar10 + diff --git a/examples/tutorial/vertical_federated_learning/README.md b/examples/tutorial/vertical_federated_learning/README.md new file mode 100644 index 0000000000..7a93372012 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/README.md @@ -0,0 +1,7 @@ +# Vertical Federated Learning + +## Split Learning +### Split learning with CIFAR-10 +This [example](./cifar10-splitnn/README.md) includes instructions on how to run +[split learning](https://arxiv.org/abs/1810.06060) using the CIFAR-10 dataset +and the FL simulator. 
diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/README.md b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/README.md new file mode 100644 index 0000000000..712fb58af5 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/README.md @@ -0,0 +1,43 @@ +# Split Learning with CIFAR-10 + +This example includes instructions on how to run [split learning](https://arxiv.org/abs/1810.06060) (SL) using the CIFAR-10 dataset and the FL simulator. + +We assume one client holds the images, and the other client holds the labels to compute losses and accuracy metrics. +Activations and corresponding gradients are being exchanged between the clients through the NVFlare server. + +![Split learning setup](./figs/split_learning.svg) + +For instructions on how to run CIFAR-10 in real-world deployment settings, +see the example on ["Real-world Federated Learning with CIFAR-10"](../../../cifar10/cifar10-real-world/README.md). + +## (Optional) Set up a virtual environment +``` +python3 -m pip install --user --upgrade pip +python3 -m pip install --user virtualenv +``` +(If needed) make all shell scripts executable using +``` +find . -name "*.sh" -exec chmod +x {} \; +``` +initialize virtual environment. +``` +source ./virtualenv/set_env.sh +``` +install required packages for training +``` +pip install --upgrade pip +pip install -r ./virtualenv/min-requirements.txt +``` + +## Start Jupyter notebook +Set `PYTHONPATH` to include custom files of this example and some reused files from the [CIFAR-10](../../../cifar10) examples: +``` +export PYTHONPATH=${PWD}/src:${PWD}/../../../cifar10 +``` +Start a Jupyter Lab +``` +jupyter lab . +``` +and open [cifar10_split_learning.ipynb](./cifar10_split_learning.ipynb). + +See [here](https://jupyterlab.readthedocs.io/en/stable/getting_started/installation.html) for installing Jupyter Lab. 
diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/cifar10_split_data_vertical.py b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/cifar10_split_data_vertical.py new file mode 100644 index 0000000000..ea51c472db --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/cifar10_split_data_vertical.py @@ -0,0 +1,46 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import sys + +from splitnn.cifar10_vertical_data_splitter import Cifar10VerticalDataSplitter + +from nvflare.apis.fl_context import FLContext + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +import argparse + +from nvflare.apis.fl_constant import ReservedKey + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--split_dir", type=str, default="/tmp/cifar10_vert_splits", help="output folder") + parser.add_argument("--overlap", type=int, default=10_000, help="number of overlapping samples") + args = parser.parse_args() + + splitter = Cifar10VerticalDataSplitter(split_dir=args.split_dir, overlap=args.overlap) + + # set up a dummy context for logging + fl_ctx = FLContext() + fl_ctx.set_prop(ReservedKey.IDENTITY_NAME, "local") + fl_ctx.set_prop(ReservedKey.RUN_NUM, "_") + + splitter.split(fl_ctx) # will download to CIFAR10_ROOT defined in + # cifar10_data_utils + + +if __name__ == "__main__": + main() diff --git 
a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/cifar10_split_learning.ipynb b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/cifar10_split_learning.ipynb new file mode 100644 index 0000000000..7cddf3a525 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/cifar10_split_learning.ipynb @@ -0,0 +1,1493 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cada310b-e776-4b9a-aabe-f111c31efcc2", + "metadata": {}, + "source": [ + "# Split Learning with CIFAR-10" + ] + }, + { + "cell_type": "markdown", + "id": "0653cbf2-92f2-4a22-8317-69cfb0266e92", + "metadata": {}, + "source": [ + "## 1. Download and split the CIFAR-10 dataset\n", + "To simulate a vertical split dataset, we first download the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset and distribute it between the two clients." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f4130b15-09e6-456f-a3c7-87c8ee9e07f0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: SPLIT_DIR=/tmp/cifar10_vert_splits\n", + "env: OVERLAP=10000\n", + "INFO:Cifar10VerticalDataSplitter:[identity=local, run=_]: Partition CIFAR-10 dataset into vertically with 10000 overlapping samples.\n", + "Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /tmp/cifar10/cifar-10-python.tar.gz\n", + "100.0%\n", + "Extracting /tmp/cifar10/cifar-10-python.tar.gz to /tmp/cifar10\n", + "INFO:Cifar10VerticalDataSplitter:[identity=local, run=_]: save /tmp/cifar10_vert_splits/overlap.npy\n", + "INFO:Cifar10VerticalDataSplitter:[identity=local, run=_]: save /tmp/cifar10_vert_splits/site-1.npy\n", + "INFO:Cifar10VerticalDataSplitter:[identity=local, run=_]: save /tmp/cifar10_vert_splits/site-2.npy\n" + ] + } + ], + "source": [ + "%env SPLIT_DIR=/tmp/cifar10_vert_splits\n", + "%env OVERLAP=10000\n", + "!python3 ./cifar10_split_data_vertical.py --split_dir ${SPLIT_DIR} --overlap ${OVERLAP}" + 
] + }, + { + "cell_type": "markdown", + "id": "af257e69-2bb7-49b6-ac6c-f007b0e6618e", + "metadata": {}, + "source": [ + "## 2. Run private set intersection\n", + "We are using NVFlare's FL simulator to run the following experiments.\n", + "\n", + "In order to find the overlapping data indices between the different clients participating in split learning, \n", + "we randomly select a subset of the training indices." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fdb7290a-48ff-4e80-be58-5e6b0e0f9379", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-02-13 19:33:37,341 - SimulatorRunner - INFO - Create the Simulator Server.\n", + "2023-02-13 19:33:37,447 - nvflare.fuel.hci.server.hci - INFO - Starting Admin Server localhost on Port 43373\n", + "2023-02-13 19:33:37,453 - SimulatorServer - INFO - starting insecure server at localhost:36789\n", + "2023-02-13 19:33:37,456 - SimulatorRunner - INFO - Deploy the Apps.\n", + "2023-02-13 19:33:37,459 - SimulatorRunner - INFO - Create the simulate clients.\n", + "2023-02-13 19:33:37,506 - ClientManager - INFO - Client: New client site-1@127.0.0.1 joined. Sent token: 58824fe4-990b-4ae1-8355-272c98e8a74b. Total clients: 1\n", + "2023-02-13 19:33:37,509 - FederatedClient - INFO - Successfully registered client:site-1 for project simulator_server. Token:58824fe4-990b-4ae1-8355-272c98e8a74b SSID:\n", + "2023-02-13 19:33:37,551 - ClientManager - INFO - Client: New client site-2@127.0.0.1 joined. Sent token: 429e493d-7565-45e5-a7fa-a9021decb438. Total clients: 2\n", + "2023-02-13 19:33:37,555 - FederatedClient - INFO - Successfully registered client:site-2 for project simulator_server. 
Token:429e493d-7565-45e5-a7fa-a9021decb438 SSID:\n", + "2023-02-13 19:33:37,559 - SimulatorRunner - INFO - Set the client status ready.\n", + "2023-02-13 19:33:37,562 - SimulatorRunner - INFO - Deploy and start the Server App.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/__init__.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_ditto.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_fed_utils.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_fedopt.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_fedproxloss.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_file_model_locator.py is deprecated. 
Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_file_model_persistor.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_model_reader_writer.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_multi_process_executor.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_scaffold.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/tb_receiver.py is deprecated. 
Please use nvflare.app_opt.tracking.tensorboard.tb_receiver.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-02-13 19:33:38,659 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: Server runner starting ...\n", + "2023-02-13 19:33:38,661 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: starting workflow DhPSIController () ...\n", + "2023-02-13 19:33:38,663 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: Workflow DhPSIController () started\n", + "2023-02-13 19:33:38,664 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: PSI control flow started.\n", + "2023-02-13 19:33:38,665 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: start pre workflow\n", + "2023-02-13 19:33:38,666 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: pre_process on task PSI\n", + "2023-02-13 19:33:38,668 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: scheduled task PSI\n", + "2023-02-13 19:33:39,580 - SimulatorClientRunner - INFO - Start the clients run simulation.\n", + "2023-02-13 19:33:40,584 - SimulatorClientRunner - INFO - Simulate Run client: site-1\n", + "2023-02-13 19:33:40,588 - SimulatorClientRunner - INFO - Simulate Run client: site-2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "E0213 19:33:40.609877062 72784 fork_posix.cc:76] Other threads are currently calling into gRPC, skipping fork() handlers\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/__init__.py is deprecated. 
Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_ditto.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_fed_utils.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_fedopt.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_fedproxloss.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_file_model_locator.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_file_model_persistor.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_model_reader_writer.py is deprecated. 
Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_multi_process_executor.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_scaffold.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/tb_receiver.py is deprecated. Please use nvflare.app_opt.tracking.tensorboard.tb_receiver.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/__init__.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_ditto.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_fed_utils.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_fedopt.py is deprecated. 
Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_fedproxloss.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_file_model_locator.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_file_model_persistor.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_model_reader_writer.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_multi_process_executor.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/pt_scaffold.py is deprecated. Please use nvflare.app_opt.pt.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + "/usr/lib/python3.10/importlib/__init__.py:126: FutureWarning: This module: /home/hroth/Code2/nvflare/splitnn_dev/nvflare/app_common/pt/tb_receiver.py is deprecated. 
Please use nvflare.app_opt.tracking.tensorboard.tb_receiver.\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-02-13 19:33:42,763 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: assigned task to client site-1: name=PSI, id=d9c01c72-8954-4ac0-9af4-de37876e7d85\n", + "2023-02-13 19:33:42,770 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: sent task assignment to client\n", + "2023-02-13 19:33:42,772 - SimulatorServer - INFO - GetTask: Return task: PSI to client: site-1 (58824fe4-990b-4ae1-8355-272c98e8a74b) \n", + "2023-02-13 19:33:42,774 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: assigned task to client site-2: name=PSI, id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568\n", + "2023-02-13 19:33:42,777 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: sent task assignment to client\n", + "2023-02-13 19:33:42,780 - SimulatorServer - INFO - GetTask: Return task: PSI to client: site-2 (429e493d-7565-45e5-a7fa-a9021decb438) \n", + "2023-02-13 19:33:42,892 - SimulatorServer - INFO - received update from simulator_server_site-2_0 (986 Bytes, 1676334822 seconds)\n", + "2023-02-13 19:33:42,894 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job]: got result from client site-2 for task: name=PSI, id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568\n", + "2023-02-13 
19:33:42,897 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: invoking result_received_cb ...\n", + "task_name PSI\n", + "2023-02-13 19:33:42,905 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: Processing PSI, None result from client site-2\n", + "2023-02-13 19:33:42,707 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: client runner started\n", + "2023-02-13 19:33:42,707 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-1\n", + "2023-02-13 19:33:42,719 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: client runner started\n", + "2023-02-13 19:33:42,719 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-2\n", + "2023-02-13 19:33:42,775 - Communicator - INFO - Received from simulator_server server (590 Bytes). getTask time: 0.0653986930847168 seconds\n", + "2023-02-13 19:33:42,776 - FederatedClient - INFO - pull_task completed. 
Task name:PSI Status:True \n", + "2023-02-13 19:33:42,777 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=d9c01c72-8954-4ac0-9af4-de37876e7d85\n", + "2023-02-13 19:33:42,777 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: invoking task executor \n", + "2023-02-13 19:33:42,778 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: Executing task 'PSI' for site-1\n", + "2023-02-13 19:33:42,778 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: Executing psi_stage_task PSI_PREPARE for site-1\n", + "2023-02-13 19:33:42,783 - Communicator - INFO - Received from simulator_server server (590 Bytes). getTask time: 0.06263232231140137 seconds\n", + "2023-02-13 19:33:42,785 - FederatedClient - INFO - pull_task completed. 
Task name:PSI Status:True \n", + "2023-02-13 19:33:42,786 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568\n", + "2023-02-13 19:33:42,787 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: invoking task executor \n", + "2023-02-13 19:33:42,787 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: Executing task 'PSI' for site-2\n", + "2023-02-13 19:33:42,788 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: Executing psi_stage_task PSI_PREPARE for site-2\n", + "2023-02-13 19:33:42,857 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: finished processing task\n", + "2023-02-13 19:33:42,859 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:33:42,860 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:33:42,880 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: finished processing task\n", + "2023-02-13 19:33:42,882 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:33:42,882 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:33:42,907 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, 
task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: Received result from client:site-2 for task PSI \n", + "2023-02-13 19:33:42,910 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: finished processing client result by DhPSIController\n", + "2023-02-13 19:33:42,913 - SimulatorServer - INFO - received update from simulator_server_site-1_0 (986 Bytes, 1676334822 seconds)\n", + "2023-02-13 19:33:42,915 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job]: got result from client site-1 for task: name=PSI, id=d9c01c72-8954-4ac0-9af4-de37876e7d85\n", + "2023-02-13 19:33:42,918 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: invoking result_received_cb ...\n", + "task_name PSI\n", + "2023-02-13 19:33:42,926 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: Processing PSI, None result from client site-1\n", + "2023-02-13 19:33:42,928 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: Received result from client:site-1 for task PSI \n", + "2023-02-13 19:33:42,930 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: finished processing client result by DhPSIController\n", + "2023-02-13 19:33:42,913 - Communicator - INFO - Received comments: simulator_server 
Received from site-2 (986 Bytes, 1676334822 seconds). SubmitUpdate time: 0.030980825424194336 seconds\n", + "2023-02-13 19:33:42,916 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568]: result sent to server for task: name=PSI, id=ed5b48be-0b5a-4297-b6b7-1c664ca8e568\n", + "2023-02-13 19:33:42,916 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:33:42,932 - Communicator - INFO - Received comments: simulator_server Received from site-1 (986 Bytes, 1676334822 seconds). SubmitUpdate time: 0.07281780242919922 seconds\n", + "2023-02-13 19:33:42,934 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=d9c01c72-8954-4ac0-9af4-de37876e7d85]: result sent to server for task: name=PSI, id=d9c01c72-8954-4ac0-9af4-de37876e7d85\n", + "2023-02-13 19:33:42,935 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:33:43,169 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI exit with status TaskCompletionStatus.OK\n", + "2023-02-13 19:33:43,174 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: PSI_PREPARE results = {'site-2': , 'site-1': }\n", + "2023-02-13 19:33:43,176 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: start workflow\n", + "2023-02-13 19:33:43,178 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: order sites = [SiteSize(name='site-2', size=50000), SiteSize(name='site-1', size=50000)]\n", + "2023-02-13 19:33:43,181 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: target_sites: [SiteSize(name='site-2', size=50000), SiteSize(name='site-1', size=50000)]\n", + "2023-02-13 19:33:43,183 - 
DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: scheduled task PSI\n", + "2023-02-13 19:33:43,185 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: wait for client site-2 task\n", + "2023-02-13 19:33:44,957 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: assigned task to client site-2: name=PSI, id=b04731da-5979-4710-8c2a-60db263379bb\n", + "2023-02-13 19:33:44,960 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: sent task assignment to client\n", + "2023-02-13 19:33:44,963 - SimulatorServer - INFO - GetTask: Return task: PSI to client: site-2 (429e493d-7565-45e5-a7fa-a9021decb438) \n", + "2023-02-13 19:33:44,959 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:33:44,966 - Communicator - INFO - Received from simulator_server server (576 Bytes). getTask time: 0.043691158294677734 seconds\n", + "2023-02-13 19:33:44,969 - FederatedClient - INFO - pull_task completed. 
Task name:PSI Status:True \n", + "2023-02-13 19:33:44,970 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=b04731da-5979-4710-8c2a-60db263379bb\n", + "2023-02-13 19:33:44,971 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: invoking task executor \n", + "2023-02-13 19:33:44,971 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: Executing task 'PSI' for site-2\n", + "2023-02-13 19:33:44,972 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: Executing psi_stage_task PSI_SETUP for site-2\n", + "2023-02-13 19:33:47,071 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:33:49,145 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:33:50,648 - SimulatorServer - INFO - received update from simulator_server_site-2_0 (471236 Bytes, 1676334830 seconds)\n", + "2023-02-13 19:33:50,651 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job]: got result from client site-2 for task: name=PSI, id=b04731da-5979-4710-8c2a-60db263379bb\n", + "2023-02-13 19:33:50,654 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: invoking result_received_cb ...\n", + "task_name PSI\n", + "2023-02-13 19:33:50,662 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, 
task_id=b04731da-5979-4710-8c2a-60db263379bb]: Processing PSI, None result from client site-2\n", + "2023-02-13 19:33:50,664 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: Received result from client:site-2 for task PSI \n", + "2023-02-13 19:33:50,666 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: finished processing client result by DhPSIController\n", + "2023-02-13 19:33:50,681 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI exit with status TaskCompletionStatus.OK\n", + "2023-02-13 19:33:50,695 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: scheduled task PSI\n", + "2023-02-13 19:33:50,697 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: wait for client site-1 task\n", + "2023-02-13 19:33:50,564 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: finished processing task\n", + "2023-02-13 19:33:50,565 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:33:50,566 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:33:50,669 - Communicator - INFO - Received comments: simulator_server Received from site-2 (471236 Bytes, 1676334830 seconds). 
SubmitUpdate time: 0.10220599174499512 seconds\n", + "2023-02-13 19:33:50,672 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=b04731da-5979-4710-8c2a-60db263379bb]: result sent to server for task: name=PSI, id=b04731da-5979-4710-8c2a-60db263379bb\n", + "2023-02-13 19:33:50,672 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:33:51,188 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: assigned task to client site-1: name=PSI, id=dc146750-1e4e-4dc5-9219-b4334bffc30c\n", + "2023-02-13 19:33:51,192 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: sent task assignment to client\n", + "2023-02-13 19:33:51,196 - SimulatorServer - INFO - GetTask: Return task: PSI to client: site-1 (58824fe4-990b-4ae1-8355-272c98e8a74b) \n", + "2023-02-13 19:33:51,231 - Communicator - INFO - Received from simulator_server server (470828 Bytes). getTask time: 0.07973718643188477 seconds\n", + "2023-02-13 19:33:51,233 - FederatedClient - INFO - pull_task completed. 
Task name:PSI Status:True \n", + "2023-02-13 19:33:51,234 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=dc146750-1e4e-4dc5-9219-b4334bffc30c\n", + "2023-02-13 19:33:51,234 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: invoking task executor \n", + "2023-02-13 19:33:51,234 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: Executing task 'PSI' for site-1\n", + "2023-02-13 19:33:51,234 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: Executing psi_stage_task PSI_REQUEST for site-1\n", + "2023-02-13 19:33:52,729 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:33:54,777 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:33:56,862 - SimulatorServer - INFO - received update from simulator_server_site-1_0 (1750995 Bytes, 1676334836 seconds)\n", + "2023-02-13 19:33:56,864 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job]: got result from client site-1 for task: name=PSI, id=dc146750-1e4e-4dc5-9219-b4334bffc30c\n", + "2023-02-13 19:33:56,866 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: invoking result_received_cb ...\n", + "task_name PSI\n", + "2023-02-13 19:33:56,874 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, 
task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: Processing PSI, None result from client site-1\n", + "2023-02-13 19:33:56,876 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: Received result from client:site-1 for task PSI \n", + "2023-02-13 19:33:56,878 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: finished processing client result by DhPSIController\n", + "2023-02-13 19:33:56,711 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: finished processing task\n", + "2023-02-13 19:33:56,712 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:33:56,721 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:33:56,827 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:33:56,881 - Communicator - INFO - Received comments: simulator_server Received from site-1 (1750995 Bytes, 1676334836 seconds). 
SubmitUpdate time: 0.1594696044921875 seconds\n", + "2023-02-13 19:33:56,884 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dc146750-1e4e-4dc5-9219-b4334bffc30c]: result sent to server for task: name=PSI, id=dc146750-1e4e-4dc5-9219-b4334bffc30c\n", + "2023-02-13 19:33:56,884 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:33:57,191 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI exit with status TaskCompletionStatus.OK\n", + "2023-02-13 19:33:57,206 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: scheduled task PSI\n", + "2023-02-13 19:33:57,208 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: wait for client site-2 task\n", + "2023-02-13 19:33:59,087 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: assigned task to client site-2: name=PSI, id=61404bac-0296-4b28-9dba-c7b831062777\n", + "2023-02-13 19:33:59,091 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: sent task assignment to client\n", + "2023-02-13 19:33:59,100 - SimulatorServer - INFO - GetTask: Return task: PSI to client: site-2 (429e493d-7565-45e5-a7fa-a9021decb438) \n", + "2023-02-13 19:33:59,089 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:33:59,222 - Communicator - INFO - Received from simulator_server server (1750588 Bytes). getTask time: 0.39040231704711914 seconds\n", + "2023-02-13 19:33:59,224 - FederatedClient - INFO - pull_task completed. 
Task name:PSI Status:True \n", + "2023-02-13 19:33:59,225 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=61404bac-0296-4b28-9dba-c7b831062777\n", + "2023-02-13 19:33:59,226 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: invoking task executor \n", + "2023-02-13 19:33:59,226 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: Executing task 'PSI' for site-2\n", + "2023-02-13 19:33:59,227 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: Executing psi_stage_task PSI_RESPONSE for site-2\n", + "2023-02-13 19:34:01,138 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:02,547 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: finished processing task\n", + "2023-02-13 19:34:02,548 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:34:02,557 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:34:02,765 - SimulatorServer - INFO - received update from simulator_server_site-2_0 (1750994 Bytes, 1676334842 seconds)\n", + "2023-02-13 19:34:02,768 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job]: got result from client site-2 for task: name=PSI, id=61404bac-0296-4b28-9dba-c7b831062777\n", + "2023-02-13 19:34:02,770 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, 
peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: invoking result_received_cb ...\n", + "task_name PSI\n", + "2023-02-13 19:34:02,778 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: Processing PSI, None result from client site-2\n", + "2023-02-13 19:34:02,780 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: Received result from client:site-2 for task PSI \n", + "2023-02-13 19:34:02,781 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: finished processing client result by DhPSIController\n", + "2023-02-13 19:34:02,784 - Communicator - INFO - Received comments: simulator_server Received from site-2 (1750994 Bytes, 1676334842 seconds). 
SubmitUpdate time: 0.22668766975402832 seconds\n", + "2023-02-13 19:34:02,786 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=61404bac-0296-4b28-9dba-c7b831062777]: result sent to server for task: name=PSI, id=61404bac-0296-4b28-9dba-c7b831062777\n", + "2023-02-13 19:34:02,787 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:03,202 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI exit with status TaskCompletionStatus.OK\n", + "2023-02-13 19:34:03,218 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: scheduled task PSI\n", + "2023-02-13 19:34:03,221 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: wait for client site-1 task\n", + "2023-02-13 19:34:03,189 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:04,845 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:05,261 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: assigned task to client site-1: name=PSI, id=2951f830-7687-4270-9167-8838b251092d\n", + "2023-02-13 19:34:05,263 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: sent task assignment to client\n", + "2023-02-13 19:34:05,273 - SimulatorServer - INFO - GetTask: Return task: PSI to client: site-1 (58824fe4-990b-4ae1-8355-272c98e8a74b) \n", + "2023-02-13 19:34:05,400 - Communicator - INFO - Received from simulator_server server (1750593 Bytes). 
getTask time: 0.20580339431762695 seconds\n", + "2023-02-13 19:34:05,401 - FederatedClient - INFO - pull_task completed. Task name:PSI Status:True \n", + "2023-02-13 19:34:05,403 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=2951f830-7687-4270-9167-8838b251092d\n", + "2023-02-13 19:34:05,403 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: invoking task executor \n", + "2023-02-13 19:34:05,403 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: Executing task 'PSI' for site-1\n", + "2023-02-13 19:34:05,403 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: Executing psi_stage_task PSI_TASK_INTERSECT for site-1\n", + "2023-02-13 19:34:06,954 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:08,773 - SimulatorServer - INFO - received update from simulator_server_site-1_0 (986 Bytes, 1676334848 seconds)\n", + "2023-02-13 19:34:08,775 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job]: got result from client site-1 for task: name=PSI, id=2951f830-7687-4270-9167-8838b251092d\n", + "2023-02-13 19:34:08,778 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: invoking result_received_cb ...\n", + "task_name PSI\n", + "2023-02-13 19:34:08,786 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, 
peer_rc=OK, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: Processing PSI, None result from client site-1\n", + "2023-02-13 19:34:08,788 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: Received result from client:site-1 for task PSI \n", + "2023-02-13 19:34:08,790 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: finished processing client result by DhPSIController\n", + "2023-02-13 19:34:08,736 - FilePsiWriter - INFO - [identity=site-1, run=simulate_job]: job dir = /tmp/nvflare/cifar10_psi/simulate_job\n", + "2023-02-13 19:34:08,737 - FilePsiWriter - INFO - [identity=site-1, run=simulate_job]: trying to save data to /tmp/nvflare/cifar10_psi/simulate_job/site-1/psi/intersection.txt\n", + "2023-02-13 19:34:08,737 - FilePsiWriter - INFO - [identity=site-1, run=simulate_job]: file /tmp/nvflare/cifar10_psi/simulate_job/site-1/psi/intersection.txt saved\n", + "2023-02-13 19:34:08,738 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: finished processing task\n", + "2023-02-13 19:34:08,739 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:34:08,740 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:34:08,793 - Communicator - INFO - Received comments: simulator_server Received from site-1 (986 Bytes, 1676334848 seconds). 
SubmitUpdate time: 0.05275440216064453 seconds\n", + "2023-02-13 19:34:08,795 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2951f830-7687-4270-9167-8838b251092d]: result sent to server for task: name=PSI, id=2951f830-7687-4270-9167-8838b251092d\n", + "2023-02-13 19:34:08,795 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:09,009 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:09,213 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI exit with status TaskCompletionStatus.OK\n", + "2023-02-13 19:34:09,230 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: target_sites: [SiteSize(name='site-1', size=10000)]\n", + "2023-02-13 19:34:09,232 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: forward_processed sites {'site-1': 10000}\n", + ",intersect_sites=SiteSize(name='site-1', size=10000)\n", + "ordered sites = [SiteSize(name='site-2', size=50000), SiteSize(name='site-1', size=50000)]\n", + "\n", + "2023-02-13 19:34:09,235 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: scheduled task PSI\n", + "2023-02-13 19:34:10,834 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: assigned task to client site-1: name=PSI, id=dccb7e3f-9eb7-4954-8062-b65769b7d297\n", + "2023-02-13 19:34:10,837 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: sent task assignment to client\n", + "2023-02-13 19:34:10,840 - SimulatorServer - INFO - GetTask: Return task: PSI to client: 
site-1 (58824fe4-990b-4ae1-8355-272c98e8a74b) \n", + "2023-02-13 19:34:10,843 - Communicator - INFO - Received from simulator_server server (614 Bytes). getTask time: 0.04214978218078613 seconds\n", + "2023-02-13 19:34:10,847 - FederatedClient - INFO - pull_task completed. Task name:PSI Status:True \n", + "2023-02-13 19:34:10,848 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=dccb7e3f-9eb7-4954-8062-b65769b7d297\n", + "2023-02-13 19:34:10,848 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: invoking task executor \n", + "2023-02-13 19:34:10,849 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: Executing task 'PSI' for site-1\n", + "2023-02-13 19:34:10,849 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: Executing psi_stage_task PSI_SETUP for site-1\n", + "2023-02-13 19:34:11,084 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:12,129 - SimulatorServer - INFO - received update from simulator_server_site-1_0 (471243 Bytes, 1676334852 seconds)\n", + "2023-02-13 19:34:12,131 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job]: got result from client site-1 for task: name=PSI, id=dccb7e3f-9eb7-4954-8062-b65769b7d297\n", + "2023-02-13 19:34:12,133 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: invoking result_received_cb ...\n", + "task_name PSI\n", + 
"2023-02-13 19:34:12,140 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: Processing PSI, None result from client site-1\n", + "2023-02-13 19:34:12,142 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: Received result from client:site-1 for task PSI \n", + "2023-02-13 19:34:12,143 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: finished processing client result by DhPSIController\n", + "2023-02-13 19:34:11,967 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: finished processing task\n", + "2023-02-13 19:34:11,968 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:34:11,970 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:34:12,145 - Communicator - INFO - Received comments: simulator_server Received from site-1 (471243 Bytes, 1676334852 seconds). 
SubmitUpdate time: 0.1749711036682129 seconds\n", + "2023-02-13 19:34:12,147 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=dccb7e3f-9eb7-4954-8062-b65769b7d297]: result sent to server for task: name=PSI, id=dccb7e3f-9eb7-4954-8062-b65769b7d297\n", + "2023-02-13 19:34:12,148 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:12,219 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI exit with status TaskCompletionStatus.OK\n", + "2023-02-13 19:34:12,241 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: scheduled task PSI\n", + "2023-02-13 19:34:12,243 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: wait for client site-2 task\n", + "2023-02-13 19:34:13,136 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: assigned task to client site-2: name=PSI, id=2c505dfc-5ad3-4658-befe-e8872fcd6791\n", + "2023-02-13 19:34:13,139 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: sent task assignment to client\n", + "2023-02-13 19:34:13,143 - SimulatorServer - INFO - GetTask: Return task: PSI to client: site-2 (429e493d-7565-45e5-a7fa-a9021decb438) \n", + "2023-02-13 19:34:13,269 - Communicator - INFO - Received from simulator_server server (470828 Bytes). getTask time: 0.1789407730102539 seconds\n", + "2023-02-13 19:34:13,272 - FederatedClient - INFO - pull_task completed. 
Task name:PSI Status:True \n", + "2023-02-13 19:34:13,272 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=2c505dfc-5ad3-4658-befe-e8872fcd6791\n", + "2023-02-13 19:34:13,273 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: invoking task executor \n", + "2023-02-13 19:34:13,273 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: Executing task 'PSI' for site-2\n", + "2023-02-13 19:34:13,273 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: Executing psi_stage_task PSI_REQUEST for site-2\n", + "2023-02-13 19:34:14,233 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:16,287 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:18,336 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:18,949 - SimulatorServer - INFO - received update from simulator_server_site-2_0 (1750995 Bytes, 1676334858 seconds)\n", + "2023-02-13 19:34:18,951 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job]: got result from client site-2 for task: name=PSI, id=2c505dfc-5ad3-4658-befe-e8872fcd6791\n", + "2023-02-13 19:34:18,954 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: invoking result_received_cb ...\n", + "task_name PSI\n", + "2023-02-13 19:34:18,961 - BroadcastAndWait - INFO - 
[identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: Processing PSI, None result from client site-2\n", + "2023-02-13 19:34:18,963 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: Received result from client:site-2 for task PSI \n", + "2023-02-13 19:34:18,965 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: finished processing client result by DhPSIController\n", + "2023-02-13 19:34:18,777 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: finished processing task\n", + "2023-02-13 19:34:18,778 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:34:18,785 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:34:18,967 - Communicator - INFO - Received comments: simulator_server Received from site-2 (1750995 Bytes, 1676334858 seconds). 
SubmitUpdate time: 0.18248295783996582 seconds\n", + "2023-02-13 19:34:18,970 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=2c505dfc-5ad3-4658-befe-e8872fcd6791]: result sent to server for task: name=PSI, id=2c505dfc-5ad3-4658-befe-e8872fcd6791\n", + "2023-02-13 19:34:18,971 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:19,230 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI exit with status TaskCompletionStatus.OK\n", + "2023-02-13 19:34:19,252 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: scheduled task PSI\n", + "2023-02-13 19:34:20,374 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: assigned task to client site-1: name=PSI, id=4214c09e-f294-40ef-8bd6-44171211a436\n", + "2023-02-13 19:34:20,377 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: sent task assignment to client\n", + "2023-02-13 19:34:20,386 - SimulatorServer - INFO - GetTask: Return task: PSI to client: site-1 (58824fe4-990b-4ae1-8355-272c98e8a74b) \n", + "2023-02-13 19:34:20,505 - Communicator - INFO - Received from simulator_server server (1750600 Bytes). getTask time: 0.16515851020812988 seconds\n", + "2023-02-13 19:34:20,506 - FederatedClient - INFO - pull_task completed. 
Task name:PSI Status:True \n", + "2023-02-13 19:34:20,507 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=4214c09e-f294-40ef-8bd6-44171211a436\n", + "2023-02-13 19:34:20,507 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: invoking task executor \n", + "2023-02-13 19:34:20,508 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: Executing task 'PSI' for site-1\n", + "2023-02-13 19:34:20,508 - DhPSIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: Executing psi_stage_task PSI_RESPONSE for site-1\n", + "2023-02-13 19:34:21,018 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:23,085 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:23,974 - SimulatorServer - INFO - received update from simulator_server_site-1_0 (1751002 Bytes, 1676334863 seconds)\n", + "2023-02-13 19:34:23,977 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job]: got result from client site-1 for task: name=PSI, id=4214c09e-f294-40ef-8bd6-44171211a436\n", + "2023-02-13 19:34:23,980 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: invoking result_received_cb ...\n", + "task_name PSI\n", + "2023-02-13 19:34:23,989 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, 
task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: Processing PSI, None result from client site-1\n", + "2023-02-13 19:34:23,990 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: Received result from client:site-1 for task PSI \n", + "2023-02-13 19:34:23,992 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: finished processing client result by DhPSIController\n", + "2023-02-13 19:34:23,803 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: finished processing task\n", + "2023-02-13 19:34:23,805 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:34:23,809 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:34:23,995 - Communicator - INFO - Received comments: simulator_server Received from site-1 (1751002 Bytes, 1676334863 seconds). 
SubmitUpdate time: 0.18543601036071777 seconds\n", + "2023-02-13 19:34:23,997 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=4214c09e-f294-40ef-8bd6-44171211a436]: result sent to server for task: name=PSI, id=4214c09e-f294-40ef-8bd6-44171211a436\n", + "2023-02-13 19:34:23,998 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:24,240 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI exit with status TaskCompletionStatus.OK\n", + "2023-02-13 19:34:24,260 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: scheduled task PSI\n", + "2023-02-13 19:34:24,262 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: wait for client site-2 task\n", + "2023-02-13 19:34:29,566 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: assigned task to client site-2: name=PSI, id=75d5dd8e-0753-4516-bd52-cda77a401411\n", + "2023-02-13 19:34:29,569 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: sent task assignment to client\n", + "2023-02-13 19:34:29,579 - SimulatorServer - INFO - GetTask: Return task: PSI to client: site-2 (429e493d-7565-45e5-a7fa-a9021decb438) \n", + "2023-02-13 19:34:29,570 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:29,705 - Communicator - INFO - Received from simulator_server server (1750593 Bytes). getTask time: 4.615186452865601 seconds\n", + "2023-02-13 19:34:29,707 - FederatedClient - INFO - pull_task completed. 
Task name:PSI Status:True \n", + "2023-02-13 19:34:29,708 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=PSI, id=75d5dd8e-0753-4516-bd52-cda77a401411\n", + "2023-02-13 19:34:29,709 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: invoking task executor \n", + "2023-02-13 19:34:29,710 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: Executing task 'PSI' for site-2\n", + "2023-02-13 19:34:29,710 - DhPSIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: Executing psi_stage_task PSI_TASK_INTERSECT for site-2\n", + "2023-02-13 19:34:31,679 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:33,099 - SimulatorServer - INFO - received update from simulator_server_site-2_0 (986 Bytes, 1676334873 seconds)\n", + "2023-02-13 19:34:33,101 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job]: got result from client site-2 for task: name=PSI, id=75d5dd8e-0753-4516-bd52-cda77a401411\n", + "2023-02-13 19:34:33,104 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: invoking result_received_cb ...\n", + "task_name PSI\n", + "2023-02-13 19:34:33,112 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: Processing PSI, None result from client 
site-2\n", + "2023-02-13 19:34:33,114 - BroadcastAndWait - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: Received result from client:site-2 for task PSI \n", + "2023-02-13 19:34:33,116 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: finished processing client result by DhPSIController\n", + "2023-02-13 19:34:33,255 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI exit with status TaskCompletionStatus.OK\n", + "2023-02-13 19:34:33,058 - FilePsiWriter - INFO - [identity=site-2, run=simulate_job]: job dir = /tmp/nvflare/cifar10_psi/simulate_job\n", + "2023-02-13 19:34:33,059 - FilePsiWriter - INFO - [identity=site-2, run=simulate_job]: trying to save data to /tmp/nvflare/cifar10_psi/simulate_job/site-2/psi/intersection.txt\n", + "2023-02-13 19:34:33,060 - FilePsiWriter - INFO - [identity=site-2, run=simulate_job]: file /tmp/nvflare/cifar10_psi/simulate_job/site-2/psi/intersection.txt saved\n", + "2023-02-13 19:34:33,061 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: finished processing task\n", + "2023-02-13 19:34:33,063 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-13 19:34:33,064 - Communicator - INFO - Send submitUpdate to simulator_server server\n", + "2023-02-13 19:34:33,119 - Communicator - INFO - Received comments: simulator_server Received from site-2 (986 Bytes, 1676334873 seconds). 
SubmitUpdate time: 0.05543828010559082 seconds\n", + "2023-02-13 19:34:33,122 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=PSI, task_id=75d5dd8e-0753-4516-bd52-cda77a401411]: result sent to server for task: name=PSI, id=75d5dd8e-0753-4516-bd52-cda77a401411\n", + "2023-02-13 19:34:33,122 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:33,274 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: received intersections : {'site-2': 10000} \n", + "2023-02-13 19:34:33,276 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: parallel_back_pass took 24041.575205003028 (ms)\n", + "2023-02-13 19:34:33,278 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: backward_processed sites {'site-2': 10000}\n", + ",intersect_sites=SiteSize(name='site-1', size=10000)\n", + "ordered sites = [SiteSize(name='site-2', size=50000), SiteSize(name='site-1', size=50000)]\n", + "\n", + "2023-02-13 19:34:33,280 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: Intersection calculation succeed\n", + "2023-02-13 19:34:33,282 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: 'forward_pass' took 26051.56308300502 ms.\n", + "2023-02-13 19:34:33,283 - DhPSIWorkFlow - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: 'backward_pass' took 24043.459948996315 ms.\n", + "2023-02-13 19:34:33,285 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: start post workflow\n", + "2023-02-13 19:34:33,288 - DhPSIController - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: task PSI control flow end.\n", + "2023-02-13 19:34:33,290 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, 
wf=DhPSIController]: Workflow: DhPSIController finalizing ...\n", + "2023-02-13 19:34:33,743 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job]: no current workflow - asked client to try again later\n", + "2023-02-13 19:34:33,749 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:35,220 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-2, peer_run=simulate_job]: server runner is finalizing - asked client to end the run\n", + "2023-02-13 19:34:35,223 - SimulatorServer - INFO - GetTask: Return task: __end_run__ to client: site-2 (429e493d-7565-45e5-a7fa-a9021decb438) \n", + "2023-02-13 19:34:35,226 - Communicator - INFO - Received from simulator_server server (348 Bytes). getTask time: 0.09840011596679688 seconds\n", + "2023-02-13 19:34:35,229 - FederatedClient - INFO - pull_task completed. Task name:__end_run__ Status:True \n", + "2023-02-13 19:34:35,230 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: server asked to end the run\n", + "2023-02-13 19:34:35,230 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-13 19:34:35,230 - ClientTaskWorker - INFO - End the Simulator run.\n", + "2023-02-13 19:34:35,231 - ClientTaskWorker - INFO - Clean up ClientRunner for : site-2 \n", + "2023-02-13 19:34:35,761 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: ABOUT_TO_END_RUN fired\n", + "2023-02-13 19:34:35,796 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController, peer=site-1, peer_run=simulate_job]: server runner is finalizing - asked client to end the run\n", + "2023-02-13 19:34:35,799 - SimulatorServer - INFO - GetTask: Return task: __end_run__ to client: site-1 (58824fe4-990b-4ae1-8355-272c98e8a74b) \n", + "2023-02-13 19:34:35,807 - 
FederatedClient - INFO - Shutting down client run: site-1\n", + "2023-02-13 19:34:35,803 - Communicator - INFO - Received from simulator_server server (348 Bytes). getTask time: 0.046799659729003906 seconds\n", + "2023-02-13 19:34:35,805 - FederatedClient - INFO - pull_task completed. Task name:__end_run__ Status:True \n", + "2023-02-13 19:34:35,806 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: server asked to end the run\n", + "2023-02-13 19:34:35,806 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-13 19:34:35,806 - ClientTaskWorker - INFO - End the Simulator run.\n", + "2023-02-13 19:34:35,807 - ClientTaskWorker - INFO - Clean up ClientRunner for : site-1 \n", + "2023-02-13 19:34:42,746 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: END_RUN fired\n", + "2023-02-13 19:34:42,749 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=DhPSIController]: Server runner finished.\n", + "2023-02-13 19:34:44,714 - SimulatorServer - INFO - Server app stopped.\n", + "\n", + "\n", + "2023-02-13 19:34:44,718 - SimulatorServer - INFO - shutting down server\n", + "2023-02-13 19:34:44,721 - SimulatorServer - INFO - canceling sync locks\n", + "2023-02-13 19:34:44,724 - SimulatorServer - INFO - server off\n", + "Simulator finished with run_status 0\n" + ] + } + ], + "source": [ + "import os\n", + "#from nvflare import SimulatorRunner\n", + "from nvflare.private.fed.app.simulator.simulator_runner import SimulatorRunner\n", + "\n", + "simulator = SimulatorRunner(\n", + " job_folder=f\"job_configs/cifar10_psi\",\n", + " workspace=\"/tmp/nvflare/cifar10_psi\",\n", + " n_clients=2,\n", + " threads=2\n", + ")\n", + "run_status = simulator.run()\n", + "print(\"Simulator finished with run_status\", run_status)" + ] + }, + { + "cell_type": "markdown", + "id": "ed1388dc-6a4f-4965-a09f-4d058fc3833c", + "metadata": {}, + "source": [ + 
"The result will be saved in each client's working directory in `intersection.txt`.\n", + "\n", + "We can check the correctness of the result by comparing to the generated ground truth overlap, saved in `overlap.npy`." + ] + }, + { + "cell_type": "markdown", + "id": "dedb6bcc-9443-4331-bde3-4576fbfffaec", + "metadata": {}, + "source": [ + "### Check the PSI result\n", + "We can check the correctness of the result by comparing to the generated ground truth overlap, saved in overlap.npy." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "21a6b36f-649f-4e19-ba0a-5dd71dfda5dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gt_overlap [11841 19602 45519 ... 47278 37020 2217] n=10000\n", + "psi_overlap_1 [ 4481. 45431. 46253. ... 34846. 179. 7277.] n=10000\n", + "psi_overlap_2 [38639. 10733. 31911. ... 12172. 46167. 865.] n=10000\n", + "Found 100.0% of the overlapping sample ids for site-1.\n", + "Found 100.0% of the overlapping sample ids for site-2.\n" + ] + } + ], + "source": [ + "import os\n", + "import numpy as np\n", + "\n", + "split_dir = os.environ[\"SPLIT_DIR\"]\n", + "gt_overlap = np.load(os.path.join(split_dir, \"overlap.npy\"))\n", + "\n", + "psi_overlap_1 = np.loadtxt(\"/tmp/nvflare/cifar10_psi/simulate_job/site-1/psi/intersection.txt\")\n", + "psi_overlap_2 = np.loadtxt(\"/tmp/nvflare/cifar10_psi/simulate_job/site-2/psi/intersection.txt\")\n", + " \n", + "print(\"gt_overlap\", gt_overlap, f\"n={len(gt_overlap)}\")\n", + "print(\"psi_overlap_1\", psi_overlap_1, f\"n={len(psi_overlap_1)}\")\n", + "print(\"psi_overlap_2\", psi_overlap_2, f\"n={len(psi_overlap_2)}\")\n", + "\n", + "intersect_1 = np.intersect1d(psi_overlap_1, gt_overlap, assume_unique=True)\n", + "intersect_2 = np.intersect1d(psi_overlap_2, gt_overlap, assume_unique=True)\n", + "\n", + "print(f\"Found {100*len(intersect_1)/len(gt_overlap):.1f}% of the overlapping sample ids for site-1.\")\n", + "print(f\"Found
{100*len(intersect_2)/len(gt_overlap):.1f}% of the overlapping sample ids for site-2.\")" + ] + }, + { + "cell_type": "markdown", + "id": "bd0713e2-e393-41c0-9da0-392535cf8a54", + "metadata": {}, + "source": [ + "## 3. Run simulated split-learning experiments\n", + "Next we use the `intersection.txt` files to align the datasets on each participating site in order to do split learning.\n", + "The [config_fed_client.json](./job_configs/cifar10_splitnn/site-1/config/config_fed_client.json) takes as input the previously generated intersection file for each site.\n", + "```\n", + " {\n", + " \"id\": \"cifar10-learner\",\n", + " \"path\": \"pt.learners.cifar10_learner_splitnn.CIFAR10LearnerSplitNN\",\n", + " \"args\": {\n", + " \"dataset_root\": \"{DATASET_ROOT}\",\n", + " \"intersection_file\": \"{INTERSECTION_FILE}\",\n", + " \"lr\": 1e-2,\n", + " \"model\": {\"path\": \"pt.networks.split_nn.SplitNN\", \"args\": {\"split_id\": 0}},\n", + " \"timeit\": true\n", + " }\n", + " }\n", + "```\n", + "To set the filename automatically, run:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3978f6ac-f7db-4648-abb3-0fd071f01531", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Modified job_configs/cifar10_splitnn/site-1/config/config_fed_client.json to use INTERSECTION_FILE=/tmp/nvflare/cifar10_psi/simulate_job/site-1/psi/intersection.txt\n", + "Modified job_configs/cifar10_splitnn/site-2/config/config_fed_client.json to use INTERSECTION_FILE=/tmp/nvflare/cifar10_psi/simulate_job/site-2/psi/intersection.txt\n" + ] + } + ], + "source": [ + "!for i in {1..2}; \\\n", + "do \\\n", + " CONFIG_FILE=job_configs/cifar10_splitnn/site-${i}/config/config_fed_client.json; \\\n", + " INTERSECTION_FILE=/tmp/nvflare/cifar10_psi/simulate_job/site-${i}/psi/intersection.txt; \\\n", + " python3 ./set_intersection_file.py --config_file ${CONFIG_FILE} --intersection_file ${INTERSECTION_FILE}; \\\n", + "done" + ] + }, + { + 
"cell_type": "markdown", + "id": "d3f606a9-55a9-4984-a40c-7951287a5a63", + "metadata": {}, + "source": [ + "To run the experiment, execute:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "33c75dcb-014d-40c4-8a4a-7a53847c486b", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-02-01 11:21:52,457 - SimulatorRunner - INFO - Create the Simulator Server.\n", + "2023-02-01 11:21:52,479 - Cell - INFO - server: creating listener on grpc://localhost:53913\n", + "2023-02-01 11:21:52,481 - Cell - INFO - server: created backbone external listener for grpc://localhost:53913\n", + "2023-02-01 11:21:52,482 - ConnectorManager - INFO - 349396: Try start_listener Listener resources: {'secure': False, 'host': 'localhost', 'ports': ['30000-40000']}\n", + "2023-02-01 11:21:52,483 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector TcpDriver:35a77933-6e70-4677-9204-9b715b7ba6d1 is starting in PASSIVE mode\n", + "2023-02-01 11:21:52,985 - Cell - INFO - server: created backbone internal listener for tcp://localhost:30342\n", + "2023-02-01 11:21:52,989 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector AioGrpcDriver:707ec2a5-2630-4497-ac3f-a125e9ff4d1b is starting in PASSIVE mode\n", + "2023-02-01 11:21:52,993 - nvflare.fuel.f3.communicator - INFO - Communicator is started for local endpoint: server\n", + "2023-02-01 11:21:53,091 - nvflare.fuel.hci.server.hci - INFO - Starting Admin Server localhost on Port 49897\n", + "2023-02-01 11:21:53,092 - SimulatorRunner - INFO - Deploy the Apps.\n", + "2023-02-01 11:21:53,102 - SimulatorRunner - INFO - Create the simulate clients.\n", + "2023-02-01 11:21:53,112 - Cell - INFO - site-1: created backbone external connector to grpc://localhost:53913\n", + "2023-02-01 11:21:53,114 - ConnectorManager - INFO - 349396: Try start_listener Listener resources: {'secure': False, 'host': 'localhost', 'ports': ['30000-40000']}\n", + "2023-02-01 
11:21:53,116 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector TcpDriver:086ec5ec-6712-4c0f-a4d7-2749e72c14fc is starting in PASSIVE mode\n", + "2023-02-01 11:21:53,619 - Cell - INFO - site-1: created backbone internal listener for tcp://localhost:39242\n", + "2023-02-01 11:21:53,622 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector AioGrpcDriver:e7cb552c-52f3-42cf-b678-0249c46602f5 is starting in ACTIVE mode\n", + "2023-02-01 11:21:53,624 - nvflare.fuel.f3.communicator - INFO - Communicator is started for local endpoint: site-1\n", + "2023-02-01 11:21:53,836 - ClientManager - INFO - Client: New client site-1@192.168.0.17 joined. Sent token: 9cfea89f-98ea-44ef-bd44-767416be09ca. Total clients: 1\n", + "2023-02-01 11:21:53,840 - FederatedClient - INFO - Successfully registered client:site-1 for project simulator_server. Token:9cfea89f-98ea-44ef-bd44-767416be09ca SSID:\n", + "2023-02-01 11:21:53,857 - Cell - INFO - site-2: created backbone external connector to grpc://localhost:53913\n", + "2023-02-01 11:21:53,860 - ConnectorManager - INFO - 349396: Try start_listener Listener resources: {'secure': False, 'host': 'localhost', 'ports': ['30000-40000']}\n", + "2023-02-01 11:21:53,864 - SimulatorServer - INFO - These jobs: simulate_job are not running on the server. 
Ask client: site-1 to abort these runs.\n", + "2023-02-01 11:21:53,866 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector TcpDriver:704de96b-aa2a-497c-a46e-ffcf2ee0b2a2 is starting in PASSIVE mode\n", + "2023-02-01 11:21:53,871 - Communicator - INFO - Failed to clean up the runs: simulate_job\n", + "2023-02-01 11:21:54,374 - Cell - INFO - site-2: created backbone internal listener for tcp://localhost:37420\n", + "2023-02-01 11:21:54,377 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector AioGrpcDriver:68cd467d-5eee-4d28-a6b1-8864e0e39b27 is starting in ACTIVE mode\n", + "2023-02-01 11:21:54,380 - nvflare.fuel.f3.communicator - INFO - Communicator is started for local endpoint: site-2\n", + "2023-02-01 11:21:54,588 - ClientManager - INFO - Client: New client site-2@192.168.0.17 joined. Sent token: c5ac4cf9-6cbf-417e-a49e-b2bdbe7e1e58. Total clients: 2\n", + "2023-02-01 11:21:54,592 - FederatedClient - INFO - Successfully registered client:site-2 for project simulator_server. Token:c5ac4cf9-6cbf-417e-a49e-b2bdbe7e1e58 SSID:\n", + "2023-02-01 11:21:54,596 - SimulatorRunner - INFO - Set the client status ready.\n", + "2023-02-01 11:21:54,599 - SimulatorRunner - INFO - Deploy and start the Server App.\n", + "2023-02-01 11:21:54,613 - Cell - INFO - server.simulate_job: created backbone internal connector to tcp://localhost:30342 on parent\n", + "2023-02-01 11:21:54,615 - ConnectorManager - INFO - 349396: Try start_listener Listener resources: {'secure': False, 'host': 'localhost', 'ports': ['30000-40000']}\n", + "2023-02-01 11:21:54,617 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector TcpDriver:67bc3ef9-8c7e-49cf-b3e4-62e54ad83e05 is starting in PASSIVE mode\n", + "2023-02-01 11:21:55,120 - Cell - INFO - server.simulate_job: created backbone internal listener for tcp://localhost:34253\n", + "2023-02-01 11:21:55,123 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector TcpDriver:05725269-633d-4de6-bc63-0cd8a331862e is starting in ACTIVE mode\n", + "2023-02-01 
11:21:55,126 - nvflare.fuel.f3.communicator - INFO - Communicator is started for local endpoint: server.simulate_job\n", + "2023-02-01 11:21:55,130 - ServerCommandAgent - INFO - ServerCommandAgent cell start: server.simulate_job\n", + "2023-02-01 11:21:56,238 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: Server runner starting ...\n", + "2023-02-01 11:21:56,241 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: starting workflow splitnn_ctl () ...\n", + "2023-02-01 11:21:56,245 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: Workflow splitnn_ctl () started\n", + "2023-02-01 11:21:56,248 - SplitNNController - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl]: scheduled task _splitnn_task_init_model_\n", + "2023-02-01 11:21:56,607 - SimulatorClientRunner - INFO - Start the clients run simulation.\n", + "2023-02-01 11:21:57,611 - SimulatorClientRunner - INFO - Simulate Run client: site-1\n", + "2023-02-01 11:21:57,613 - SimulatorClientRunner - INFO - Simulate Run client: site-2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "E0201 11:21:57.617844691 349556 fork_posix.cc:76] Other threads are currently calling into gRPC, skipping fork() handlers\n", + "E0201 11:21:57.634208997 349557 fork_posix.cc:76] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-02-01 11:21:58,720 - Cell - INFO - site-1.simulate_job: created backbone internal connector to tcp://localhost:39242 on parent\n", + "2023-02-01 11:21:58,721 - ConnectorManager - INFO - 349565: Try start_listener Listener resources: {'secure': False, 'host': 'localhost', 'ports': ['30000-40000']}\n", + "2023-02-01 11:21:58,721 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector TcpDriver:a736b5be-2f44-4c08-b49f-f12a5a5dc528 is starting in PASSIVE mode\n", + "2023-02-01 11:21:58,732 - Cell - INFO - 
site-2.simulate_job: created backbone internal connector to tcp://localhost:37420 on parent\n", + "2023-02-01 11:21:58,733 - ConnectorManager - INFO - 349566: Try start_listener Listener resources: {'secure': False, 'host': 'localhost', 'ports': ['30000-40000']}\n", + "2023-02-01 11:21:58,733 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector TcpDriver:0442591d-0aff-4908-8e48-3e800181502c is starting in PASSIVE mode\n", + "2023-02-01 11:21:59,222 - Cell - INFO - site-1.simulate_job: created backbone internal listener for tcp://localhost:32042\n", + "2023-02-01 11:21:59,223 - Cell - INFO - site-1.simulate_job: created backbone external connector to grpc://localhost:53913\n", + "2023-02-01 11:21:59,224 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector TcpDriver:cd137698-17ed-4c1e-8c90-b3c6ad7f605d is starting in ACTIVE mode\n", + "2023-02-01 11:21:59,224 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector AioGrpcDriver:013fa882-4e54-4cbf-96f3-ba4287b2befb is starting in ACTIVE mode\n", + "2023-02-01 11:21:59,226 - nvflare.fuel.f3.communicator - INFO - Communicator is started for local endpoint: site-1.simulate_job\n", + "2023-02-01 11:21:59,234 - Cell - INFO - site-2.simulate_job: created backbone internal listener for tcp://localhost:36124\n", + "2023-02-01 11:21:59,235 - Cell - INFO - site-2.simulate_job: created backbone external connector to grpc://localhost:53913\n", + "2023-02-01 11:21:59,236 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector TcpDriver:5cca3def-0a92-4b90-9e29-cb19ba8126a8 is starting in ACTIVE mode\n", + "2023-02-01 11:21:59,236 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector AioGrpcDriver:c29240f2-00e7-4624-8b6e-e0f5d6f42088 is starting in ACTIVE mode\n", + "2023-02-01 11:21:59,237 - nvflare.fuel.f3.communicator - INFO - Communicator is started for local endpoint: site-2.simulate_job\n", + "2023-02-01 11:22:00,345 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Running model SplitNN(\n", + " 
(conv_layer): Sequential(\n", + " (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): ReLU(inplace=True)\n", + " (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (3): ReLU(inplace=True)\n", + " (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (6): ReLU(inplace=True)\n", + " (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (8): ReLU(inplace=True)\n", + " (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (10): Dropout2d(p=0.05, inplace=False)\n", + " (11): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (12): ReLU(inplace=True)\n", + " (13): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (14): ReLU(inplace=True)\n", + " (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (16): Flatten(start_dim=1, end_dim=-1)\n", + " )\n", + " (fc_layer): Sequential(\n", + " (0): Dropout(p=0.1, inplace=False)\n", + " (1): Linear(in_features=4096, out_features=512, bias=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Linear(in_features=512, out_features=512, bias=True)\n", + " (4): ReLU(inplace=True)\n", + " (5): Dropout(p=0.1, inplace=False)\n", + " (6): Linear(in_features=512, out_features=10, bias=True)\n", + " )\n", + " (split_forward): Sequential(\n", + " (0): Dropout(p=0.1, inplace=False)\n", + " (1): Linear(in_features=4096, out_features=512, bias=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Linear(in_features=512, out_features=512, bias=True)\n", + " (4): ReLU(inplace=True)\n", + " (5): Dropout(p=0.1, inplace=False)\n", + " (6): Linear(in_features=512, out_features=10, bias=True)\n", + " )\n", + ")\n", + "2023-02-01 11:22:00,373 - CIFAR10LearnerSplitNN - INFO - [identity=site-1, run=simulate_job]: Running model SplitNN(\n", + " 
(conv_layer): Sequential(\n", + " (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): ReLU(inplace=True)\n", + " (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (3): ReLU(inplace=True)\n", + " (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (6): ReLU(inplace=True)\n", + " (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (8): ReLU(inplace=True)\n", + " (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (10): Dropout2d(p=0.05, inplace=False)\n", + " (11): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (12): ReLU(inplace=True)\n", + " (13): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (14): ReLU(inplace=True)\n", + " (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (16): Flatten(start_dim=1, end_dim=-1)\n", + " )\n", + " (fc_layer): Sequential(\n", + " (0): Dropout(p=0.1, inplace=False)\n", + " (1): Linear(in_features=4096, out_features=512, bias=True)\n", + " (2): ReLU(inplace=True)\n", + " (3): Linear(in_features=512, out_features=512, bias=True)\n", + " (4): ReLU(inplace=True)\n", + " (5): Dropout(p=0.1, inplace=False)\n", + " (6): Linear(in_features=512, out_features=10, bias=True)\n", + " )\n", + " (split_forward): Sequential(\n", + " (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (1): ReLU(inplace=True)\n", + " (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (3): ReLU(inplace=True)\n", + " (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (6): ReLU(inplace=True)\n", + " (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 
1))\n", + " (8): ReLU(inplace=True)\n", + " (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (10): Dropout2d(p=0.05, inplace=False)\n", + " (11): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (12): ReLU(inplace=True)\n", + " (13): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (14): ReLU(inplace=True)\n", + " (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (16): Flatten(start_dim=1, end_dim=-1)\n", + " )\n", + ")\n", + "2023-02-01 11:22:01,710 - CIFAR10LearnerSplitNN - INFO - [identity=site-1, run=simulate_job]: Running `split_id` 0 on site `site-1`\n", + "2023-02-01 11:22:01,712 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Running `split_id` 1 on site `site-2`\n", + "2023-02-01 11:22:02,241 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: assigned task to client site-1: name=_splitnn_task_init_model_, id=7da31d83-51a0-4b91-bc33-77e3b726459a\n", + "2023-02-01 11:22:02,244 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: assigned task to client site-2: name=_splitnn_task_init_model_, id=d111d39b-e3d5-47b6-900e-140ca47ea1cf\n", + "2023-02-01 11:22:02,246 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: sent task assignment to client\n", + "2023-02-01 11:22:02,248 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job, task_name=_splitnn_task_init_model_, 
task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: sent task assignment to client\n", + "2023-02-01 11:22:02,214 - CIFAR10LearnerSplitNN - INFO - [identity=site-1, run=simulate_job]: Training with 10000 overlapping indices of 50000.\n", + "2023-02-01 11:22:02,214 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: client runner started\n", + "2023-02-01 11:22:02,215 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-1\n", + "2023-02-01 11:22:02,219 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Training with 10000 overlapping indices of 50000.\n", + "2023-02-01 11:22:02,221 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: client runner started\n", + "2023-02-01 11:22:02,221 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-2\n", + "2023-02-01 11:22:02,926 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job]: got result from client site-1 for task: name=_splitnn_task_init_model_, id=d111d39b-e3d5-47b6-900e-140ca47ea1cf\n", + "2023-02-01 11:22:02,932 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job]: got result from client site-2 for task: name=_splitnn_task_init_model_, id=d111d39b-e3d5-47b6-900e-140ca47ea1cf\n", + "2023-02-01 11:22:02,936 - SplitNNController - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: invoking result_received_cb ...\n", + "2023-02-01 11:22:02,944 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: finished processing client result by splitnn_ctl\n", + "2023-02-01 11:22:02,946 - SplitNNController - INFO - 
[identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: invoking result_received_cb ...\n", + "2023-02-01 11:22:02,948 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: finished processing client result by splitnn_ctl\n", + "2023-02-01 11:22:02,866 - Communicator - INFO - Received from simulator_server server (13970184 Bytes). getTask: _splitnn_task_init_model_ time: 0.6140961647033691 seconds\n", + "2023-02-01 11:22:02,867 - FederatedClient - INFO - pull_task completed. Task name:_splitnn_task_init_model_ Status:True \n", + "2023-02-01 11:22:02,868 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=_splitnn_task_init_model_, id=d111d39b-e3d5-47b6-900e-140ca47ea1cf\n", + "2023-02-01 11:22:02,868 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: invoking task executor \n", + "2023-02-01 11:22:02,869 - SplitNNLearnerExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: Client trainer got task: _splitnn_task_init_model_\n", + "2023-02-01 11:22:02,869 - SplitNNLearnerExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: Executing task _splitnn_task_init_model_...\n", + "2023-02-01 11:22:02,869 - SplitNNLearnerExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, 
peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: Initializing model...\n", + "2023-02-01 11:22:02,874 - Communicator - INFO - Received from simulator_server server (13970184 Bytes). getTask: _splitnn_task_init_model_ time: 0.6256439685821533 seconds\n", + "2023-02-01 11:22:02,875 - FederatedClient - INFO - pull_task completed. Task name:_splitnn_task_init_model_ Status:True \n", + "2023-02-01 11:22:02,876 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=_splitnn_task_init_model_, id=d111d39b-e3d5-47b6-900e-140ca47ea1cf\n", + "2023-02-01 11:22:02,877 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: invoking task executor \n", + "2023-02-01 11:22:02,877 - CIFAR10LearnerSplitNN - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: init_model finished.\n", + "2023-02-01 11:22:02,877 - SplitNNLearnerExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: Client trainer got task: _splitnn_task_init_model_\n", + "2023-02-01 11:22:02,877 - SplitNNLearnerExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: Executing task _splitnn_task_init_model_...\n", + "2023-02-01 11:22:02,877 - SplitNNLearnerExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: Initializing model...\n", + "2023-02-01 
11:22:02,877 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: finished processing task\n", + "2023-02-01 11:22:02,879 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-01 11:22:02,884 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: init_model finished.\n", + "2023-02-01 11:22:02,885 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: finished processing task\n", + "2023-02-01 11:22:02,887 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-01 11:22:02,948 - Communicator - INFO - SubmitUpdate time: 0.06910395622253418 seconds\n", + "2023-02-01 11:22:02,950 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: result sent to server for task: name=_splitnn_task_init_model_, id=d111d39b-e3d5-47b6-900e-140ca47ea1cf\n", + "2023-02-01 11:22:02,950 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-01 11:22:02,991 - Communicator - INFO - SubmitUpdate time: 0.10436272621154785 seconds\n", + "2023-02-01 11:22:02,993 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=d111d39b-e3d5-47b6-900e-140ca47ea1cf]: result sent to server for task: name=_splitnn_task_init_model_, id=d111d39b-e3d5-47b6-900e-140ca47ea1cf\n", + "2023-02-01 11:22:02,993 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-01 11:22:04,958 - 
ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: assigned task to client site-1: name=_splitnn_task_init_model_, id=7da31d83-51a0-4b91-bc33-77e3b726459a\n", + "2023-02-01 11:22:04,962 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: sent task assignment to client\n", + "2023-02-01 11:22:05,278 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job]: got result from client site-1 for task: name=_splitnn_task_init_model_, id=7da31d83-51a0-4b91-bc33-77e3b726459a\n", + "2023-02-01 11:22:05,286 - SplitNNController - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: invoking result_received_cb ...\n", + "2023-02-01 11:22:05,288 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, peer_rc=OK, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: finished processing client result by splitnn_ctl\n", + "2023-02-01 11:22:05,237 - Communicator - INFO - Received from simulator_server server (13970184 Bytes). getTask: _splitnn_task_init_model_ time: 0.25359487533569336 seconds\n", + "2023-02-01 11:22:05,238 - FederatedClient - INFO - pull_task completed. 
Task name:_splitnn_task_init_model_ Status:True \n", + "2023-02-01 11:22:05,239 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=_splitnn_task_init_model_, id=7da31d83-51a0-4b91-bc33-77e3b726459a\n", + "2023-02-01 11:22:05,239 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: invoking task executor \n", + "2023-02-01 11:22:05,240 - SplitNNLearnerExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: Client trainer got task: _splitnn_task_init_model_\n", + "2023-02-01 11:22:05,240 - SplitNNLearnerExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: Executing task _splitnn_task_init_model_...\n", + "2023-02-01 11:22:05,240 - SplitNNLearnerExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: Initializing model...\n", + "2023-02-01 11:22:05,247 - CIFAR10LearnerSplitNN - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: init_model finished.\n", + "2023-02-01 11:22:05,248 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: finished processing task\n", + "2023-02-01 11:22:05,250 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-01 11:22:05,293 - Communicator - INFO - SubmitUpdate time: 
0.042684078216552734 seconds\n", + "2023-02-01 11:22:05,294 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_init_model_, task_id=7da31d83-51a0-4b91-bc33-77e3b726459a]: result sent to server for task: name=_splitnn_task_init_model_, id=7da31d83-51a0-4b91-bc33-77e3b726459a\n", + "2023-02-01 11:22:05,295 - ClientTaskWorker - INFO - Finished one task run for client: site-1\n", + "2023-02-01 11:22:05,775 - SplitNNController - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl]: task _splitnn_task_init_model_ exit with status TaskCompletionStatus.OK\n", + "2023-02-01 11:22:06,276 - SplitNNController - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl]: scheduled task _splitnn_task_train_\n", + "2023-02-01 11:22:07,302 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=4f9e7bac-73f3-4853-a4c8-17155cc9131c]: assigned task to client site-1: name=_splitnn_task_train_, id=4f9e7bac-73f3-4853-a4c8-17155cc9131c\n", + "2023-02-01 11:22:07,304 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-1, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=4f9e7bac-73f3-4853-a4c8-17155cc9131c]: sent task assignment to client\n", + "2023-02-01 11:22:07,360 - Communicator - INFO - Received from simulator_server server (596 Bytes). getTask: _splitnn_task_train_ time: 0.06227231025695801 seconds\n", + "2023-02-01 11:22:07,362 - FederatedClient - INFO - pull_task completed. 
Task name:_splitnn_task_train_ Status:True \n", + "2023-02-01 11:22:07,362 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=_splitnn_task_train_, id=4f9e7bac-73f3-4853-a4c8-17155cc9131c\n", + "2023-02-01 11:22:07,363 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=4f9e7bac-73f3-4853-a4c8-17155cc9131c]: invoking task executor \n", + "2023-02-01 11:22:07,363 - SplitNNLearnerExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=4f9e7bac-73f3-4853-a4c8-17155cc9131c]: Client trainer got task: _splitnn_task_train_\n", + "2023-02-01 11:22:07,364 - SplitNNLearnerExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=4f9e7bac-73f3-4853-a4c8-17155cc9131c]: Executing task _splitnn_task_train_...\n", + "2023-02-01 11:22:07,364 - SplitNNLearnerExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=4f9e7bac-73f3-4853-a4c8-17155cc9131c]: Running training...\n", + "2023-02-01 11:22:07,364 - CIFAR10LearnerSplitNN - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=4f9e7bac-73f3-4853-a4c8-17155cc9131c]: Starting training of 15625 rounds with batch size 64\n", + "2023-02-01 11:22:08,570 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 0/15625 train_loss: 2.2996, accuracy: 0.1875\n", + "2023-02-01 11:22:08,736 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 1/15625 train_loss: 2.2998, accuracy: 0.1250\n", + "2023-02-01 11:22:08,833 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 2/15625 
train_loss: 2.3019, accuracy: 0.0938\n", + "2023-02-01 11:22:08,928 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 3/15625 train_loss: 2.2990, accuracy: 0.1250\n", + "2023-02-01 11:22:09,005 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 4/15625 train_loss: 2.3012, accuracy: 0.1719\n", + "2023-02-01 11:22:09,104 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 5/15625 train_loss: 2.3051, accuracy: 0.0938\n", + "2023-02-01 11:22:09,183 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 6/15625 train_loss: 2.3009, accuracy: 0.0781\n", + "2023-02-01 11:22:09,258 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 7/15625 train_loss: 2.2994, accuracy: 0.1406\n", + "2023-02-01 11:22:09,338 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 8/15625 train_loss: 2.3051, accuracy: 0.0938\n", + "2023-02-01 11:22:09,419 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 9/15625 train_loss: 2.3072, accuracy: 0.1094\n", + "2023-02-01 11:22:09,515 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 10/15625 train_loss: 2.3032, accuracy: 0.1406\n", + "2023-02-01 11:22:09,611 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 11/15625 train_loss: 2.2972, accuracy: 0.1719\n", + "2023-02-01 11:22:09,711 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 12/15625 train_loss: 2.3047, accuracy: 0.0625\n", + "2023-02-01 11:22:09,815 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 13/15625 train_loss: 2.3006, accuracy: 0.0781\n", + "2023-02-01 11:22:09,899 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 14/15625 train_loss: 2.3106, accuracy: 0.0781\n", + "2023-02-01 11:22:09,981 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 15/15625 train_loss: 
2.2925, accuracy: 0.1562\n", + "2023-02-01 11:22:10,095 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 16/15625 train_loss: 2.3110, accuracy: 0.0469\n", + "2023-02-01 11:22:10,192 - Communicator - INFO - Received from simulator_server server (324 Bytes). getTask: __try_again__ time: 0.19057416915893555 seconds\n", + "2023-02-01 11:22:10,195 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-01 11:22:10,205 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 17/15625 train_loss: 2.3002, accuracy: 0.0781\n", + "2023-02-01 11:22:10,305 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 18/15625 train_loss: 2.3045, accuracy: 0.0469\n", + "2023-02-01 11:22:10,383 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 19/15625 train_loss: 2.3049, accuracy: 0.0781\n", + "2023-02-01 11:22:10,463 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 20/15625 train_loss: 2.3010, accuracy: 0.1094\n", + "2023-02-01 11:22:10,571 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 21/15625 train_loss: 2.2996, accuracy: 0.0781\n", + "2023-02-01 11:22:10,680 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 22/15625 train_loss: 2.3045, accuracy: 0.0625\n", + "2023-02-01 11:22:10,786 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 23/15625 train_loss: 2.2984, accuracy: 0.0938\n", + "2023-02-01 11:22:10,886 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 24/15625 train_loss: 2.2991, accuracy: 0.1406\n", + "2023-02-01 11:22:10,982 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 25/15625 train_loss: 2.3000, accuracy: 0.1406\n", + "2023-02-01 11:22:11,083 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 26/15625 train_loss: 2.3016, accuracy: 0.0781\n", + "2023-02-01 
11:22:11,160 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 27/15625 train_loss: 2.3033, accuracy: 0.1406\n", + "2023-02-01 11:22:11,238 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 28/15625 train_loss: 2.3003, accuracy: 0.1094\n", + "2023-02-01 11:22:11,333 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 29/15625 train_loss: 2.3048, accuracy: 0.0938\n", + "2023-02-01 11:22:11,412 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 30/15625 train_loss: 2.3044, accuracy: 0.0781\n", + "2023-02-01 11:22:11,488 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 31/15625 train_loss: 2.3041, accuracy: 0.0938\n", + "2023-02-01 11:22:11,571 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 32/15625 train_loss: 2.3091, accuracy: 0.0625\n", + "2023-02-01 11:22:11,652 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 33/15625 train_loss: 2.3038, accuracy: 0.0625\n", + "2023-02-01 11:22:11,730 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 34/15625 train_loss: 2.2982, accuracy: 0.0938\n", + "2023-02-01 11:22:11,809 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 35/15625 train_loss: 2.3044, accuracy: 0.0312\n", + "2023-02-01 11:22:11,884 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 36/15625 train_loss: 2.2980, accuracy: 0.1562\n", + "2023-02-01 11:22:11,964 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 37/15625 train_loss: 2.3053, accuracy: 0.0938\n", + "2023-02-01 11:22:12,202 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: assigned task to client site-2: name=_splitnn_task_train_, id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a\n", 
+ "2023-02-01 11:22:12,204 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: sent task assignment to client\n", + "2023-02-01 11:22:12,042 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 38/15625 train_loss: 2.3017, accuracy: 0.1094\n", + "2023-02-01 11:22:12,131 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 39/15625 train_loss: 2.2991, accuracy: 0.0625\n", + "2023-02-01 11:22:12,262 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job]: got result from client site-2 for task: name=_splitnn_task_train_, id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a\n", + "2023-02-01 11:22:12,265 - SplitNNController - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: invoking result_received_cb ...\n", + "2023-02-01 11:22:12,269 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=splitnn_ctl, peer=site-2, peer_run=simulate_job, peer_rc=OK, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: finished processing client result by splitnn_ctl\n", + "2023-02-01 11:22:12,246 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 40/15625 train_loss: 2.2980, accuracy: 0.0938\n", + "2023-02-01 11:22:12,252 - Communicator - INFO - Received from simulator_server server (596 Bytes). getTask: _splitnn_task_train_ time: 0.052910566329956055 seconds\n", + "2023-02-01 11:22:12,253 - FederatedClient - INFO - pull_task completed. 
Task name:_splitnn_task_train_ Status:True \n", + "2023-02-01 11:22:12,254 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=_splitnn_task_train_, id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a\n", + "2023-02-01 11:22:12,254 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: invoking task executor \n", + "2023-02-01 11:22:12,254 - SplitNNLearnerExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: Client trainer got task: _splitnn_task_train_\n", + "2023-02-01 11:22:12,254 - SplitNNLearnerExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: Executing task _splitnn_task_train_...\n", + "2023-02-01 11:22:12,254 - SplitNNLearnerExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: Running training...\n", + "2023-02-01 11:22:12,254 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: Starting training of 15625 rounds with batch size 64\n", + "2023-02-01 11:22:12,257 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: finished processing task\n", + "2023-02-01 11:22:12,260 - FederatedClient - INFO - Starting to push execute result.\n", + "2023-02-01 11:22:12,272 - Communicator - INFO - SubmitUpdate time: 0.011861801147460938 
seconds\n", + "2023-02-01 11:22:12,272 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=_splitnn_task_train_, task_id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a]: result sent to server for task: name=_splitnn_task_train_, id=c2bb97f2-4f43-445f-898a-75f2a4ddd69a\n", + "2023-02-01 11:22:12,273 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-01 11:22:12,334 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 41/15625 train_loss: 2.2961, accuracy: 0.0938\n", + "2023-02-01 11:22:12,428 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 42/15625 train_loss: 2.2986, accuracy: 0.1250\n", + "2023-02-01 11:22:12,505 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 43/15625 train_loss: 2.2979, accuracy: 0.1250\n", + "2023-02-01 11:22:12,582 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 44/15625 train_loss: 2.3029, accuracy: 0.1719\n", + "2023-02-01 11:22:12,656 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 45/15625 train_loss: 2.3036, accuracy: 0.0781\n", + "2023-02-01 11:22:12,736 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 46/15625 train_loss: 2.2970, accuracy: 0.1562\n", + "2023-02-01 11:22:12,808 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 47/15625 train_loss: 2.3106, accuracy: 0.1250\n", + "2023-02-01 11:22:12,884 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 48/15625 train_loss: 2.3053, accuracy: 0.0781\n", + "2023-02-01 11:22:12,987 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 49/15625 train_loss: 2.3070, accuracy: 0.0781\n", + "2023-02-01 11:22:13,061 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 50/15625 train_loss: 2.3063, accuracy: 0.1406\n", + "2023-02-01 11:22:13,153 - 
CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 51/15625 train_loss: 2.3036, accuracy: 0.1562\n", + "2023-02-01 11:22:13,229 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 52/15625 train_loss: 2.3036, accuracy: 0.1250\n", + "2023-02-01 11:22:13,303 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 53/15625 train_loss: 2.3077, accuracy: 0.0469\n", + "2023-02-01 11:22:13,400 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 54/15625 train_loss: 2.2983, accuracy: 0.1094\n", + "2023-02-01 11:22:13,495 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 55/15625 train_loss: 2.2989, accuracy: 0.1562\n", + "2023-02-01 11:22:13,594 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 56/15625 train_loss: 2.3015, accuracy: 0.1094\n", + "2023-02-01 11:22:13,692 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 57/15625 train_loss: 2.3034, accuracy: 0.1094\n", + "2023-02-01 11:22:13,794 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 58/15625 train_loss: 2.2955, accuracy: 0.1250\n", + "2023-02-01 11:22:13,892 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 59/15625 train_loss: 2.2970, accuracy: 0.1250\n", + "2023-02-01 11:22:13,992 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 60/15625 train_loss: 2.3020, accuracy: 0.0938\n", + "2023-02-01 11:22:14,095 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 61/15625 train_loss: 2.2882, accuracy: 0.1719\n", + "2023-02-01 11:22:14,172 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 62/15625 train_loss: 2.3037, accuracy: 0.1406\n", + "2023-02-01 11:22:14,248 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 63/15625 train_loss: 2.3068, accuracy: 0.0625\n", + "2023-02-01 11:22:14,372 - 
CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 64/15625 train_loss: 2.3007, accuracy: 0.0781\n", + "2023-02-01 11:22:14,452 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 65/15625 train_loss: 2.2938, accuracy: 0.0625\n", + "2023-02-01 11:22:14,532 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 66/15625 train_loss: 2.2961, accuracy: 0.1562\n", + "2023-02-01 11:22:14,609 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 67/15625 train_loss: 2.3021, accuracy: 0.1094\n", + "2023-02-01 11:22:14,717 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 68/15625 train_loss: 2.3022, accuracy: 0.1250\n", + "2023-02-01 11:22:14,792 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 69/15625 train_loss: 2.3062, accuracy: 0.0781\n", + "2023-02-01 11:22:14,889 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 70/15625 train_loss: 2.3065, accuracy: 0.0781\n", + "2023-02-01 11:22:14,966 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 71/15625 train_loss: 2.3020, accuracy: 0.0781\n", + "2023-02-01 11:22:15,044 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 72/15625 train_loss: 2.3042, accuracy: 0.0469\n", + "2023-02-01 11:22:15,138 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 73/15625 train_loss: 2.3018, accuracy: 0.0156\n", + "2023-02-01 11:22:15,237 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 74/15625 train_loss: 2.3041, accuracy: 0.0625\n", + "2023-02-01 11:22:15,337 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 75/15625 train_loss: 2.3067, accuracy: 0.0469\n", + "2023-02-01 11:22:15,435 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 76/15625 train_loss: 2.2976, accuracy: 0.1250\n", + "2023-02-01 11:22:15,519 - 
CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 77/15625 train_loss: 2.3122, accuracy: 0.1406\n", + "2023-02-01 11:22:15,613 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 78/15625 train_loss: 2.2983, accuracy: 0.1250\n", + "2023-02-01 11:22:15,701 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 79/15625 train_loss: 2.3053, accuracy: 0.0938\n", + "2023-02-01 11:22:15,794 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 80/15625 train_loss: 2.3010, accuracy: 0.1250\n", + "2023-02-01 11:22:15,889 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 81/15625 train_loss: 2.3150, accuracy: 0.0469\n", + "2023-02-01 11:22:15,971 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 82/15625 train_loss: 2.2924, accuracy: 0.1406\n", + "2023-02-01 11:22:16,076 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 83/15625 train_loss: 2.3046, accuracy: 0.0938\n", + "2023-02-01 11:22:16,169 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 84/15625 train_loss: 2.3020, accuracy: 0.0000\n", + "2023-02-01 11:22:16,261 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 85/15625 train_loss: 2.3053, accuracy: 0.1406\n", + "2023-02-01 11:22:16,361 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 86/15625 train_loss: 2.3055, accuracy: 0.1094\n", + "2023-02-01 11:22:16,471 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 87/15625 train_loss: 2.3040, accuracy: 0.0625\n", + "2023-02-01 11:22:16,571 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 88/15625 train_loss: 2.2987, accuracy: 0.0938\n", + "2023-02-01 11:22:16,683 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 89/15625 train_loss: 2.3049, accuracy: 0.0781\n", + "2023-02-01 11:22:16,764 - 
CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 90/15625 train_loss: 2.3101, accuracy: 0.0625\n", + "2023-02-01 11:22:16,872 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 91/15625 train_loss: 2.3018, accuracy: 0.0469\n", + "2023-02-01 11:22:16,969 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 92/15625 train_loss: 2.3042, accuracy: 0.1250\n", + "2023-02-01 11:22:17,046 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 93/15625 train_loss: 2.3018, accuracy: 0.1406\n", + "2023-02-01 11:22:17,149 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 94/15625 train_loss: 2.3073, accuracy: 0.1094\n", + "2023-02-01 11:22:17,228 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 95/15625 train_loss: 2.3033, accuracy: 0.0625\n", + "2023-02-01 11:22:17,327 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 96/15625 train_loss: 2.3001, accuracy: 0.1094\n", + "2023-02-01 11:22:17,403 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 97/15625 train_loss: 2.2952, accuracy: 0.1875\n", + "2023-02-01 11:22:17,482 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 98/15625 train_loss: 2.2964, accuracy: 0.1406\n", + "2023-02-01 11:22:17,582 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 99/15625 train_loss: 2.3014, accuracy: 0.0781\n", + "2023-02-01 11:22:17,681 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 100/15625 train_loss: 2.3060, accuracy: 0.0781\n", + "2023-02-01 11:22:17,785 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 101/15625 train_loss: 2.3032, accuracy: 0.1250\n", + "2023-02-01 11:22:17,885 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 102/15625 train_loss: 2.3023, accuracy: 0.0312\n", + "2023-02-01 11:22:17,961 - 
CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 103/15625 train_loss: 2.3069, accuracy: 0.1562\n", + "2023-02-01 11:22:18,063 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 104/15625 train_loss: 2.3017, accuracy: 0.2188\n", + "2023-02-01 11:22:18,141 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 105/15625 train_loss: 2.3066, accuracy: 0.0469\n", + "2023-02-01 11:22:18,226 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 106/15625 train_loss: 2.2929, accuracy: 0.2031\n", + "2023-02-01 11:22:18,311 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 107/15625 train_loss: 2.3073, accuracy: 0.0938\n", + "2023-02-01 11:22:18,399 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 108/15625 train_loss: 2.2993, accuracy: 0.0781\n", + "2023-02-01 11:22:18,483 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 109/15625 train_loss: 2.3017, accuracy: 0.1562\n", + "2023-02-01 11:22:18,566 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 110/15625 train_loss: 2.3028, accuracy: 0.0938\n", + "2023-02-01 11:22:18,649 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 111/15625 train_loss: 2.3011, accuracy: 0.1094\n", + "2023-02-01 11:22:18,749 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 112/15625 train_loss: 2.3078, accuracy: 0.0469\n", + "2023-02-01 11:22:18,832 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 113/15625 train_loss: 2.2930, accuracy: 0.1562\n", + "2023-02-01 11:22:18,935 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 114/15625 train_loss: 2.3011, accuracy: 0.1406\n", + "2023-02-01 11:22:19,038 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 115/15625 train_loss: 2.3009, accuracy: 0.0781\n", + "2023-02-01 11:22:19,132 
- CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 116/15625 train_loss: 2.3050, accuracy: 0.0625\n", + "2023-02-01 11:22:19,223 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 117/15625 train_loss: 2.2997, accuracy: 0.1250\n", + "2023-02-01 11:22:19,311 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 118/15625 train_loss: 2.3054, accuracy: 0.1094\n", + "2023-02-01 11:22:19,328 - Communicator - INFO - Received from simulator_server server (324 Bytes). getTask: __try_again__ time: 0.04789376258850098 seconds\n", + "2023-02-01 11:22:19,329 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-01 11:22:19,401 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 119/15625 train_loss: 2.2952, accuracy: 0.1562\n", + "2023-02-01 11:22:19,499 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 120/15625 train_loss: 2.3067, accuracy: 0.1094\n", + "2023-02-01 11:22:19,609 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 121/15625 train_loss: 2.3085, accuracy: 0.0781\n", + "2023-02-01 11:22:19,702 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 122/15625 train_loss: 2.2970, accuracy: 0.1250\n", + "2023-02-01 11:22:19,784 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 123/15625 train_loss: 2.3181, accuracy: 0.0781\n", + "2023-02-01 11:22:19,876 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 124/15625 train_loss: 2.3091, accuracy: 0.0625\n", + "2023-02-01 11:22:19,981 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 125/15625 train_loss: 2.3071, accuracy: 0.0781\n", + "2023-02-01 11:22:20,082 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 126/15625 train_loss: 2.3018, accuracy: 0.1094\n", + "2023-02-01 11:22:20,180 - CIFAR10LearnerSplitNN - INFO - 
[identity=site-2, run=simulate_job]: Round 127/15625 train_loss: 2.3071, accuracy: 0.0781\n", + "2023-02-01 11:22:20,284 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 128/15625 train_loss: 2.3054, accuracy: 0.1094\n", + "2023-02-01 11:22:20,374 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 129/15625 train_loss: 2.3061, accuracy: 0.0781\n", + "2023-02-01 11:22:20,457 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 130/15625 train_loss: 2.2973, accuracy: 0.1406\n", + "2023-02-01 11:22:20,540 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 131/15625 train_loss: 2.2987, accuracy: 0.1094\n", + "2023-02-01 11:22:20,649 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 132/15625 train_loss: 2.3023, accuracy: 0.0938\n", + "2023-02-01 11:22:20,742 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 133/15625 train_loss: 2.3058, accuracy: 0.0625\n", + "2023-02-01 11:22:20,851 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 134/15625 train_loss: 2.3002, accuracy: 0.1562\n", + "2023-02-01 11:22:20,943 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 135/15625 train_loss: 2.3069, accuracy: 0.0625\n", + "2023-02-01 11:22:21,046 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 136/15625 train_loss: 2.3056, accuracy: 0.1250\n", + "2023-02-01 11:22:21,141 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 137/15625 train_loss: 2.3036, accuracy: 0.0938\n", + "2023-02-01 11:22:21,236 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 138/15625 train_loss: 2.3015, accuracy: 0.1875\n", + "2023-02-01 11:22:21,337 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 139/15625 train_loss: 2.3031, accuracy: 0.1094\n", + "2023-02-01 11:22:21,439 - CIFAR10LearnerSplitNN - INFO 
- [identity=site-2, run=simulate_job]: Round 140/15625 train_loss: 2.2963, accuracy: 0.1562\n", + "2023-02-01 11:22:21,535 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 141/15625 train_loss: 2.3019, accuracy: 0.1719\n", + "2023-02-01 11:22:21,628 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 142/15625 train_loss: 2.2987, accuracy: 0.1094\n", + "2023-02-01 11:22:21,713 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 143/15625 train_loss: 2.3062, accuracy: 0.0781\n", + "2023-02-01 11:22:21,818 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 144/15625 train_loss: 2.2983, accuracy: 0.1250\n", + "2023-02-01 11:22:21,917 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 145/15625 train_loss: 2.3010, accuracy: 0.0781\n", + "2023-02-01 11:22:22,000 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 146/15625 train_loss: 2.3068, accuracy: 0.0938\n", + "2023-02-01 11:22:22,095 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 147/15625 train_loss: 2.2978, accuracy: 0.1562\n", + "2023-02-01 11:22:22,187 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 148/15625 train_loss: 2.2977, accuracy: 0.0938\n", + "2023-02-01 11:22:22,280 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 149/15625 train_loss: 2.3014, accuracy: 0.0781\n", + "2023-02-01 11:22:22,367 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 150/15625 train_loss: 2.3081, accuracy: 0.0781\n", + "2023-02-01 11:22:22,453 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 151/15625 train_loss: 2.2975, accuracy: 0.1250\n", + "2023-02-01 11:22:22,553 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 152/15625 train_loss: 2.2952, accuracy: 0.1406\n", + "2023-02-01 11:22:22,654 - CIFAR10LearnerSplitNN - 
INFO - [identity=site-2, run=simulate_job]: Round 153/15625 train_loss: 2.2929, accuracy: 0.1562\n", + "2023-02-01 11:22:22,745 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 154/15625 train_loss: 2.3033, accuracy: 0.0781\n", + "2023-02-01 11:22:22,834 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 155/15625 train_loss: 2.3038, accuracy: 0.1094\n", + "2023-02-01 11:22:22,921 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 156/15625 train_loss: 2.3046, accuracy: 0.0469\n", + "2023-02-01 11:22:23,011 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 157/15625 train_loss: 2.3030, accuracy: 0.0625\n", + "2023-02-01 11:22:23,110 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 158/15625 train_loss: 2.2996, accuracy: 0.1094\n", + "2023-02-01 11:22:23,203 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 159/15625 train_loss: 2.2970, accuracy: 0.1250\n", + "2023-02-01 11:22:23,293 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 160/15625 train_loss: 2.3065, accuracy: 0.0781\n", + "2023-02-01 11:22:23,381 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 161/15625 train_loss: 2.3009, accuracy: 0.1250\n", + "2023-02-01 11:22:23,477 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 162/15625 train_loss: 2.3045, accuracy: 0.1094\n", + "2023-02-01 11:22:23,563 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 163/15625 train_loss: 2.3043, accuracy: 0.0938\n", + "2023-02-01 11:22:23,650 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 164/15625 train_loss: 2.3040, accuracy: 0.0781\n", + "2023-02-01 11:22:23,732 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 165/15625 train_loss: 2.2934, accuracy: 0.1250\n", + "2023-02-01 11:22:23,842 - CIFAR10LearnerSplitNN 
- INFO - [identity=site-2, run=simulate_job]: Round 166/15625 train_loss: 2.3110, accuracy: 0.0781\n", + "2023-02-01 11:22:23,923 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 167/15625 train_loss: 2.2961, accuracy: 0.1250\n", + "2023-02-01 11:22:24,024 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 168/15625 train_loss: 2.3064, accuracy: 0.1406\n", + "2023-02-01 11:22:24,109 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 169/15625 train_loss: 2.3006, accuracy: 0.1406\n", + "2023-02-01 11:22:24,210 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 170/15625 train_loss: 2.2940, accuracy: 0.1562\n", + "2023-02-01 11:22:24,314 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 171/15625 train_loss: 2.3051, accuracy: 0.0625\n", + "2023-02-01 11:22:24,411 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 172/15625 train_loss: 2.3010, accuracy: 0.1406\n", + "2023-02-01 11:22:24,514 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 173/15625 train_loss: 2.2952, accuracy: 0.1250\n", + "2023-02-01 11:22:24,611 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 174/15625 train_loss: 2.2911, accuracy: 0.1719\n", + "2023-02-01 11:22:24,693 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 175/15625 train_loss: 2.2912, accuracy: 0.1875\n", + "2023-02-01 11:22:24,784 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 176/15625 train_loss: 2.2949, accuracy: 0.1094\n", + "2023-02-01 11:22:24,867 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 177/15625 train_loss: 2.3014, accuracy: 0.1250\n", + "2023-02-01 11:22:24,957 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 178/15625 train_loss: 2.2996, accuracy: 0.0625\n", + "2023-02-01 11:22:25,052 - 
CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 179/15625 train_loss: 2.3001, accuracy: 0.0781\n", + "2023-02-01 11:22:25,152 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 180/15625 train_loss: 2.2922, accuracy: 0.1406\n", + "2023-02-01 11:22:25,238 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 181/15625 train_loss: 2.2993, accuracy: 0.1094\n", + "2023-02-01 11:22:25,332 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 182/15625 train_loss: 2.3014, accuracy: 0.1406\n", + "2023-02-01 11:22:25,421 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 183/15625 train_loss: 2.3001, accuracy: 0.0469\n", + "2023-02-01 11:22:25,517 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 184/15625 train_loss: 2.3116, accuracy: 0.0781\n", + "2023-02-01 11:22:25,618 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 185/15625 train_loss: 2.3049, accuracy: 0.1250\n", + "2023-02-01 11:22:25,708 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 186/15625 train_loss: 2.3094, accuracy: 0.0938\n", + "2023-02-01 11:22:25,798 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 187/15625 train_loss: 2.2993, accuracy: 0.1562\n", + "2023-02-01 11:22:25,917 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 188/15625 train_loss: 2.2967, accuracy: 0.1250\n", + "2023-02-01 11:22:26,019 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 189/15625 train_loss: 2.3031, accuracy: 0.0469\n", + "2023-02-01 11:22:26,128 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 190/15625 train_loss: 2.2907, accuracy: 0.1250\n", + "2023-02-01 11:22:26,225 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 191/15625 train_loss: 2.2981, accuracy: 0.0781\n", + "2023-02-01 11:22:26,329 
- CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 192/15625 train_loss: 2.2911, accuracy: 0.0938\n", + "2023-02-01 11:22:26,386 - Communicator - INFO - Received from simulator_server server (324 Bytes). getTask: __try_again__ time: 0.048476457595825195 seconds\n", + "2023-02-01 11:22:26,387 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-01 11:22:26,440 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 193/15625 train_loss: 2.3040, accuracy: 0.1250\n", + "2023-02-01 11:22:26,532 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 194/15625 train_loss: 2.3048, accuracy: 0.0469\n", + "2023-02-01 11:22:26,629 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 195/15625 train_loss: 2.3074, accuracy: 0.0938\n", + "2023-02-01 11:22:26,717 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 196/15625 train_loss: 2.3039, accuracy: 0.0312\n", + "2023-02-01 11:22:26,822 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 197/15625 train_loss: 2.3049, accuracy: 0.0469\n", + "2023-02-01 11:22:26,927 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 198/15625 train_loss: 2.3038, accuracy: 0.0625\n", + "2023-02-01 11:22:27,039 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 199/15625 train_loss: 2.2979, accuracy: 0.0938\n", + "2023-02-01 11:22:27,140 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 200/15625 train_loss: 2.2911, accuracy: 0.0938\n", + "2023-02-01 11:22:27,251 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 201/15625 train_loss: 2.3148, accuracy: 0.0625\n", + "2023-02-01 11:22:27,343 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 202/15625 train_loss: 2.3030, accuracy: 0.0938\n", + "2023-02-01 11:22:27,446 - CIFAR10LearnerSplitNN - INFO - 
[identity=site-2, run=simulate_job]: Round 203/15625 train_loss: 2.3068, accuracy: 0.0625\n", + "2023-02-01 11:22:27,548 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 204/15625 train_loss: 2.3046, accuracy: 0.0938\n", + "2023-02-01 11:22:27,645 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 205/15625 train_loss: 2.2976, accuracy: 0.0938\n", + "2023-02-01 11:22:27,742 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 206/15625 train_loss: 2.2878, accuracy: 0.1562\n", + "2023-02-01 11:22:27,840 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 207/15625 train_loss: 2.3047, accuracy: 0.0781\n", + "2023-02-01 11:22:27,926 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 208/15625 train_loss: 2.3088, accuracy: 0.0469\n", + "2023-02-01 11:22:28,019 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 209/15625 train_loss: 2.2965, accuracy: 0.0625\n", + "2023-02-01 11:22:28,114 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 210/15625 train_loss: 2.2894, accuracy: 0.1406\n", + "2023-02-01 11:22:28,201 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 211/15625 train_loss: 2.2908, accuracy: 0.0938\n", + "2023-02-01 11:22:28,285 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 212/15625 train_loss: 2.2945, accuracy: 0.0781\n", + "2023-02-01 11:22:28,393 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 213/15625 train_loss: 2.3048, accuracy: 0.0938\n", + "2023-02-01 11:22:28,486 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 214/15625 train_loss: 2.2917, accuracy: 0.0938\n", + "2023-02-01 11:22:28,583 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 215/15625 train_loss: 2.3052, accuracy: 0.0469\n", + "2023-02-01 11:22:28,685 - CIFAR10LearnerSplitNN - INFO 
- [identity=site-2, run=simulate_job]: Round 216/15625 train_loss: 2.3082, accuracy: 0.1094\n", + "2023-02-01 11:22:28,782 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 217/15625 train_loss: 2.3094, accuracy: 0.0938\n", + "2023-02-01 11:22:28,869 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 218/15625 train_loss: 2.2965, accuracy: 0.1562\n", + "2023-02-01 11:22:28,978 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 219/15625 train_loss: 2.2979, accuracy: 0.1094\n", + "2023-02-01 11:22:29,059 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 220/15625 train_loss: 2.3069, accuracy: 0.0781\n", + "2023-02-01 11:22:29,165 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 221/15625 train_loss: 2.3024, accuracy: 0.1719\n", + "2023-02-01 11:22:29,250 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 222/15625 train_loss: 2.2991, accuracy: 0.1250\n", + "2023-02-01 11:22:29,338 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 223/15625 train_loss: 2.2988, accuracy: 0.1250\n", + "2023-02-01 11:22:29,449 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 224/15625 train_loss: 2.3018, accuracy: 0.0469\n", + "2023-02-01 11:22:29,545 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 225/15625 train_loss: 2.3011, accuracy: 0.0625\n", + "2023-02-01 11:22:29,649 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 226/15625 train_loss: 2.3025, accuracy: 0.1562\n", + "2023-02-01 11:22:29,751 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 227/15625 train_loss: 2.2967, accuracy: 0.1250\n", + "2023-02-01 11:22:29,857 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 228/15625 train_loss: 2.3038, accuracy: 0.1719\n", + "2023-02-01 11:22:29,946 - CIFAR10LearnerSplitNN - 
INFO - [identity=site-2, run=simulate_job]: Round 229/15625 train_loss: 2.2979, accuracy: 0.1250\n", + "2023-02-01 11:22:30,037 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 230/15625 train_loss: 2.3036, accuracy: 0.1406\n", + "2023-02-01 11:22:30,125 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 231/15625 train_loss: 2.3031, accuracy: 0.1250\n", + "2023-02-01 11:22:30,219 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 232/15625 train_loss: 2.2992, accuracy: 0.1406\n", + "2023-02-01 11:22:30,320 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 233/15625 train_loss: 2.2985, accuracy: 0.1094\n", + "2023-02-01 11:22:30,412 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 234/15625 train_loss: 2.3027, accuracy: 0.1406\n", + "2023-02-01 11:22:30,514 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 235/15625 train_loss: 2.3086, accuracy: 0.1094\n", + "2023-02-01 11:22:30,609 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 236/15625 train_loss: 2.3057, accuracy: 0.1094\n", + "2023-02-01 11:22:30,695 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 237/15625 train_loss: 2.3119, accuracy: 0.0625\n", + "2023-02-01 11:22:30,787 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 238/15625 train_loss: 2.2908, accuracy: 0.1250\n", + "2023-02-01 11:22:30,871 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 239/15625 train_loss: 2.2955, accuracy: 0.1562\n", + "2023-02-01 11:22:30,967 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 240/15625 train_loss: 2.3018, accuracy: 0.0938\n", + "2023-02-01 11:22:31,052 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 241/15625 train_loss: 2.2947, accuracy: 0.1562\n", + "2023-02-01 11:22:31,174 - CIFAR10LearnerSplitNN 
- INFO - [identity=site-2, run=simulate_job]: Round 242/15625 train_loss: 2.2982, accuracy: 0.1250\n", + "2023-02-01 11:22:31,281 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 243/15625 train_loss: 2.3009, accuracy: 0.1094\n", + "2023-02-01 11:22:31,372 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 244/15625 train_loss: 2.3208, accuracy: 0.0312\n", + "2023-02-01 11:22:31,470 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 245/15625 train_loss: 2.2961, accuracy: 0.1094\n", + "2023-02-01 11:22:31,576 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 246/15625 train_loss: 2.2974, accuracy: 0.1094\n", + "2023-02-01 11:22:31,684 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 247/15625 train_loss: 2.3192, accuracy: 0.0625\n", + "2023-02-01 11:22:31,793 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 248/15625 train_loss: 2.2932, accuracy: 0.1094\n", + "2023-02-01 11:22:31,887 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 249/15625 train_loss: 2.3041, accuracy: 0.1250\n", + "2023-02-01 11:22:31,977 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 250/15625 train_loss: 2.3056, accuracy: 0.0781\n", + "2023-02-01 11:22:32,080 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 251/15625 train_loss: 2.3196, accuracy: 0.0312\n", + "2023-02-01 11:22:32,193 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 252/15625 train_loss: 2.3008, accuracy: 0.1094\n", + "2023-02-01 11:22:32,310 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 253/15625 train_loss: 2.3037, accuracy: 0.1094\n", + "2023-02-01 11:22:32,420 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 254/15625 train_loss: 2.2943, accuracy: 0.1562\n", + "2023-02-01 11:22:32,503 - 
CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 255/15625 train_loss: 2.2884, accuracy: 0.1875\n", + "2023-02-01 11:22:32,588 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 256/15625 train_loss: 2.3017, accuracy: 0.0781\n", + "2023-02-01 11:22:32,688 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 257/15625 train_loss: 2.3020, accuracy: 0.0781\n", + "2023-02-01 11:22:32,773 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 258/15625 train_loss: 2.2995, accuracy: 0.0938\n", + "2023-02-01 11:22:32,869 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 259/15625 train_loss: 2.3068, accuracy: 0.0625\n", + "2023-02-01 11:22:32,957 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 260/15625 train_loss: 2.2999, accuracy: 0.0781\n", + "2023-02-01 11:22:33,055 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 261/15625 train_loss: 2.3026, accuracy: 0.0781\n", + "2023-02-01 11:22:33,155 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 262/15625 train_loss: 2.2977, accuracy: 0.0781\n", + "2023-02-01 11:22:33,266 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 263/15625 train_loss: 2.2991, accuracy: 0.0938\n", + "2023-02-01 11:22:33,362 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 264/15625 train_loss: 2.3050, accuracy: 0.1094\n", + "2023-02-01 11:22:33,448 - Communicator - INFO - Received from simulator_server server (324 Bytes). 
getTask: __try_again__ time: 0.05476045608520508 seconds\n", + "2023-02-01 11:22:33,449 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-01 11:22:33,450 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 265/15625 train_loss: 2.3089, accuracy: 0.0938\n", + "2023-02-01 11:22:33,550 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 266/15625 train_loss: 2.3001, accuracy: 0.0781\n", + "2023-02-01 11:22:33,659 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 267/15625 train_loss: 2.3036, accuracy: 0.0938\n", + "2023-02-01 11:22:33,770 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 268/15625 train_loss: 2.2984, accuracy: 0.0469\n", + "2023-02-01 11:22:33,860 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 269/15625 train_loss: 2.3065, accuracy: 0.0312\n", + "2023-02-01 11:22:33,950 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 270/15625 train_loss: 2.3046, accuracy: 0.1719\n", + "2023-02-01 11:22:34,047 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 271/15625 train_loss: 2.2991, accuracy: 0.1562\n", + "2023-02-01 11:22:34,132 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 272/15625 train_loss: 2.2932, accuracy: 0.2500\n", + "2023-02-01 11:22:34,241 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 273/15625 train_loss: 2.3120, accuracy: 0.0312\n", + "2023-02-01 11:22:34,328 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 274/15625 train_loss: 2.3126, accuracy: 0.0469\n", + "2023-02-01 11:22:34,420 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 275/15625 train_loss: 2.3002, accuracy: 0.1250\n", + "2023-02-01 11:22:34,516 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 276/15625 train_loss: 2.3115, accuracy: 
0.0312\n", + "2023-02-01 11:22:34,609 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 277/15625 train_loss: 2.2926, accuracy: 0.0938\n", + "2023-02-01 11:22:34,706 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 278/15625 train_loss: 2.3064, accuracy: 0.0625\n", + "2023-02-01 11:22:34,793 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 279/15625 train_loss: 2.2957, accuracy: 0.1250\n", + "2023-02-01 11:22:34,889 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 280/15625 train_loss: 2.2942, accuracy: 0.1406\n", + "2023-02-01 11:22:34,982 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 281/15625 train_loss: 2.3010, accuracy: 0.1406\n", + "2023-02-01 11:22:35,073 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 282/15625 train_loss: 2.3024, accuracy: 0.0781\n", + "2023-02-01 11:22:35,154 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 283/15625 train_loss: 2.2994, accuracy: 0.0469\n", + "2023-02-01 11:22:35,261 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 284/15625 train_loss: 2.3101, accuracy: 0.0469\n", + "2023-02-01 11:22:35,366 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 285/15625 train_loss: 2.2952, accuracy: 0.1562\n", + "2023-02-01 11:22:35,480 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 286/15625 train_loss: 2.3002, accuracy: 0.0938\n", + "2023-02-01 11:22:35,603 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 287/15625 train_loss: 2.3039, accuracy: 0.0625\n", + "2023-02-01 11:22:35,711 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 288/15625 train_loss: 2.3008, accuracy: 0.0938\n", + "2023-02-01 11:22:35,807 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 289/15625 train_loss: 2.2964, 
accuracy: 0.1250\n", + "2023-02-01 11:22:35,914 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 290/15625 train_loss: 2.3021, accuracy: 0.0938\n", + "2023-02-01 11:22:36,008 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 291/15625 train_loss: 2.2940, accuracy: 0.1562\n", + "2023-02-01 11:22:36,115 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 292/15625 train_loss: 2.3018, accuracy: 0.0312\n", + "2023-02-01 11:22:36,218 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 293/15625 train_loss: 2.3011, accuracy: 0.0938\n", + "2023-02-01 11:22:36,328 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 294/15625 train_loss: 2.3021, accuracy: 0.1562\n", + "2023-02-01 11:22:36,423 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 295/15625 train_loss: 2.3057, accuracy: 0.0625\n", + "2023-02-01 11:22:36,510 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 296/15625 train_loss: 2.2943, accuracy: 0.0938\n", + "2023-02-01 11:22:36,607 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 297/15625 train_loss: 2.2994, accuracy: 0.0625\n", + "2023-02-01 11:22:36,696 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 298/15625 train_loss: 2.2951, accuracy: 0.1094\n", + "2023-02-01 11:22:36,799 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 299/15625 train_loss: 2.2988, accuracy: 0.0938\n", + "2023-02-01 11:22:36,911 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 300/15625 train_loss: 2.3063, accuracy: 0.1406\n", + "2023-02-01 11:22:36,996 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 301/15625 train_loss: 2.3009, accuracy: 0.0938\n", + "2023-02-01 11:22:37,080 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 302/15625 train_loss: 
2.2960, accuracy: 0.0938\n", + "2023-02-01 11:22:37,165 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 303/15625 train_loss: 2.2929, accuracy: 0.1250\n", + "2023-02-01 11:22:37,255 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 304/15625 train_loss: 2.2956, accuracy: 0.1875\n", + "2023-02-01 11:22:37,351 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 305/15625 train_loss: 2.2897, accuracy: 0.1406\n", + "2023-02-01 11:22:37,455 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 306/15625 train_loss: 2.3051, accuracy: 0.0469\n", + "2023-02-01 11:22:37,549 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 307/15625 train_loss: 2.2930, accuracy: 0.1406\n", + "2023-02-01 11:22:37,653 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 308/15625 train_loss: 2.2982, accuracy: 0.1406\n", + "2023-02-01 11:22:37,759 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 309/15625 train_loss: 2.2986, accuracy: 0.0938\n", + "2023-02-01 11:22:37,879 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 310/15625 train_loss: 2.2965, accuracy: 0.1562\n", + "2023-02-01 11:22:37,964 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 311/15625 train_loss: 2.2961, accuracy: 0.0781\n", + "2023-02-01 11:22:38,060 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 312/15625 train_loss: 2.2933, accuracy: 0.1094\n", + "2023-02-01 11:22:38,145 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 313/15625 train_loss: 2.2905, accuracy: 0.1875\n", + "2023-02-01 11:22:38,250 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 314/15625 train_loss: 2.2863, accuracy: 0.2656\n", + "2023-02-01 11:22:38,349 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 315/15625 
train_loss: 2.2966, accuracy: 0.1406\n", + "2023-02-01 11:22:38,452 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 316/15625 train_loss: 2.2949, accuracy: 0.1094\n", + "2023-02-01 11:22:38,542 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 317/15625 train_loss: 2.3025, accuracy: 0.0938\n", + "2023-02-01 11:22:38,632 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 318/15625 train_loss: 2.2962, accuracy: 0.1094\n", + "2023-02-01 11:22:38,716 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 319/15625 train_loss: 2.2922, accuracy: 0.1406\n", + "2023-02-01 11:22:38,813 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 320/15625 train_loss: 2.3026, accuracy: 0.1406\n", + "2023-02-01 11:22:38,916 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 321/15625 train_loss: 2.2917, accuracy: 0.1094\n", + "2023-02-01 11:22:39,015 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 322/15625 train_loss: 2.2984, accuracy: 0.0781\n", + "2023-02-01 11:22:39,105 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 323/15625 train_loss: 2.2996, accuracy: 0.1406\n", + "2023-02-01 11:22:39,191 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 324/15625 train_loss: 2.2930, accuracy: 0.1250\n", + "2023-02-01 11:22:39,290 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 325/15625 train_loss: 2.2954, accuracy: 0.0781\n", + "2023-02-01 11:22:39,400 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 326/15625 train_loss: 2.2976, accuracy: 0.1875\n", + "2023-02-01 11:22:39,514 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 327/15625 train_loss: 2.3002, accuracy: 0.1875\n", + "2023-02-01 11:22:39,604 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 
328/15625 train_loss: 2.2902, accuracy: 0.2656\n", + "2023-02-01 11:22:39,712 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 329/15625 train_loss: 2.2900, accuracy: 0.1094\n", + "2023-02-01 11:22:39,800 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 330/15625 train_loss: 2.2937, accuracy: 0.1250\n", + "2023-02-01 11:22:39,895 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 331/15625 train_loss: 2.2876, accuracy: 0.2188\n", + "2023-02-01 11:22:39,994 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 332/15625 train_loss: 2.2933, accuracy: 0.1094\n", + "2023-02-01 11:22:40,104 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 333/15625 train_loss: 2.2923, accuracy: 0.1094\n", + "2023-02-01 11:22:40,214 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 334/15625 train_loss: 2.2984, accuracy: 0.0625\n", + "2023-02-01 11:22:40,304 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 335/15625 train_loss: 2.2898, accuracy: 0.1406\n", + "2023-02-01 11:22:40,416 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 336/15625 train_loss: 2.2847, accuracy: 0.2188\n", + "2023-02-01 11:22:40,507 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 337/15625 train_loss: 2.2935, accuracy: 0.1250\n", + "2023-02-01 11:22:40,512 - Communicator - INFO - Received from simulator_server server (324 Bytes). 
getTask: __try_again__ time: 0.05568242073059082 seconds\n", + "2023-02-01 11:22:40,513 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-01 11:22:40,619 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 338/15625 train_loss: 2.2870, accuracy: 0.1250\n", + "2023-02-01 11:22:40,730 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 339/15625 train_loss: 2.2853, accuracy: 0.2188\n", + "2023-02-01 11:22:40,826 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 340/15625 train_loss: 2.2861, accuracy: 0.1719\n", + "2023-02-01 11:22:40,930 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 341/15625 train_loss: 2.2854, accuracy: 0.1094\n", + "2023-02-01 11:22:41,027 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 342/15625 train_loss: 2.2843, accuracy: 0.1719\n", + "2023-02-01 11:22:41,127 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 343/15625 train_loss: 2.2811, accuracy: 0.2031\n", + "2023-02-01 11:22:41,232 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 344/15625 train_loss: 2.2789, accuracy: 0.1406\n", + "2023-02-01 11:22:41,314 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 345/15625 train_loss: 2.2864, accuracy: 0.1719\n", + "2023-02-01 11:22:41,412 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 346/15625 train_loss: 2.2667, accuracy: 0.1719\n", + "2023-02-01 11:22:41,515 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 347/15625 train_loss: 2.2855, accuracy: 0.0781\n", + "2023-02-01 11:22:41,616 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 348/15625 train_loss: 2.2940, accuracy: 0.1250\n", + "2023-02-01 11:22:41,726 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 349/15625 train_loss: 2.2833, accuracy: 
0.1562\n", + "2023-02-01 11:22:41,819 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 350/15625 train_loss: 2.2660, accuracy: 0.1562\n", + "2023-02-01 11:22:41,916 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 351/15625 train_loss: 2.2689, accuracy: 0.1875\n", + "2023-02-01 11:22:42,033 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 352/15625 train_loss: 2.2516, accuracy: 0.1562\n", + "2023-02-01 11:22:42,143 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 353/15625 train_loss: 2.2737, accuracy: 0.0781\n", + "2023-02-01 11:22:42,234 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 354/15625 train_loss: 2.2570, accuracy: 0.0781\n", + "2023-02-01 11:22:42,349 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 355/15625 train_loss: 2.2873, accuracy: 0.1406\n", + "2023-02-01 11:22:42,446 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 356/15625 train_loss: 2.2452, accuracy: 0.1719\n", + "2023-02-01 11:22:42,551 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 357/15625 train_loss: 2.2700, accuracy: 0.0625\n", + "2023-02-01 11:22:42,662 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 358/15625 train_loss: 2.2630, accuracy: 0.0938\n", + "2023-02-01 11:22:42,781 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 359/15625 train_loss: 2.2451, accuracy: 0.1250\n", + "2023-02-01 11:22:42,889 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 360/15625 train_loss: 2.3013, accuracy: 0.1406\n", + "2023-02-01 11:22:42,981 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 361/15625 train_loss: 2.2512, accuracy: 0.1250\n", + "2023-02-01 11:22:43,087 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 362/15625 train_loss: 2.2277, 
accuracy: 0.1562\n", + "2023-02-01 11:22:43,180 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 363/15625 train_loss: 2.2082, accuracy: 0.1250\n", + "2023-02-01 11:22:43,268 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 364/15625 train_loss: 2.2943, accuracy: 0.1406\n", + "2023-02-01 11:22:43,363 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 365/15625 train_loss: 2.2502, accuracy: 0.2031\n", + "2023-02-01 11:22:43,450 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 366/15625 train_loss: 2.2170, accuracy: 0.1719\n", + "2023-02-01 11:22:43,535 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 367/15625 train_loss: 2.2028, accuracy: 0.1719\n", + "2023-02-01 11:22:43,629 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 368/15625 train_loss: 2.3283, accuracy: 0.0781\n", + "2023-02-01 11:22:43,730 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 369/15625 train_loss: 2.2599, accuracy: 0.1250\n", + "2023-02-01 11:22:43,831 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 370/15625 train_loss: 2.2582, accuracy: 0.1250\n", + "2023-02-01 11:22:43,924 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 371/15625 train_loss: 2.2251, accuracy: 0.1562\n", + "2023-02-01 11:22:44,016 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 372/15625 train_loss: 2.2318, accuracy: 0.2188\n", + "2023-02-01 11:22:44,101 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 373/15625 train_loss: 2.2257, accuracy: 0.2188\n", + "2023-02-01 11:22:44,200 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 374/15625 train_loss: 2.2796, accuracy: 0.2188\n", + "2023-02-01 11:22:44,289 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 375/15625 train_loss: 
2.1668, accuracy: 0.2656\n", + "2023-02-01 11:22:44,395 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 376/15625 train_loss: 2.2031, accuracy: 0.2500\n", + "2023-02-01 11:22:44,507 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 377/15625 train_loss: 2.1840, accuracy: 0.1875\n", + "2023-02-01 11:22:44,605 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 378/15625 train_loss: 2.2301, accuracy: 0.2031\n", + "2023-02-01 11:22:44,704 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 379/15625 train_loss: 2.2044, accuracy: 0.1875\n", + "2023-02-01 11:22:44,810 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 380/15625 train_loss: 2.2564, accuracy: 0.1562\n", + "2023-02-01 11:22:44,922 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 381/15625 train_loss: 2.1952, accuracy: 0.1250\n", + "2023-02-01 11:22:45,033 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 382/15625 train_loss: 2.1455, accuracy: 0.2500\n", + "2023-02-01 11:22:45,133 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 383/15625 train_loss: 2.1545, accuracy: 0.2031\n", + "2023-02-01 11:22:45,242 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 384/15625 train_loss: 2.1505, accuracy: 0.1250\n", + "2023-02-01 11:22:45,343 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 385/15625 train_loss: 2.2310, accuracy: 0.1562\n", + "2023-02-01 11:22:45,435 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 386/15625 train_loss: 2.1123, accuracy: 0.2188\n", + "2023-02-01 11:22:45,522 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 387/15625 train_loss: 2.2088, accuracy: 0.2344\n", + "2023-02-01 11:22:45,635 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 388/15625 
train_loss: 2.1007, accuracy: 0.2656\n", + "2023-02-01 11:22:45,738 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 389/15625 train_loss: 2.1124, accuracy: 0.3125\n", + "2023-02-01 11:22:45,841 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 390/15625 train_loss: 2.2503, accuracy: 0.1875\n", + "2023-02-01 11:22:45,938 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 391/15625 train_loss: 2.2184, accuracy: 0.1875\n", + "2023-02-01 11:22:46,054 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 392/15625 train_loss: 2.1410, accuracy: 0.1719\n", + "2023-02-01 11:22:46,157 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 393/15625 train_loss: 2.1365, accuracy: 0.2188\n", + "2023-02-01 11:22:46,269 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 394/15625 train_loss: 2.1312, accuracy: 0.2500\n", + "2023-02-01 11:22:46,383 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 395/15625 train_loss: 2.1945, accuracy: 0.2188\n", + "2023-02-01 11:22:46,470 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 396/15625 train_loss: 2.2152, accuracy: 0.2031\n", + "2023-02-01 11:22:46,557 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 397/15625 train_loss: 2.1915, accuracy: 0.2031\n", + "2023-02-01 11:22:46,655 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 398/15625 train_loss: 2.0181, accuracy: 0.2344\n", + "2023-02-01 11:22:46,761 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 399/15625 train_loss: 2.1406, accuracy: 0.2344\n", + "2023-02-01 11:22:46,875 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 400/15625 train_loss: 2.2516, accuracy: 0.1875\n", + "2023-02-01 11:22:46,980 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 
401/15625 train_loss: 1.9922, accuracy: 0.2656\n", + "2023-02-01 11:22:47,089 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 402/15625 train_loss: 2.1383, accuracy: 0.1719\n", + "2023-02-01 11:22:47,175 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 403/15625 train_loss: 2.2437, accuracy: 0.0938\n", + "2023-02-01 11:22:47,272 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 404/15625 train_loss: 2.0606, accuracy: 0.3125\n", + "2023-02-01 11:22:47,360 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 405/15625 train_loss: 2.1023, accuracy: 0.3125\n", + "2023-02-01 11:22:47,449 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 406/15625 train_loss: 2.1316, accuracy: 0.2031\n", + "2023-02-01 11:22:47,559 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 407/15625 train_loss: 2.1691, accuracy: 0.2656\n", + "2023-02-01 11:22:47,576 - Communicator - INFO - Received from simulator_server server (324 Bytes). 
getTask: __try_again__ time: 0.05474042892456055 seconds\n", + "2023-02-01 11:22:47,577 - ClientTaskWorker - INFO - Finished one task run for client: site-2\n", + "2023-02-01 11:22:47,658 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 408/15625 train_loss: 2.1557, accuracy: 0.2656\n", + "2023-02-01 11:22:47,768 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 409/15625 train_loss: 2.1040, accuracy: 0.2188\n", + "2023-02-01 11:22:47,883 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 410/15625 train_loss: 2.1282, accuracy: 0.2812\n", + "2023-02-01 11:22:47,991 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 411/15625 train_loss: 2.2149, accuracy: 0.1875\n", + "2023-02-01 11:22:48,088 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 412/15625 train_loss: 2.0835, accuracy: 0.1875\n", + "2023-02-01 11:22:48,176 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 413/15625 train_loss: 2.0754, accuracy: 0.2812\n", + "2023-02-01 11:22:48,262 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 414/15625 train_loss: 2.0057, accuracy: 0.2812\n", + "2023-02-01 11:22:48,361 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 415/15625 train_loss: 2.1142, accuracy: 0.1094\n", + "2023-02-01 11:22:48,449 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 416/15625 train_loss: 1.9668, accuracy: 0.2188\n", + "2023-02-01 11:22:48,549 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 417/15625 train_loss: 2.0318, accuracy: 0.2500\n", + "2023-02-01 11:22:48,639 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 418/15625 train_loss: 2.0615, accuracy: 0.2812\n", + "2023-02-01 11:22:48,734 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 419/15625 train_loss: 2.1931, accuracy: 
0.1875\n", + "2023-02-01 11:22:48,825 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 420/15625 train_loss: 2.0252, accuracy: 0.2500\n", + "2023-02-01 11:22:48,923 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 421/15625 train_loss: 2.0422, accuracy: 0.2031\n", + "2023-02-01 11:22:49,013 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 422/15625 train_loss: 2.0968, accuracy: 0.2031\n", + "2023-02-01 11:22:49,100 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 423/15625 train_loss: 2.0073, accuracy: 0.3594\n", + "2023-02-01 11:22:49,197 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 424/15625 train_loss: 2.1748, accuracy: 0.2656\n", + "2023-02-01 11:22:49,300 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 425/15625 train_loss: 1.8918, accuracy: 0.2969\n", + "2023-02-01 11:22:49,411 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 426/15625 train_loss: 1.9859, accuracy: 0.2188\n", + "2023-02-01 11:22:49,506 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 427/15625 train_loss: 2.0824, accuracy: 0.2188\n", + "2023-02-01 11:22:49,623 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 428/15625 train_loss: 1.9761, accuracy: 0.2969\n", + "2023-02-01 11:22:49,713 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 429/15625 train_loss: 2.1588, accuracy: 0.1875\n", + "2023-02-01 11:22:49,810 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 430/15625 train_loss: 2.1488, accuracy: 0.1562\n", + "2023-02-01 11:22:49,921 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 431/15625 train_loss: 2.1044, accuracy: 0.1875\n", + "2023-02-01 11:22:50,017 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 432/15625 train_loss: 2.0260, 
accuracy: 0.2812\n", + "2023-02-01 11:22:50,111 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 433/15625 train_loss: 2.0426, accuracy: 0.1562\n", + "2023-02-01 11:22:50,213 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 434/15625 train_loss: 2.1430, accuracy: 0.2812\n", + "2023-02-01 11:22:50,326 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 435/15625 train_loss: 2.1436, accuracy: 0.2031\n", + "2023-02-01 11:22:50,438 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 436/15625 train_loss: 2.1261, accuracy: 0.2031\n", + "2023-02-01 11:22:50,538 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 437/15625 train_loss: 1.9830, accuracy: 0.3750\n", + "2023-02-01 11:22:50,654 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 438/15625 train_loss: 1.9963, accuracy: 0.1562\n", + "2023-02-01 11:22:50,752 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 439/15625 train_loss: 1.9253, accuracy: 0.2500\n", + "2023-02-01 11:22:50,862 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 440/15625 train_loss: 2.0240, accuracy: 0.2188\n", + "2023-02-01 11:22:50,952 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 441/15625 train_loss: 2.0938, accuracy: 0.1875\n", + "2023-02-01 11:22:51,047 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 442/15625 train_loss: 2.0609, accuracy: 0.2500\n", + "2023-02-01 11:22:51,150 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 443/15625 train_loss: 2.0430, accuracy: 0.2344\n", + "2023-02-01 11:22:51,237 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 444/15625 train_loss: 2.0364, accuracy: 0.2500\n", + "2023-02-01 11:22:51,321 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 445/15625 train_loss: 
2.0907, accuracy: 0.2031\n", + "2023-02-01 11:22:51,414 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 446/15625 train_loss: 1.9586, accuracy: 0.2031\n", + "2023-02-01 11:22:51,516 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 447/15625 train_loss: 2.1619, accuracy: 0.1719\n", + "2023-02-01 11:22:51,605 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 448/15625 train_loss: 2.0102, accuracy: 0.2656\n", + "2023-02-01 11:22:51,699 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 449/15625 train_loss: 1.9885, accuracy: 0.2188\n", + "2023-02-01 11:22:51,787 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 450/15625 train_loss: 2.0517, accuracy: 0.2500\n", + "2023-02-01 11:22:51,879 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 451/15625 train_loss: 2.0394, accuracy: 0.2188\n", + "2023-02-01 11:22:51,969 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 452/15625 train_loss: 2.0282, accuracy: 0.2031\n", + "2023-02-01 11:22:52,061 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 453/15625 train_loss: 2.2091, accuracy: 0.1250\n", + "2023-02-01 11:22:52,166 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 454/15625 train_loss: 1.8325, accuracy: 0.2969\n", + "2023-02-01 11:22:52,281 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 455/15625 train_loss: 1.9619, accuracy: 0.2500\n", + "2023-02-01 11:22:52,383 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 456/15625 train_loss: 2.1210, accuracy: 0.2031\n", + "2023-02-01 11:22:52,599 - SimulatorRunner - ERROR - Simulator run error: \n", + "2023-02-01 11:22:52,493 - CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 457/15625 train_loss: 2.0015, accuracy: 0.2656\n", + "2023-02-01 11:22:52,574 - 
CIFAR10LearnerSplitNN - INFO - [identity=site-2, run=simulate_job]: Round 458/15625 train_loss: 2.0641, accuracy: 0.2031\n", + "Simulator finished with run_status -9\n" + ] + } + ], + "source": [ + "import os\n", + "from nvflare import SimulatorRunner \n", + "\n", + "simulator = SimulatorRunner(\n", + " job_folder=f\"job_configs/cifar10_splitnn\",\n", + " workspace=\"/tmp/nvflare/cifar10_splitnn\",\n", + " n_clients=2,\n", + " threads=2\n", + ")\n", + "run_status = simulator.run()\n", + "print(\"Simulator finished with run_status\", run_status)" + ] + }, + { + "cell_type": "markdown", + "id": "913e9ee2-e993-442d-a525-d2baf92af539", + "metadata": {}, + "source": [ + "The site containing the labels can compute accuracy and losses, which can be visualized in tensorboard." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a6814434-4e6d-4460-b480-709cb3e77cc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The tensorboard extension is already loaded. To reload it, use:\n", + " %reload_ext tensorboard\n" + ] + }, + { + "data": { + "text/plain": [ + "Reusing TensorBoard on port 6006 (pid 349977), started 0:00:00 ago. 
(Use '!kill 349977' to kill it.)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Load the TensorBoard notebook extension\n", + "%load_ext tensorboard\n", + "\n", + "%tensorboard --logdir /tmp/nvflare/cifar10_splitnn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc5d8fe1-ba06-4e20-9130-1ec9c13e2454", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/figs/split_learning.svg b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/figs/split_learning.svg new file mode 100644 index 0000000000..8284ea6fce --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/figs/split_learning.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/meta.json b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/meta.json new file mode 100644 index 0000000000..ea2eebcb7a --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/meta.json @@ -0,0 +1,9 @@ +{ + "name": "cifar10_psi", + "deploy_map": { + "server": ["server"], + "site-1": ["site-1"], + "site-2": ["site-2"] + }, + "min_clients": 2 +} diff --git 
a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/server/config/config_fed_server.json b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/server/config/config_fed_server.json new file mode 100644 index 0000000000..9ce4654047 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/server/config/config_fed_server.json @@ -0,0 +1,11 @@ +{ + "format_version": 2, + "workflows": [ + { + "id": "DhPSIController", + "path": "nvflare.app_common.workflows.dh_psi_controller.DhPSIController", + "args": { + } + } + ] +} diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/site-1/config/config_fed_client.json b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/site-1/config/config_fed_client.json new file mode 100644 index 0000000000..15b7ecb803 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/site-1/config/config_fed_client.json @@ -0,0 +1,34 @@ +{ + "format_version": 2, + "executors": [ + { + "tasks": [ + "PSI" + ], + "executor": { + "id": "Executor", + "path": "nvflare.app_opt.psi.psi_executor.PSIExecutor", + "args": { + "local_psi_id": "local_psi" + } + } + } + ], + "components": [ + { + "id": "local_psi", + "path": "psi.cifar10_local_psi.Cifar10LocalPSI", + "args": { + "psi_writer_id": "psi_writer", + "data_path": "/tmp/cifar10_vert_splits/site-1.npy" + } + }, + { + "id": "psi_writer", + "path": "nvflare.app_common.psi.psi_file_writer.FilePsiWriter", + "args": { + "output_path": "psi/intersection.txt" + } + } + ] +} diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/site-2/config/config_fed_client.json b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/site-2/config/config_fed_client.json new file mode 100644 index 
0000000000..8c27b13147 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_psi/site-2/config/config_fed_client.json @@ -0,0 +1,34 @@ +{ + "format_version": 2, + "executors": [ + { + "tasks": [ + "PSI" + ], + "executor": { + "id": "Executor", + "path": "nvflare.app_opt.psi.psi_executor.PSIExecutor", + "args": { + "local_psi_id": "local_psi" + } + } + } + ], + "components": [ + { + "id": "local_psi", + "path": "psi.cifar10_local_psi.Cifar10LocalPSI", + "args": { + "psi_writer_id": "psi_writer", + "data_path": "/tmp/cifar10_vert_splits/site-2.npy" + } + }, + { + "id": "psi_writer", + "path": "nvflare.app_common.psi.psi_file_writer.FilePsiWriter", + "args": { + "output_path": "psi/intersection.txt" + } + } + ] +} diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/meta.json b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/meta.json new file mode 100644 index 0000000000..1c52b4a94d --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/meta.json @@ -0,0 +1,9 @@ +{ + "name": "cifar10_splitnn", + "deploy_map": { + "server": ["server"], + "site-1": ["site-1"], + "site-2": ["site-2"] + }, + "min_clients": 2 +} \ No newline at end of file diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/server/config/config_fed_server.json b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/server/config/config_fed_server.json new file mode 100644 index 0000000000..05038c2cea --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/server/config/config_fed_server.json @@ -0,0 +1,47 @@ +{ + "format_version": 2, + + "num_rounds": 15625, + "batch_size": 64, + + "server": { + "heart_beat_timeout": 600 + }, + "task_data_filters": [], + 
"task_result_filters": [], + "components": [ + { + "id": "persistor", + "path": "nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor", + "args": { + "model": { + "path": "pt.networks.cifar10_nets.ModerateCNN" + } + } + }, + { + "id": "shareable_generator", + "name": "FullModelShareableGenerator", + "args": {} + }, + { + "id": "json_generator", + "name": "ValidationJsonGenerator", + "args": {} + } + ], + "workflows": [ + { + "id": "splitnn_ctl", + "path": "nvflare.app_common.workflows.splitnn_workflow.SplitNNController", + "args": { + "num_rounds" : "{num_rounds}", + "batch_size": "{batch_size}", + "start_round": 0, + "persistor_id": "persistor", + "task_timeout": 0, + "shareable_generator_id": "shareable_generator" + } + } + ] +} diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/site-1/config/config_fed_client.json b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/site-1/config/config_fed_client.json new file mode 100644 index 0000000000..bdb29708e0 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/site-1/config/config_fed_client.json @@ -0,0 +1,40 @@ +{ + "format_version": 2, + + "DATASET_ROOT": "/tmp/cifar10", + "INTERSECTION_FILE": "site-1-intersection.txt", + + "executors": [ + { + "tasks": [ + "_splitnn_task_init_model_", + "_splitnn_task_train_" + ], + "executor": { + "id": "Executor", + "path": "nvflare.app_common.executors.splitnn_learner_executor.SplitNNLearnerExecutor", + "args": { + "learner_id": "cifar10-learner" + } + } + } + ], + + "task_result_filters": [ + ], + "task_data_filters": [ + ], + + "components": [ + { + "id": "cifar10-learner", + "path": "splitnn.cifar10_learner_splitnn.CIFAR10LearnerSplitNN", + "args": { + "dataset_root": "{DATASET_ROOT}", + "intersection_file": "{INTERSECTION_FILE}", + "lr": 1e-2, + "model": {"path": "pt.networks.split_nn.SplitNN", "args": {"split_id": 0}} + 
} + } + ] +} diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/site-2/config/config_fed_client.json b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/site-2/config/config_fed_client.json new file mode 100644 index 0000000000..136732b95d --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/job_configs/cifar10_splitnn/site-2/config/config_fed_client.json @@ -0,0 +1,40 @@ +{ + "format_version": 2, + + "DATASET_ROOT": "/tmp/cifar10", + "INTERSECTION_FILE": "site-2-intersection.txt", + + "executors": [ + { + "tasks": [ + "_splitnn_task_init_model_", + "_splitnn_task_train_" + ], + "executor": { + "id": "Executor", + "path": "nvflare.app_common.executors.splitnn_learner_executor.SplitNNLearnerExecutor", + "args": { + "learner_id": "cifar10-learner" + } + } + } + ], + + "task_result_filters": [ + ], + "task_data_filters": [ + ], + + "components": [ + { + "id": "cifar10-learner", + "path": "splitnn.cifar10_learner_splitnn.CIFAR10LearnerSplitNN", + "args": { + "dataset_root": "{DATASET_ROOT}", + "intersection_file": "{INTERSECTION_FILE}", + "lr": 1e-2, + "model": {"path": "pt.networks.split_nn.SplitNN", "args": {"split_id": 1}} + } + } + ] +} diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/set_intersection_file.py b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/set_intersection_file.py new file mode 100644 index 0000000000..b6f4852df8 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/set_intersection_file.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config_file", + type=str, + default="./config_fed_client.json", + help="config file in JSON format", + ) + parser.add_argument( + "--intersection_file", + type=str, + help="Intersection file with overlapping data indices", + ) + args = parser.parse_args() + + with open(args.config_file, "r") as f: + config = json.load(f) + + config["INTERSECTION_FILE"] = args.intersection_file + + with open(args.config_file, "w") as f: + json.dump(config, f, indent=4) + + print(f"Modified {args.config_file} to use INTERSECTION_FILE={config['INTERSECTION_FILE']}") + + +if __name__ == "__main__": + main() diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/psi/cifar10_local_psi.py b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/psi/cifar10_local_psi.py new file mode 100644 index 0000000000..ea06a0f589 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/psi/cifar10_local_psi.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os.path +from typing import List + +import numpy as np + +from nvflare.app_common.psi.psi_spec import PSI + + +class Cifar10LocalPSI(PSI): + def __init__(self, psi_writer_id: str, data_path: str = "/tmp/data.csv"): + super().__init__(psi_writer_id) + self.data_path = data_path + self.data = {} + + if not os.path.isfile(self.data_path): + raise RuntimeError(f"invalid data path {data_path}") + + def load_items(self) -> List[str]: + _ext = os.path.splitext(self.data_path)[1] + + items = np.load(self.data_path) + + return [str(i) for i in items] diff --git a/examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/splitnn/cifar10_learner_splitnn.py b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/splitnn/cifar10_learner_splitnn.py new file mode 100644 index 0000000000..824519c020 --- /dev/null +++ b/examples/tutorial/vertical_federated_learning/cifar10-splitnn/src/splitnn/cifar10_learner_splitnn.py @@ -0,0 +1,614 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from timeit import default_timer as timer + +import numpy as np +import torch +import torch.optim as optim +from cifar10_splitnn_dataset import CIFAR10SplitNN +from torch.utils.tensorboard import SummaryWriter +from torchvision import transforms + +from nvflare.apis.dxo import DXO, from_shareable +from nvflare.apis.fl_constant import FLContextKey, ReturnCode +from nvflare.apis.fl_context import FLContext +from nvflare.apis.shareable import Shareable, make_reply +from nvflare.apis.signal import Signal +from nvflare.app_common.abstract.learner_spec import Learner +from nvflare.app_common.app_constant import AppConstants +from nvflare.app_common.workflows.splitnn_workflow import SplitNNConstants, SplitNNDataKind +from nvflare.app_opt.pt.decomposers import TensorDecomposer +from nvflare.fuel.f3.stats_pool import StatsPoolManager +from nvflare.fuel.utils import fobs + + +class CIFAR10LearnerSplitNN(Learner): + def __init__( + self, + dataset_root: str = "./dataset", + intersection_file: str = None, + lr: float = 1e-2, + model: dict = None, + analytic_sender_id: str = "analytic_sender", + fp16: bool = True, + val_freq: int = 1000, + ): + """Simple CIFAR-10 Trainer for split learning. + + Args: + dataset_root: directory with CIFAR-10 data. + intersection_file: Optional. intersection file specifying overlapping indices between both clients. + Defaults to `None`, i.e. the whole training dataset is used. + lr: learning rate. + model: Split learning model. + analytic_sender_id: id of `AnalyticsSender` if configured as a client component. + If configured, TensorBoard events will be fired. Defaults to "analytic_sender". + fp16: If `True`, convert activations and gradients send between clients to `torch.float16`. + Reduces bandwidth needed for communication but might impact model accuracy. + val_freq: how often to perform validation in rounds. Defaults to 1000. No validation if <= 0. 
+ """ + super().__init__() + self.dataset_root = dataset_root + self.intersection_file = intersection_file + self.lr = lr + self.model = model + self.analytic_sender_id = analytic_sender_id + self.fp16 = fp16 + self.val_freq = val_freq + + self.target_names = None + self.app_root = None + self.current_round = None + self.num_rounds = None + self.batch_size = None + self.writer = None + self.client_name = None + self.other_client = None + self.device = None + self.optimizer = None + self.criterion = None + self.transform_train = None + self.transform_valid = None + self.train_dataset = None + self.valid_dataset = None + self.split_id = None + self.train_activations = None + self.train_batch_indices = None + self.train_size = 0 + self.val_loss = [] + self.val_labels = [] + self.val_pred_labels = [] + self.compute_stats_pool = None + + # use FOBS serializing/deserializing PyTorch tensors + fobs.register(TensorDecomposer) + + def _get_model(self, fl_ctx: FLContext): + """Get model from client config. 
Modelled after `PTFileModelPersistor`.""" + if isinstance(self.model, str): + # treat it as model component ID + model_component_id = self.model + engine = fl_ctx.get_engine() + self.model = engine.get_component(model_component_id) + if not self.model: + self.log_error(fl_ctx, f"cannot find model component '{model_component_id}'") + return + if self.model and isinstance(self.model, dict): + # try building the model + try: + engine = fl_ctx.get_engine() + # use provided or default optimizer arguments and add the model parameters + if "args" not in self.model: + self.model["args"] = {} + self.model = engine.build_component(self.model) + except BaseException as e: + self.system_panic( + f"Exception while parsing `model`: " f"{self.model} with Exception {e}", + fl_ctx, + ) + return + if self.model and not isinstance(self.model, torch.nn.Module): + self.system_panic(fl_ctx, f"expect model to be torch.nn.Module but got {type(self.model)}: {self.model}") + return + if self.model is None: + self.system_panic(fl_ctx, f"Model wasn't built correctly! 
It is {self.model}") + return + self.log_info(fl_ctx, f"Running model {self.model}") + + def initialize(self, parts: dict, fl_ctx: FLContext): + t_start = timer() + self._get_model(fl_ctx=fl_ctx) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.model = self.model.to(self.device) + self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr, momentum=0.9) + self.criterion = torch.nn.CrossEntropyLoss() + + self.transform_train = transforms.Compose( + [ + transforms.ToTensor(), + transforms.ToPILImage(), + transforms.Pad(4, padding_mode="reflect"), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize( + mean=[x / 255.0 for x in [125.3, 123.0, 113.9]], + std=[x / 255.0 for x in [63.0, 62.1, 66.7]], + ), + ] + ) + self.transform_valid = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[x / 255.0 for x in [125.3, 123.0, 113.9]], + std=[x / 255.0 for x in [63.0, 62.1, 66.7]], + ), + ] + ) + + self.app_root = fl_ctx.get_prop(FLContextKey.APP_ROOT) + self.client_name = fl_ctx.get_identity_name() + self.split_id = self.model.get_split_id() + self.log_info(fl_ctx, f"Running `split_id` {self.split_id} on site `{self.client_name}`") + + if self.split_id == 0: # data side + data_returns = "image" + elif self.split_id == 1: # label side + data_returns = "label" + else: + raise ValueError(f"Expected split_id to be '0' or '1' but was {self.split_id}") + + if self.intersection_file is not None: + _intersect_indices = np.loadtxt(self.intersection_file) + else: + _intersect_indices = None + self.train_dataset = CIFAR10SplitNN( + root=self.dataset_root, + train=True, + download=True, + transform=self.transform_train, + returns=data_returns, + intersect_idx=_intersect_indices, + ) + + self.valid_dataset = CIFAR10SplitNN( + root=self.dataset_root, + train=False, + download=False, + transform=self.transform_valid, + returns=data_returns, + intersect_idx=None, 
# TODO: support validation intersect indices + ) + + self.train_size = len(self.train_dataset) + if self.train_size <= 0: + raise ValueError(f"Expected train dataset size to be larger zero but got {self.train_size}") + self.log_info(fl_ctx, f"Training with {self.train_size} overlapping indices of {self.train_dataset.orig_size}.") + + # Select local TensorBoard writer or event-based writer for streaming + if self.split_id == 1: # metrics can only be computed for client with labels + self.writer = parts.get(self.analytic_sender_id) # user configured config_fed_client.json for streaming + if not self.writer: # use local TensorBoard writer only + self.writer = SummaryWriter(self.app_root) + + # register aux message handlers + engine = fl_ctx.get_engine() + + if self.split_id == 1: + engine.register_aux_message_handler( + topic=SplitNNConstants.TASK_TRAIN_LABEL_STEP, message_handle_func=self._aux_train_label_side + ) + engine.register_aux_message_handler( + topic=SplitNNConstants.TASK_VALID_LABEL_STEP, message_handle_func=self._aux_val_label_side + ) + self.log_debug(fl_ctx, f"Registered aux message handlers for split_id {self.split_id}") + + self.compute_stats_pool = StatsPoolManager.add_time_hist_pool( + "Compute_Time", "Compute time in secs", scope=self.client_name + ) + + self.compute_stats_pool.record_value(category="initialize", value=timer() - t_start) + + """ training steps """ + + def _train_step_data_side(self, batch_indices): + t_start = timer() + self.model.train() + + inputs = self.train_dataset.get_batch(batch_indices) + inputs = inputs.to(self.device) + + self.train_activations = self.model.forward(inputs) # keep on site-1 + + self.compute_stats_pool.record_value(category="_train_step_data_side", value=timer() - t_start) + + return self.train_activations.detach().requires_grad_() # x to be sent to other client + + def _val_step_data_side(self, batch_indices): + t_start = timer() + self.model.eval() + + inputs = self.valid_dataset.get_batch(batch_indices) 
+ inputs = inputs.to(self.device) + + _val_activations = self.model.forward(inputs) # keep on site-1 + + self.compute_stats_pool.record_value(category="_val_step_data_side", value=timer() - t_start) + + return _val_activations.detach().flatten(start_dim=1, end_dim=-1) # x to be sent to other client + + def _train_step_label_side(self, batch_indices, activations, fl_ctx: FLContext): + t_start = timer() + self.model.train() + self.optimizer.zero_grad() + + labels = self.train_dataset.get_batch(batch_indices) + labels = labels.to(self.device) + + if self.fp16: + activations = activations.type(torch.float32) # return to default pytorch precision + + activations = activations.to(self.device) + activations.requires_grad_(True) + + pred = self.model.forward(activations) + loss = self.criterion(pred, labels) + loss.backward() + + _, pred_labels = torch.max(pred, 1) + acc = (pred_labels == labels).sum() / len(labels) + + if self.current_round % 100 == 0: + self.log_info( + fl_ctx, + f"Round {self.current_round}/{self.num_rounds} train_loss: {loss.item():.4f}, train_accuracy: {acc.item():.4f}", + ) + if self.writer: + self.writer.add_scalar("train_loss", loss, self.current_round) + self.writer.add_scalar("train_accuracy", acc, self.current_round) + + self.optimizer.step() + + self.compute_stats_pool.record_value(category="_train_step_label_side", value=timer() - t_start) + + if not isinstance(activations.grad, torch.Tensor): + raise ValueError("No valid gradients available!") + # gradient to be returned to other client + if self.fp16: + return activations.grad.type(torch.float16) + else: + return activations.grad + + def _val_step_label_side(self, batch_indices, activations, fl_ctx: FLContext): + t_start = timer() + self.model.eval() + + labels = self.valid_dataset.get_batch(batch_indices) + labels = labels.to(self.device) + + if self.fp16: + activations = activations.type(torch.float32) # return to default pytorch precision + + activations = activations.to(self.device) + + 
pred = self.model.forward(activations) + loss = self.criterion(pred, labels) + self.val_loss.append(loss.unsqueeze(0)) # unsqueeze needed for later concatenation + + _, pred_labels = torch.max(pred, 1) + + self.val_pred_labels.extend(pred_labels.unsqueeze(0)) + self.val_labels.extend(labels.unsqueeze(0)) + + self.compute_stats_pool.record_value(category="_val_step_label_side", value=timer() - t_start) + + def _log_validation(self, fl_ctx: FLContext): + if len(self.val_loss) > 0: + loss = torch.mean(torch.cat(self.val_loss)) + + _val_pred_labels = torch.cat(self.val_pred_labels) + _val_labels = torch.cat(self.val_labels) + acc = (_val_pred_labels == _val_labels).sum() / len(_val_labels) + + self.log_info( + fl_ctx, + f"Round {self.current_round}/{self.num_rounds} val_loss: {loss.item():.4f}, val_accuracy: {acc.item():.4f}", + ) + if self.writer: + self.writer.add_scalar("val_loss", loss, self.current_round) + self.writer.add_scalar("val_accuracy", acc, self.current_round) + + self.val_loss = [] + self.val_labels = [] + self.val_pred_labels = [] + + def _backward_step_data_side(self, gradient, fl_ctx: FLContext): + t_start = timer() + self.model.train() + self.optimizer.zero_grad() + + if self.fp16: + gradient = gradient.type(torch.float32) # return to default pytorch precision + + gradient = gradient.to(self.device) + self.train_activations.backward(gradient=gradient.reshape(self.train_activations.shape)) + self.optimizer.step() + + self.log_debug( + fl_ctx, f"{self.client_name} runs model with `split_id` {self.split_id} for backward step on data side." + ) + self.compute_stats_pool.record_value(category="_backward_step_data_side", value=timer() - t_start) + + def _train_forward_backward_data_side(self, fl_ctx: FLContext, gradient=None) -> Shareable: + t_start = timer() + # combine forward and backward on data client + # 1. 
perform backward step if gradients provided + if gradient is not None: + result_backward = self._backward_data_side(gradient, fl_ctx=fl_ctx) + assert ( + result_backward.get_return_code() == ReturnCode.OK + ), f"Backward step failed with return code {result_backward.get_return_code()}" + # 2. compute activations + activations = self._train_data_side(fl_ctx=fl_ctx) + + self.compute_stats_pool.record_value(category="_train_forward_backward_data_side", value=timer() - t_start) + + return activations.flatten(start_dim=1, end_dim=-1) # keep batch dim + + def _train_data_side(self, fl_ctx: FLContext) -> Shareable: + t_start = timer() + if self.split_id != 0: + raise ValueError( + f"Expected `split_id` 0. It doesn't make sense to run `_train_data_side` with `split_id` {self.split_id}" + ) + + self.log_debug(fl_ctx, f"Train data side in round {self.current_round} of {self.num_rounds} rounds.") + + act = self._train_step_data_side(batch_indices=self.train_batch_indices) + + self.log_debug( + fl_ctx, f"{self.client_name} finished model with `split_id` {self.split_id} for train on data side." + ) + + self.compute_stats_pool.record_value(category="_train_data_side", value=timer() - t_start) + + self.log_debug(fl_ctx, f"Sending train data activations: {type(act)}") + + if self.fp16: + return act.type(torch.float16) + else: + return act + + def _aux_train_label_side(self, topic: str, request: Shareable, fl_ctx: FLContext) -> Shareable: + """train aux message handler""" + t_start = timer() + if self.split_id != 1: + raise ValueError( + f"Expected `split_id` 1. 
It doesn't make sense to run `_aux_train_label_side` with `split_id` {self.split_id}" + ) + + self.current_round = request.get_header(AppConstants.CURRENT_ROUND) + self.num_rounds = request.get_header(AppConstants.NUM_ROUNDS) + self.log_debug(fl_ctx, f"Train label in round {self.current_round} of {self.num_rounds} rounds.") + + dxo = from_shareable(request) + if dxo.data_kind != SplitNNDataKind.ACTIVATIONS: + raise ValueError(f"Expected data kind {SplitNNDataKind.ACTIVATIONS} but received {dxo.data_kind}") + + batch_indices = dxo.get_meta_prop(SplitNNConstants.BATCH_INDICES) + if batch_indices is None: + raise ValueError("No batch indices in DXO!") + + activations = dxo.data.get(SplitNNConstants.DATA) + if activations is None: + raise ValueError("No activations in DXO!") + + gradient = self._train_step_label_side( + batch_indices=batch_indices, activations=fobs.loads(activations), fl_ctx=fl_ctx + ) + + self.log_debug(fl_ctx, "_aux_train_label_side finished.") + return_shareable = DXO( + data={SplitNNConstants.DATA: fobs.dumps(gradient)}, data_kind=SplitNNDataKind.GRADIENT + ).to_shareable() + + self.compute_stats_pool.record_value(category="_aux_train_label_side", value=timer() - t_start) + + self.log_debug(fl_ctx, f"Sending train label return_shareable: {type(return_shareable)}") + return return_shareable + + def _aux_val_label_side(self, topic: str, request: Shareable, fl_ctx: FLContext) -> Shareable: + """validation aux message handler""" + t_start = timer() + if self.split_id != 1: + raise ValueError( + f"Expected `split_id` 1. 
It doesn't make sense to run `_aux_train_label_side` with `split_id` {self.split_id}" + ) + + val_round = request.get_header(AppConstants.CURRENT_ROUND) + val_num_rounds = request.get_header(AppConstants.NUM_ROUNDS) + self.log_debug(fl_ctx, f"Validate label in round {self.current_round} of {self.num_rounds} rounds.") + + dxo = from_shareable(request) + if dxo.data_kind != SplitNNDataKind.ACTIVATIONS: + raise ValueError(f"Expected data kind {SplitNNDataKind.ACTIVATIONS} but received {dxo.data_kind}") + + batch_indices = dxo.get_meta_prop(SplitNNConstants.BATCH_INDICES) + if batch_indices is None: + raise ValueError("No batch indices in DXO!") + + activations = dxo.data.get(SplitNNConstants.DATA) + if activations is None: + raise ValueError("No activations in DXO!") + + self._val_step_label_side(batch_indices=batch_indices, activations=fobs.loads(activations), fl_ctx=fl_ctx) + + if val_round == val_num_rounds - 1: + self._log_validation(fl_ctx) + + self.compute_stats_pool.record_value(category="_aux_val_label_side", value=timer() - t_start) + + return make_reply(ReturnCode.OK) + + def _backward_data_side(self, gradient, fl_ctx: FLContext) -> Shareable: + t_start = timer() + if self.split_id != 0: + raise ValueError( + f"Expected `split_id` 0. 
def train(self, shareable: Shareable, fl_ctx: FLContext, abort_signal: Signal) -> Shareable:
    """Run the split-learning training loop.

    Only the data side (``split_id`` 0) drives the loop. In every round it
    draws a random batch of training indices, runs the forward pass (and the
    backward pass for the gradient received in the previous round), sends
    the activations to the other client through an aux request and keeps
    the returned gradient for the next round. On any other site the loop
    only advances ``self.current_round``.

    Args:
        shareable: task data; its headers provide the number of rounds, the
            batch size and the names of the participating clients.
        fl_ctx: FL context of the current task.
        abort_signal: checked at the start of each data-side round.

    Returns:
        A reply Shareable with ``ReturnCode.OK``, or
        ``ReturnCode.TASK_ABORTED`` if the abort signal was triggered.

    Raises:
        ValueError: if the number of rounds is missing, if the other client
            returns nothing, or if it returns a DXO that is not of kind
            ``SplitNNDataKind.GRADIENT``.
    """
    t_start = timer()
    engine = fl_ctx.get_engine()

    self.num_rounds = shareable.get_header(AppConstants.NUM_ROUNDS)
    if not self.num_rounds:
        raise ValueError("No number of rounds available.")
    self.batch_size = shareable.get_header(SplitNNConstants.BATCH_SIZE)
    self.target_names = np.asarray(
        shareable.get_header(SplitNNConstants.TARGET_NAMES)
    )  # convert to array for string matching below
    self.other_client = self.target_names[self.target_names != self.client_name][0]
    self.log_info(fl_ctx, f"Starting training of {self.num_rounds} rounds with batch size {self.batch_size}")

    gradients = None  # initial gradients
    for _curr_round in range(self.num_rounds):
        self.current_round = _curr_round
        if self.split_id != 0:
            continue  # only run this logic on first site
        if abort_signal.triggered:
            return make_reply(ReturnCode.TASK_ABORTED)

        self.log_debug(fl_ctx, f"Starting current round={self.current_round} of {self.num_rounds}.")
        # np.random.randint excludes the upper bound, so `train_size` (not
        # `train_size - 1`) is needed for the last sample to be drawable.
        # NOTE(review): assumes self.train_size is the number of training
        # samples -- confirm where it is assigned.
        self.train_batch_indices = np.random.randint(0, self.train_size, self.batch_size)

        # Site-1 image forward & backward (from 2nd round)
        fl_ctx.set_prop(AppConstants.CURRENT_ROUND, self.current_round, private=True, sticky=False)
        activations = self._train_forward_backward_data_side(fl_ctx, gradients)

        # Site-2 label loss & backward
        dxo = DXO(data={SplitNNConstants.DATA: fobs.dumps(activations)}, data_kind=SplitNNDataKind.ACTIVATIONS)
        dxo.set_meta_prop(SplitNNConstants.BATCH_INDICES, self.train_batch_indices)

        data_shareable = dxo.to_shareable()
        data_shareable.set_header(AppConstants.CURRENT_ROUND, self.current_round)
        data_shareable.set_header(AppConstants.NUM_ROUNDS, self.num_rounds)
        data_shareable.add_cookie(AppConstants.CONTRIBUTION_ROUND, self.current_round)

        # send to other side
        result = engine.send_aux_request(
            targets=self.other_client,
            topic=SplitNNConstants.TASK_TRAIN_LABEL_STEP,
            request=data_shareable,
            timeout=SplitNNConstants.TIMEOUT,
            fl_ctx=fl_ctx,
        )
        reply = result.get(self.other_client)
        if reply is None:
            raise ValueError(f"No message returned from {self.other_client}!")

        dxo = from_shareable(reply)
        if dxo.data_kind != SplitNNDataKind.GRADIENT:
            raise ValueError(f"Expected data kind {SplitNNDataKind.GRADIENT} but received {dxo.data_kind}")
        # Still serialized here; passed to the data-side step in the next round.
        gradients = dxo.data.get(SplitNNConstants.DATA)

        self.log_debug(fl_ctx, f"Ending current round={self.current_round}.")

        # Validate every `val_freq` rounds; a non-positive value disables it.
        if self.val_freq > 0 and _curr_round % self.val_freq == 0:
            self._validate(fl_ctx)

    self.compute_stats_pool.record_value(category="train", value=timer() - t_start)

    return make_reply(ReturnCode.OK)
class CIFAR10SplitNN(object):  # TODO: use torch.utils.data.Dataset with batch sampling
    """CIFAR-10 wrapper that serves mini-batches for explicit sample indices.

    Intended for SplitNN training, where a site may hold only the images or
    only the labels and batches are identified by a list of indices.
    """

    def __init__(self, root, train=True, transform=None, download=False, returns="all", intersect_idx=None):
        """Load CIFAR-10 and optionally restrict it to the given indices.

        Args:
            root: data root
            train: whether to use the training or validation split (default: True)
            transform: image transforms
            download: whether to download the data (default: False)
            returns: which part of a batch `get_batch` hands out:
                "all", "image" or "label"
            intersect_idx: indices of the samples to keep. They are sorted
                and cast to int64 before use. If None, the whole set is used.
        """
        self.root = root
        self.train = train
        self.transform = transform
        self.download = download
        self.returns = returns
        self.orig_size = 0

        # Sorting gives every caller the same sample order for the same index set.
        self.intersect_idx = None if intersect_idx is None else np.sort(intersect_idx).astype(np.int64)

        self.data, self.target = self.__build_cifar_subset__()

    def __build_cifar_subset__(self):
        """Return (images, labels), restricted to `intersect_idx` when it is set.

        Also records the size of the unrestricted dataset in `orig_size`.
        """
        cifar = datasets.CIFAR10(self.root, self.train, self.transform, self.download)
        images = cifar.data
        labels = np.array(cifar.targets)
        self.orig_size = len(images)
        if self.intersect_idx is None:
            return images, labels
        return images[self.intersect_idx], labels[self.intersect_idx]

    def __getitem__(self, index):
        """Return the (optionally transformed) image and the label at `index`."""
        img = self.data[index]
        if self.transform is not None:
            img = self.transform(img)
        return img, self.target[index]

    # TODO: this can probably made more efficient using batch_sampler
    def get_batch(self, batch_indices):
        """Stack the samples at `batch_indices` into batch tensors.

        Returns:
            Depending on `returns`: (images, labels), images only, or labels only.

        Raises:
            ValueError: if `returns` is not one of 'all', 'image', 'label'.
        """
        samples = [self.__getitem__(idx) for idx in batch_indices]
        img_batch = torch.stack([img for img, _ in samples], dim=0)
        target_batch = torch.stack([torch.tensor(label, dtype=torch.long) for _, label in samples], dim=0)

        if self.returns == "image":
            return img_batch
        if self.returns == "label":
            return target_batch
        if self.returns == "all":
            return img_batch, target_batch
        raise ValueError(f"Expected `returns` to be 'all', 'image', or 'label', but got '{self.returns}'")

    def __len__(self):
        """Number of samples available after the optional index restriction."""
        return len(self.data)
class Cifar10VerticalDataSplitter(FLComponent):
    """Create index files that simulate a vertical split of CIFAR-10 over two sites.

    Both sites share `overlap` randomly chosen sample indices. The remaining
    indices are given to "site-1" as they are and to "site-2" shifted by the
    dataset size, so the non-overlapping parts never coincide.
    """

    def __init__(self, split_dir: str = None, overlap: int = 10_000, seed: int = 0):
        """
        Args:
            split_dir: directory the index and summary files are written to.
            overlap: number of samples present on both sites; must be > 0.
            seed: seed passed to `np.random.seed` before splitting.

        Raises:
            ValueError: if `split_dir` is None or `overlap` is not positive.
        """
        super().__init__()
        self.split_dir = split_dir
        self.overlap = overlap
        self.seed = seed

        if self.split_dir is None:
            raise ValueError("You need to define a valid `split_dir` when splitting the data.")
        if overlap <= 0:
            # Fixed: the message used to mention a nonexistent "Alpha" parameter.
            raise ValueError(f"`overlap` should be larger than 0 but was {overlap}!")

    def handle_event(self, event_type: str, fl_ctx: FLContext):
        """Run the split when the START_RUN event is received."""
        if event_type == EventType.START_RUN:
            self.split(fl_ctx)

    def split(self, fl_ctx: FLContext):
        """Compute the split and write it to `split_dir`.

        Writes "summary.txt" (class counts of the overlap) and one ".npy"
        index file each for "overlap", "site-1" and "site-2".
        """
        np.random.seed(self.seed)

        self.log_info(fl_ctx, f"Partition CIFAR-10 dataset into vertically with {self.overlap} overlapping samples.")
        site_idx, class_sum = self._split_data()

        # write to files
        # exist_ok avoids the check-then-create race when the directory
        # appears between the test and the creation.
        os.makedirs(self.split_dir, exist_ok=True)
        sum_file_name = os.path.join(self.split_dir, "summary.txt")
        with open(sum_file_name, "w") as sum_file:
            sum_file.write("Class counts for overlap: \n")
            sum_file.write(json.dumps(class_sum))

        for _site, _idx in site_idx.items():
            site_file_name = os.path.join(self.split_dir, f"{_site}.npy")
            self.log_info(fl_ctx, f"save {site_file_name}")
            np.save(site_file_name, _idx)

    def _split_data(self):
        """Draw the overlap and build the per-site index arrays.

        Returns:
            A tuple `(site_idx, class_sum)`: `site_idx` maps "overlap",
            "site-1" and "site-2" to index arrays; `class_sum` is the result
            of `get_site_class_summary` for the overlap indices.

        Raises:
            ValueError: if `overlap` exceeds the number of training samples.
        """
        train_label = load_cifar10_data()

        n_samples = len(train_label)

        if self.overlap > n_samples:
            raise ValueError(
                f"Chosen overlap of {self.overlap} is larger than " f"train dataset with {n_samples} entries."
            )

        sample_idx = np.arange(0, n_samples)

        overlap_idx = np.random.choice(sample_idx, size=np.int64(self.overlap), replace=False)

        # setdiff1d returns a sorted integer array. Unlike list(set - set) it
        # does not depend on set iteration order and keeps an integer dtype
        # even when nothing remains (overlap == n_samples).
        remain_idx = np.setdiff1d(sample_idx, overlap_idx)

        idx_1 = np.concatenate((overlap_idx, remain_idx))
        # adding n_samples to remain_idx of site-2 to make sure no overlap
        # with idx_1
        idx_2 = np.concatenate((overlap_idx, remain_idx + n_samples))

        # shuffle indexes again for client sites to simulate real world
        # scenario
        np.random.shuffle(idx_1)
        np.random.shuffle(idx_2)

        site_idx = {"overlap": overlap_idx, "site-1": idx_1, "site-2": idx_2}

        # collect class summary
        class_sum = get_site_class_summary(train_label, {"overlap": overlap_idx})

        return site_idx, class_sum
class SplitNN(ModerateCNN):
    """One half of a `ModerateCNN` for split learning.

    `split_id` 0 forwards through `self.conv_layer`, `split_id` 1 through
    `self.fc_layer`.
    NOTE(review): both attributes are presumably created by
    `ModerateCNN.__init__`, which is not visible here -- confirm.
    """

    def __init__(self, split_id):
        """
        Args:
            split_id: 0 or 1; selects which part of the network `forward` runs.

        Raises:
            ValueError: if `split_id` is neither 0 nor 1.
        """
        super().__init__()
        if split_id not in [0, 1]:
            # Format the argument, not `self.split_id`: the attribute is not
            # assigned yet, so reading it here would fail before the
            # intended ValueError could be raised.
            raise ValueError(f"Only supports split_id '0' or '1' but was {split_id}")
        self.split_id = split_id

        # split_id is guaranteed to be 0 or 1 at this point.
        if self.split_id == 0:
            self.split_forward = self.conv_layer
        else:
            self.split_forward = self.fc_layer

    def forward(self, x):
        """Apply the part of the network selected by `split_id` to `x`."""
        x = self.split_forward(x)
        return x

    def get_split_id(self):
        """Return the split id this instance was created with."""
        return self.split_id
class SplitNNLearnerExecutor(Executor):
    def __init__(
        self,
        learner_id,
        init_model_task_name=SplitNNConstants.TASK_INIT_MODEL,
        train_task_name=SplitNNConstants.TASK_TRAIN,
    ):
        """Executor that forwards split-learning tasks to a learner component.

        Args:
            learner_id (str): id pointing to the learner object
            init_model_task_name (str, optional): name of the task that is
                dispatched to the learner's `init_model`.
                Defaults to SplitNNConstants.TASK_INIT_MODEL.
            train_task_name (str, optional): name of the task that is
                dispatched to the learner's `train`.
                Defaults to SplitNNConstants.TASK_TRAIN.
        """
        super().__init__()
        self.learner_id = learner_id
        self.learner = None  # resolved from the engine in `initialize`
        self.init_model_task_name = init_model_task_name
        self.train_task_name = train_task_name

    def handle_event(self, event_type: str, fl_ctx: FLContext):
        """Initialize on START_RUN, abort the learner on ABORT_TASK, finalize on END_RUN."""
        if event_type == EventType.START_RUN:
            self.initialize(fl_ctx)
            return
        if event_type == EventType.END_RUN:
            self.finalize(fl_ctx)
            return
        if event_type != EventType.ABORT_TASK:
            return

        try:
            if self.learner:
                self.learner.abort(fl_ctx)
        except Exception as e:
            self.log_exception(fl_ctx, f"learner abort exception: {e}")

    def initialize(self, fl_ctx: FLContext):
        """Look up the learner component and initialize it.

        Any exception, including the TypeError raised for a component that
        is not a `Learner`, is logged and not propagated.
        """
        try:
            engine = fl_ctx.get_engine()
            self.learner = engine.get_component(self.learner_id)
            if not isinstance(self.learner, Learner):
                raise TypeError(f"learner must be Learner type. Got: {type(self.learner)}")
            self.learner.initialize(engine.get_all_components(), fl_ctx)
        except Exception as e:
            self.log_exception(fl_ctx, f"learner initialize exception: {e}")

    def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext, abort_signal: Signal) -> Shareable:
        """Dispatch `task_name` to the matching learner method.

        Returns:
            The learner's reply, a TASK_UNKNOWN reply for an unsupported task
            name, or an EXECUTION_EXCEPTION reply if the learner raised.
        """
        self.log_info(fl_ctx, f"Client trainer got task: {task_name}")

        self.log_info(fl_ctx, f"Executing task {task_name}...")
        try:
            if task_name == self.init_model_task_name:
                status_msg, method_name = "Initializing model...", "init_model"
            elif task_name == self.train_task_name:
                status_msg, method_name = "Running training...", "train"
            else:
                self.log_error(fl_ctx, f"Could not handle task: {task_name}")
                return make_reply(ReturnCode.TASK_UNKNOWN)

            self.log_info(fl_ctx, status_msg)
            handler = getattr(self.learner, method_name)
            return handler(shareable=shareable, fl_ctx=fl_ctx, abort_signal=abort_signal)
        except Exception as e:
            # Task execution error, return EXECUTION_EXCEPTION Shareable
            self.log_exception(fl_ctx, f"learner execute exception: {e}")
            return make_reply(ReturnCode.EXECUTION_EXCEPTION)

    def finalize(self, fl_ctx: FLContext):
        """Finalize the learner, if one was resolved; exceptions are logged only."""
        try:
            if self.learner:
                self.learner.finalize(fl_ctx)
        except Exception as e:
            self.log_exception(fl_ctx, f"learner finalize exception: {e}")
class SplitNNController(Controller):
    def __init__(
        self,
        num_rounds: int = 5000,
        start_round: int = 0,
        persistor_id=AppConstants.DEFAULT_PERSISTOR_ID,  # used to init the models on both clients
        shareable_generator_id=AppConstants.DEFAULT_SHAREABLE_GENERATOR_ID,
        init_model_task_name=SplitNNConstants.TASK_INIT_MODEL,
        train_task_name=SplitNNConstants.TASK_TRAIN,
        task_timeout: int = 10,
        ignore_result_error: bool = True,
        batch_size: int = 256,
    ):
        """The controller for Split Learning Workflow.

        The workflow has two steps. First the model persistor (persistor_id)
        loads the initial model, the shareable generator converts it to a
        Shareable, and it is broadcast to the clients with the init-model
        task. Then the train task is broadcast; its headers carry the number
        of rounds, the batch size and the expected client names.

        Args:
            num_rounds (int, optional): The total number of training rounds. Defaults to 5000.
            start_round (int, optional): Start round for training. Defaults to 0.
            persistor_id (str, optional): ID of the persistor component. Defaults to
                AppConstants.DEFAULT_PERSISTOR_ID.
            shareable_generator_id (str, optional): ID of the shareable generator. Defaults to
                AppConstants.DEFAULT_SHAREABLE_GENERATOR_ID.
            init_model_task_name: Task name used to initialize the local models.
            train_task_name: Task name used for split learning.
            task_timeout (int, optional): timeout (in sec). Defaults to 10.
                NOTE(review): the value is stored but not passed to any task
                in this class -- confirm whether that is intended.
            ignore_result_error (bool, optional): whether this controller can proceed if result has errors.
                Defaults to True.
            batch_size (int, optional): batch size sent to the clients in the
                train task header. Defaults to 256.

        Raises:
            TypeError: when any of input arguments does not have correct type
            ValueError: when any of input arguments is out of range
        """
        Controller.__init__(self)

        # Check arguments
        if not isinstance(num_rounds, int):
            raise TypeError("`num_rounds` must be int but got {}".format(type(num_rounds)))
        if not isinstance(start_round, int):
            raise TypeError("`start_round` must be int but got {}".format(type(start_round)))
        if not isinstance(task_timeout, int):
            # Fixed: the message used to name a nonexistent `train_timeout`.
            raise TypeError("`task_timeout` must be int but got {}".format(type(task_timeout)))
        if not isinstance(persistor_id, str):
            raise TypeError("`persistor_id` must be a string but got {}".format(type(persistor_id)))
        if not isinstance(shareable_generator_id, str):
            raise TypeError("`shareable_generator_id` must be a string but got {}".format(type(shareable_generator_id)))
        if not isinstance(init_model_task_name, str):
            raise TypeError("`init_model_task_name` must be a string but got {}".format(type(init_model_task_name)))
        if not isinstance(train_task_name, str):
            raise TypeError("`train_task_name` must be a string but got {}".format(type(train_task_name)))
        if num_rounds < 0:
            raise ValueError("num_rounds must be greater than or equal to 0.")
        if start_round < 0:
            raise ValueError("start_round must be greater than or equal to 0.")

        self.persistor_id = persistor_id
        self.shareable_generator_id = shareable_generator_id
        self.persistor = None
        self.shareable_generator = None

        # config data
        self._num_rounds = num_rounds
        self._start_round = start_round
        self._task_timeout = task_timeout
        self.ignore_result_error = ignore_result_error

        # workflow phases: init, train, validate
        self._phase = AppConstants.PHASE_INIT
        self._global_weights = None
        # Reported by handle_event; never updated within this class.
        self._current_round = None

        # task names
        self.init_model_task_name = init_model_task_name
        self.train_task_name = train_task_name

        # The workflow is fixed to exactly these two clients.
        self.targets_names = ["site-1", "site-2"]
        self.nr_supported_clients = 2
        self.batch_size = batch_size

    def start_controller(self, fl_ctx: FLContext):
        """Resolve the persistor and shareable generator and load the initial model.

        Triggers a system panic if either component has the wrong type, and
        fires `AppEventType.INITIAL_MODEL_LOADED` once the model is loaded.
        """
        self.log_debug(fl_ctx, "starting controller")
        self.persistor = fl_ctx.get_engine().get_component(self.persistor_id)
        self.shareable_generator = fl_ctx.get_engine().get_component(self.shareable_generator_id)
        if not isinstance(self.persistor, LearnablePersistor):
            self.system_panic(
                f"Persistor {self.persistor_id} must be a Persistor instance, but got {type(self.persistor)}", fl_ctx
            )
        if not isinstance(self.shareable_generator, ShareableGenerator):
            self.system_panic(
                f"Shareable generator {self.shareable_generator_id} must be a Shareable Generator instance, "
                f"but got {type(self.shareable_generator)}",
                fl_ctx,
            )

        # initialize global model
        fl_ctx.set_prop(AppConstants.START_ROUND, self._start_round, private=True, sticky=True)
        fl_ctx.set_prop(AppConstants.NUM_ROUNDS, self._num_rounds, private=True, sticky=False)
        self._global_weights = self.persistor.load(fl_ctx)
        fl_ctx.set_prop(AppConstants.GLOBAL_MODEL, self._global_weights, private=True, sticky=True)
        self.fire_event(AppEventType.INITIAL_MODEL_LOADED, fl_ctx)

    def _process_result(self, client_task: ClientTask, fl_ctx: FLContext) -> bool:
        """Callback for a received client result.

        Returns:
            False if the result carries an error return code that is either
            ignored (`ignore_result_error`) or leads to a system panic;
            True otherwise, after attaching a non-empty result to the task.
        """
        # submitted shareable is stored in client_task.result
        # we need to update task.data with that shareable so the next target
        # will get the updated shareable
        task = client_task.task
        result = client_task.result
        rc = result.get_return_code()

        if rc and rc != ReturnCode.OK:
            if self.ignore_result_error:
                self.log_error(fl_ctx, f"Ignore the task {task} result. Train result error code: {rc}")
                return False
            else:
                if rc in [ReturnCode.MISSING_PEER_CONTEXT, ReturnCode.BAD_PEER_CONTEXT]:
                    self.system_panic(
                        f"Peer context for task {task} is bad or missing. SplitNNController exiting.", fl_ctx=fl_ctx
                    )
                    return False
                elif rc in [ReturnCode.EXECUTION_EXCEPTION, ReturnCode.TASK_UNKNOWN]:
                    self.system_panic(
                        f"Execution Exception in client task {task}. SplitNNController exiting.", fl_ctx=fl_ctx
                    )
                    return False
                elif rc in [
                    ReturnCode.EXECUTION_RESULT_ERROR,
                    ReturnCode.TASK_DATA_FILTER_ERROR,
                    ReturnCode.TASK_RESULT_FILTER_ERROR,
                ]:
                    self.system_panic(
                        f"Execution result for task {task} is not a shareable. SplitNNController exiting.",
                        fl_ctx=fl_ctx,
                    )
                    return False
                # Any other error code falls through and is treated like a
                # successful result below.

        # assign result to current task
        if result:
            task.set_prop(SplitNNConstants.TASK_RESULT, result)

        return True

    def _check_targets(self, fl_ctx: FLContext):
        """Trigger a system panic for every connected client with an unexpected name."""
        engine = fl_ctx.get_engine()
        targets = engine.get_clients()
        for t in targets:
            if t.name not in self.targets_names:
                self.system_panic(f"Client {t.name} not in expected target names: {self.targets_names}", fl_ctx)

    def _init_models(self, abort_signal: Signal, fl_ctx: FLContext):
        """Broadcast the initial model to the clients and wait for their replies."""
        self._check_targets(fl_ctx)
        self.log_debug(fl_ctx, f"SplitNN initializing model {self.targets_names}.")

        # Create init_model_task_name
        data_shareable: Shareable = self.shareable_generator.learnable_to_shareable(self._global_weights, fl_ctx)
        task = Task(
            name=self.init_model_task_name,
            data=data_shareable,
            result_received_cb=self._process_result,
        )

        self.broadcast_and_wait(
            task=task,
            min_responses=self.nr_supported_clients,
            wait_time_after_min_received=0,
            fl_ctx=fl_ctx,
            abort_signal=abort_signal,
        )

    def _train(self, abort_signal: Signal, fl_ctx: FLContext):
        """Broadcast the train task and wait until the clients have replied.

        The task data carries no model; the training settings travel in the
        Shareable headers.
        """
        self._check_targets(fl_ctx)
        self.log_debug(fl_ctx, f"SplitNN training starting with {self.targets_names}.")

        # Create train_task
        data_shareable: Shareable = Shareable()
        data_shareable.set_header(AppConstants.NUM_ROUNDS, self._num_rounds)
        data_shareable.set_header(SplitNNConstants.BATCH_SIZE, self.batch_size)
        data_shareable.set_header(SplitNNConstants.TARGET_NAMES, self.targets_names)

        task = Task(
            name=self.train_task_name,
            data=data_shareable,
            result_received_cb=self._process_result,
        )

        self.broadcast_and_wait(
            task=task,
            min_responses=self.nr_supported_clients,
            wait_time_after_min_received=0,
            fl_ctx=fl_ctx,
            abort_signal=abort_signal,
        )

    def control_flow(self, abort_signal: Signal, fl_ctx: FLContext):
        """Run the workflow: initialize the client models, then start split learning.

        Any exception is logged with its traceback and escalated through
        `system_panic`.
        """
        try:
            self._check_targets(fl_ctx)
            self.log_debug(fl_ctx, f"Train with on {self.targets_names}")

            # 1. initialize models on clients
            self._init_models(abort_signal=abort_signal, fl_ctx=fl_ctx)

            # 2. Start split learning
            self._phase = AppConstants.PHASE_TRAIN
            self._train(abort_signal=abort_signal, fl_ctx=fl_ctx)

            self._phase = AppConstants.PHASE_FINISHED
            self.log_debug(fl_ctx, "SplitNN training ended.")
        except Exception as e:
            # `Exception` instead of `BaseException`, so KeyboardInterrupt and
            # SystemExit are not swallowed; log_exception keeps the traceback.
            error_msg = f"SplitNN control_flow exception {e}"
            self.log_exception(fl_ctx, error_msg)
            self.system_panic(str(e), fl_ctx)

    def stop_controller(self, fl_ctx: FLContext):
        """Mark the workflow as finished."""
        self._phase = AppConstants.PHASE_FINISHED
        self.log_debug(fl_ctx, "controller stopped")

    def process_result_of_unknown_task(
        self,
        client: Client,
        task_name: str,
        client_task_id: str,
        result: Shareable,
        fl_ctx: FLContext,
    ):
        """Log and drop a result that belongs to no known task."""
        self.log_warning(fl_ctx, f"Dropped result of unknown task: {task_name} from client {client.name}.")

    def handle_event(self, event_type: str, fl_ctx: FLContext):
        """Report phase and round information when statistics are collected.

        Raises:
            TypeError: if the stats collector found in `fl_ctx` is not a
                `GroupInfoCollector`.
        """
        super().handle_event(event_type, fl_ctx)
        if event_type == InfoCollector.EVENT_TYPE_GET_STATS:
            collector = fl_ctx.get_prop(InfoCollector.CTX_KEY_STATS_COLLECTOR, None)
            if collector:
                if not isinstance(collector, GroupInfoCollector):
                    raise TypeError("collector must be GroupInfoCollector but got {}".format(type(collector)))

                collector.add_info(
                    group_name=self._name,
                    info={"phase": self._phase, "current_round": self._current_round, "num_rounds": self._num_rounds},
                )
class TensorDecomposer(fobs.Decomposer):
    """FOBS decomposer that transports `torch.Tensor` objects as NumPy `.npy` bytes."""

    def supported_type(self):
        """Return the type handled by this decomposer."""
        return torch.Tensor

    def decompose(self, target: torch.Tensor) -> Any:
        """Serialize `target` into the bytes of an in-memory `.npy` file."""
        # torch.save uses Pickle so converting Tensor to ndarray first
        ndarray = target.detach().cpu().numpy()
        with BytesIO() as buffer:
            np.save(buffer, ndarray, allow_pickle=False)
            return buffer.getvalue()

    def recompose(self, data: Any) -> torch.Tensor:
        """Rebuild a tensor from bytes produced by `decompose`."""
        with BytesIO(data) as buffer:
            ndarray = np.load(buffer, allow_pickle=False)
        return torch.from_numpy(ndarray)