From d5f4146a48960e4c019d5d226f29937969db87c9 Mon Sep 17 00:00:00 2001 From: Chen Jing Date: Mon, 29 Aug 2022 21:48:10 +0800 Subject: [PATCH 1/3] Update the tutorial doc Signed-off-by: Chen Jing --- ...ster_in_One_Linux_Machine_with_MiniKube.md | 412 +++++++++++------- ...r_in_One_Linux_Machine_with_MiniKube_zh.md | 385 ++++++++++------ 2 files changed, 514 insertions(+), 283 deletions(-) diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md index 9588d951a..920fddd16 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md @@ -6,7 +6,7 @@ learning with these two parties, and check FATE-Dashboard for the status of the After the tutorial, the deployment architecture looks like the following diagram.
- +
# Prerequisites @@ -26,8 +26,8 @@ export fate_version=v1.9.0 && export kubefate_version=v1.4.5 && cd ~ && mkdir de Notes: * When talking about KubeFATE version, usually there are 3 notions: - * The KubeFATE CLI version, in this tutorial, it is v1.4.5, consider KubeCtl as an example. - * The KubeFATE service version, in this tutorial, it is v1.4.5, consider Kubernetes as an example. + * The KubeFATE CLI version, in this tutorial, it is v1.4.5. + * The KubeFATE service version, in this tutorial, it is v1.4.5. * The FATE version, in this tutorial, it is v1.9.0, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. * **In this tutorial, the IP of the machine we used is 192.168.100.123. Please change it to your machine's IP in all the following commands and config files.** @@ -54,8 +54,8 @@ curl -LO https://github.com/kubernetes/minikube/releases/download/v1.19.0/miniku Try to verify if MiniKube installed, ``` kubefate@machine:~/demo$ minikube version -minikube version: v1.21.0 -commit: 76d74191d82c47883dc7e1319ef7cebd3e00ee11 +minikube version: v1.19.0 +commit: 15cede53bdc5fe242228853e737333b09d4336b5 ``` ### Install Kubernetes with MiniKube @@ -63,7 +63,7 @@ In a Linux machine, we suggest using Docker as the hypervisor, which is easy. Th [Install MiniKube - Install a Hypervisor](https://kubernetes.io/docs/tasks/tools/install-minikube/#install-a-hypervisor). It is only one command, ``` -sudo minikube start --vm-driver=none --kubernetes-version v1.19.0 +sudo minikube start --vm-driver=none --kubernetes-version v1.19.0 --cni=flannel ``` Wait a few seconds until the command finishes, then try to verify if Kubernetes installed, ``` @@ -80,7 +80,9 @@ another command. ``` sudo minikube addons enable ingress ``` -Till now, Kubernetes have been ready. + +Check the pod status by `kubectl get pods -A` +When all the pods are in the ready state, it means your Kubernetes cluster is ready. 
## Setup Kubefate ### Install KubeFATE CLI @@ -96,11 +98,11 @@ kubefate@machine:~/kubefate ls cluster-serving.yaml cluster-spark-rabbitmq.yaml cluster.yaml examples rbac-config.yaml cluster-spark-pulsar.yaml cluster-spark-slim.yaml config.yaml kubefate.yaml ``` -Move the kubefate executable binary to path, +Move the KubeFATE executable binary into a directory on your PATH, ``` chmod +x ./kubefate && sudo mv ./kubefate /usr/bin ``` -Try to verify if kubefate works, +Try to verify if the KubeFATE CLI works, ``` kubefate@machine:~/kubefate$ kubefate version * kubefate commandLine version=v1.4.5 @@ -117,7 +119,7 @@ kubefate@machine:~/kubefate$ kubefate version ``` -It is fine only the command line version shows and get an error on KubeFATE service's version because we have not +It is fine that only the command line version shows up and an error is reported for the KubeFATE service's version, because we have not deployed the KubeFATE service yet. ### Deploy KubeFATE service @@ -227,16 +229,36 @@ kubectl create namespace fate-9999 kubectl create namespace fate-10000 ``` -We have 2 preset examples in `/kubefate/examples/party-9999/` and `/kubefate/examples/party-10000`. +If you are using Docker Hub as the registry, create the Docker registry credentials in these 2 namespaces, because several images need to be pulled from it. + +``` +DOCKER_REGISTRY_SERVER=docker.io +DOCKER_USER= +DOCKER_PASSWORD= + +kubectl -n fate-9999 create secret docker-registry myregistrykey \ + --docker-server=$DOCKER_REGISTRY_SERVER \ + --docker-username=$DOCKER_USER \ + --docker-password=$DOCKER_PASSWORD + +kubectl -n fate-10000 create secret docker-registry myregistrykey \ + --docker-server=$DOCKER_REGISTRY_SERVER \ + --docker-username=$DOCKER_USER \ + --docker-password=$DOCKER_PASSWORD +``` + +We have several preset examples in `/kubefate/examples/party-9999/` and `/kubefate/examples/party-10000`.
+ +In this tutorial, we will take the Spark+Pulsar architecture as the example -For `/kubefate/examples/party-9999/cluster.yaml`, modify it as following: +For `/kubefate/examples/party-9999/cluster-spark-pulsar.yaml`, modify it as following: ``` name: fate-9999 namespace: fate-9999 chartName: fate chartVersion: v1.9.0 partyId: 9999 -registry: "hub.c.163.com/federatedai" +registry: "" pullPolicy: imagePullSecrets: - name: myregistrykey @@ -247,31 +269,34 @@ podSecurityPolicy: enabled: false ingressClassName: nginx modules: - - rollsite - - clustermanager - - nodemanager - - mysql - python + - mysql - fateboard - client + - spark + - hdfs + - nginx + - pulsar -backend: eggroll +computing: Spark +federation: Pulsar +storage: HDFS +algorithm: Basic +device: CPU ingress: fateboard: hosts: - name: party9999.fateboard.example.com - client: + client: hosts: - name: party9999.notebook.example.com - -rollsite: - type: NodePort - nodePort: 30091 - partyList: - - partyId: 10000 - partyIp: 192.168.100.123 - partyPort: 30101 + spark: + hosts: + - name: party9999.spark.example.com + pulsar: + hosts: + - name: party9999.pulsar.example.com python: type: NodePort @@ -279,8 +304,36 @@ python: grpcNodePort: 30092 logLevel: INFO -servingIp: 192.168.100.123 +servingIp: 10.182.137.144 servingPort: 30095 + +nginx: + type: NodePort + httpNodePort: 30093 + grpcNodePort: 30098 + route_table: + 10000: + fateflow: + - host: 10.182.137.144 + http_port: 30103 + grpc_port: 30108 + +pulsar: + type: NodePort + httpNodePort: 30094 + httpsNodePort: 30099 + publicLB: + enabled: false + route_table: + 9999: + host: pulsar + port: 6650 + sslPort: 6651 + 10000: + host: 10.182.137.144 + port: 30104 + sslPort: 30109 + proxy: "" ``` and for fate-10000: ``` @@ -289,42 +342,45 @@ namespace: fate-10000 chartName: fate chartVersion: v1.9.0 partyId: 10000 -registry: "hub.c.163.com/federatedai" +registry: "" pullPolicy: imagePullSecrets: - name: myregistrykey persistence: false istio: enabled: false 
+ingressClassName: nginx podSecurityPolicy: enabled: false -ingressClassName: nginx modules: - - rollsite - - clustermanager - - nodemanager - - mysql - python + - mysql - fateboard - client + - spark + - hdfs + - nginx + - pulsar -backend: eggroll +computing: Spark +federation: Pulsar +storage: HDFS +algorithm: Basic +device: CPU ingress: fateboard: hosts: - name: party10000.fateboard.example.com - client: + client: hosts: - name: party10000.notebook.example.com - -rollsite: - type: NodePort - nodePort: 30101 - partyList: - - partyId: 9999 - partyIp: 192.168.100.123 - partyPort: 30091 + spark: + hosts: + - name: party10000.spark.example.com + pulsar: + hosts: + - name: party10000.pulsar.example.com python: type: NodePort @@ -332,8 +388,36 @@ python: grpcNodePort: 30102 logLevel: INFO -servingIp: 192.168.100.123 +servingIp: 10.182.137.144 servingPort: 30105 + +nginx: + type: NodePort + httpNodePort: 30103 + grpcNodePort: 30108 + route_table: + 9999: + fateflow: + - host: 10.182.137.144 + http_port: 30093 + grpc_port: 30098 + +pulsar: + type: NodePort + httpNodePort: 30104 + httpsNodePort: 30109 + publicLB: + enabled: false + route_table: + 9999: + host: 10.182.137.144 + port: 30094 + sslPort: 30099 + proxy: "" + 10000: + host: pulsar + port: 6650 + sslPort: 6651 ``` For the two files, pay extra attention of modify the partyId to the correct number otherwise you are not able to access the notebook or the fateboard. 
@@ -345,9 +429,9 @@ https://githubcom/FederatedAI/KubeFATE/blob/master/docs/configurations/FATE_clus ### Install the FATE clusters Okay, we can start to install these two FATE cluster via KubeFATE with the following command: ``` -kubefate@machine:~/kubefate$ kubefate cluster install -f examples/party-9999/cluster.yaml +kubefate@machine:~/kubefate$ kubefate cluster install -f examples/party-9999/cluster-spark-pulsar.yaml create job success, job id=2c1d926c-bb57-43d3-9127-8cf3fc6deb4b -kubefate@machine:~/kubefate$ kubefate cluster install -f examples/party-10000/cluster.yaml +kubefate@machine:~/kubefate$ kubefate cluster install -f examples/party-10000/cluster-spark-pulsar.yaml create job success, job id=7752db70-e368-41fa-8827-d39411728d1b ``` @@ -356,7 +440,7 @@ or watch the clusters till their STATUS changing to `Running`: ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -51476469-b473-4d41-b2d5-ea7241d5eac7 fate-9999 fate-9999 1 Running fate v1.9.0 88s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.9.0 88s dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.9.0 69s ``` We have about 10G Docker images that need to be pulled, this step will take a while for the first time. 
@@ -370,111 +454,142 @@ kubectl get po -n fate-10000 When finished applying the image, the result will be similar to this, ``` -NAME READY STATUS RESTARTS AGE -clustermanager-bcfc6866d-nfs6c 1/1 Running 0 12m -mysql-c77b7b94b-zblt5 1/1 Running 0 12m -nodemanager-0-5599db57f4-2khcg 2/2 Running 0 12m -nodemanager-1-7c986f9454-qcscd 2/2 Running 0 12m -python-57b66d96bd-vj8kq 3/3 Running 0 12m -rollsite-7846898d6d-j2gb9 1/1 Running 0 12m +NAME READY STATUS RESTARTS AGE +client-0 1/1 Running 0 53m +datanode-0 1/1 Running 0 53m +datanode-1 1/1 Running 0 40m +datanode-2 1/1 Running 0 40m +mysql-0 1/1 Running 0 53m +namenode-0 1/1 Running 0 53m +nginx-75b7565846-kpj86 1/1 Running 5 53m +pulsar-0 1/1 Running 1 53m +python-0 2/2 Running 0 53m +spark-master-fc67d9b57-99sjx 1/1 Running 1 53m +spark-worker-f74f94fdb-44248 1/1 Running 1 53m +spark-worker-f74f94fdb-bx2jv 1/1 Running 1 53m ``` ### Verify the deployment From above `kubefate cluster ls` command, we know the cluster UUID of `fate-9999` is -`51476469-b473-4d41-b2d5-ea7241d5eac7`, while cluster UUID of `fate-10000` is `dacc0549-b9fc-463f-837a-4e7316db2537`. +`29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3`, while cluster UUID of `fate-10000` is `dacc0549-b9fc-463f-837a-4e7316db2537`. 
Thus, we can query there access information by: ``` -kubefate@machine:~/demo$ kubefate cluster describe 51476469-b473-4d41-b2d5-ea7241d5eac7 -UUID 51476469-b473-4d41-b2d5-ea7241d5eac7 -Name fate-9999 -NameSpace fate-9999 -ChartName fate -ChartVersion v1.9.0 -Revision 1 -Age 15h -Status Running -Spec backend: eggroll - chartName: fate - chartVersion: v1.9.0 - imagePullSecrets: - - name: myregistrykey - imageTag: 1.9.0-release - ingress: - client: - annotations: - kubernetes.io/ingress.class: nginx - hosts: - - name: party9999.notebook.example.com - fateboard: - annotations: - kubernetes.io/ingress.class: nginx - hosts: - - name: party9999.fateboard.example.com - istio: - enabled: false - modules: - - rollsite - - clustermanager - - nodemanager - - mysql - - python - - fateboard - - client - name: fate-9999 - namespace: fate-9999 - partyId: 9999 - persistence: false - podSecurityPolicy: - enabled: false - pullPolicy: null - python: - grpcNodePort: 30092 - httpNodePort: 30097 - type: NodePort - registry: "" - rollsite: - nodePort: 30091 - partyList: - - partyId: 10000 - partyIp: 10.192.173.64 - partyPort: 30101 - type: NodePort - servingIp: 10.192.173.64 - servingPort: 30095 - -Info dashboard: - - party9999.notebook.example.com - - party9999.fateboard.example.com - ip: 10.192.173.64 - port: 30091 - status: - containers: - client: Running - clustermanager: Running - fateboard: Running - mysql: Running - nodemanager-0: Running - nodemanager-0-eggrollpair: Running - nodemanager-1: Running - nodemanager-1-eggrollpair: Running - python: Running - rollsite: Running - deployments: - client: Available - clustermanager: Available - mysql: Available - nodemanager-0: Available - nodemanager-1: Available - python: Available - rollsite: Available -``` -In `Info->dashboard` field, we can see there are two dashboards in the current deployment: +kubefate@machine:~/demo$ kubefate cluster describe 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 +UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 +Name 
fate-9999 +NameSpace fate-9999 +ChartName fate +ChartVersion v1.9.0 +Revision 1 +Age 54m +Status Running +Spec algorithm: Basic + chartName: fate + chartVersion: v1.9.0 + computing: Spark + device: CPU + federation: Pulsar + imagePullSecrets: + - name: myregistrykey + ingress: + client: + hosts: + - name: party9999.notebook.example.com + fateboard: + hosts: + - name: party9999.fateboard.example.com + pulsar: + hosts: + - name: party9999.pulsar.example.com + spark: + hosts: + - name: party9999.spark.example.com + ingressClassName: nginx + istio: + enabled: false + modules: + - python + - mysql + - fateboard + - client + - spark + - hdfs + - nginx + - pulsar + name: fate-9999 + namespace: fate-9999 + nginx: + grpcNodePort: 30098 + httpNodePort: 30093 + route_table: + "10000": + fateflow: + - grpc_port: 30108 + host: 10.182.137.144 + http_port: 30103 + type: NodePort + partyId: 9999 + persistence: false + podSecurityPolicy: + enabled: false + pullPolicy: null + pulsar: + httpNodePort: 30094 + httpsNodePort: 30099 + publicLB: + enabled: false + route_table: + "9999": + host: pulsar + port: 6650 + sslPort: 6651 + "10000": + host: 10.182.137.144 + port: 30104 + proxy: "" + sslPort: 30109 + type: NodePort + python: + grpcNodePort: 30092 + httpNodePort: 30097 + logLevel: INFO + type: NodePort + registry: "" + servingIp: 10.182.137.144 + servingPort: 30095 + storage: HDFS + +Info dashboard: + - party9999.notebook.example.com + - party9999.fateboard.example.com + - party9999.pulsar.example.com + - party9999.spark.example.com + ip: 10.182.134.142 + status: + containers: + client: Running + datanode: Running + fateboard: Running + fateflow: Running + mysql: Running + namenode: Running + nginx: Running + pulsar: Running + spark-master: Running + spark-worker: Running + deployments: + nginx: Available + spark-master: Available + spark-worker: Available +``` +In `Info->dashboard` field, we can see there are 4 dashboards in the current deployment: * Notebook in 
`party9999.notebook.example.com`, which is the Jupyter Notebook integrated, where data scientists can write python or access shell. We have pre-installed FATE-clients to the Notebook. -* FATEBoard in `party9999.fateboard.example.com`, which we can use to check the status, job flows in FATE. +* FATEBoard in `party9999.fateboard.example.com`, which we can use to check the status, job flows in FATE.\ +* Pulsar in `party9999.pulsar.example.com`, which is the UI console of Pulsar, the message queue for transferring the gradients during FML. +* Spark in `party9999.spark.example.com`, which is the UI console of Spark, -With similar command, we can see that the Notebook for `fate-10000` is `party10000.notebook.example.com`, -and the FATEBoard for `fate-10000` is `party10000.fateboard.example.com`. +With a similar command, we can check the dashboards for `fate-10000`. ### (Optional) Configure the dashboards' URLs in hosts #### Note: if we have the dns service setup, this step can be skipped. @@ -536,8 +651,11 @@ Also the data output. This means that the job is successfully processed and KubeFate is running properly. ## Next Steps -1. The example showed above is the simplest of FATE's example. Please explore other Job examples in Notebook. -2. Now you have deployed your first FATE cluster based on eggroll. We also have prepared example YAML files +1. The example shown above is just one of FATE's examples. Please explore other Job examples in Notebook. +2. Now you have deployed your first FATE cluster based on Spark and Pulsar. We also have prepared example YAML files (https://github.com/FederatedAI/KubeFATE/tree/master/k8s-deploy/examples) for: * Deploy FATE-Serving - * Deploy Spark-based FATE cluster, and try different message queues: rabbitmq and pulsar. \ No newline at end of file + * Deploy an Eggroll-based FATE cluster. + * Try RabbitMQ as the message queue. + * Deploy a Spark-local+LocalFS based FATE cluster. + * Deploy FATE-Exchange to build a star-mode federation.
\ No newline at end of file diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md index 392f4921c..b0976894e 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md @@ -3,7 +3,7 @@ 我们的总体架构如下: -![总体架构](https://github.com/FederatedAI/KubeFATE/raw/master/docs/tutorials/images/goal.png) +![总体架构](../images/fate_on_spark_with_pulsar.png) 本文共出现两台机器: 1. 用来做Demo的机器,是一台Linux机器,参照前置条件第一点; @@ -22,8 +22,8 @@ export fate_version=v1.9.0 && export kubefate_version=v1.4.5 && cd ~ && mkdir de Notes: * 当我们提到"KubeFATE的版本",通常来讲会有三个概念: - * KubeFATE命令行工具的版本,在本教程中为v1.4.5,一个类似的例子是同为命令行工具的KubeCtl。 - * KubeFATE服务版本,在本教程中为v1.4.5,一个类似的例子是Kubernetes。 + * KubeFATE命令行工具的版本,在本教程中为v1.4.5。 + * KubeFATE服务版本,在本教程中为v1.4.5。 * FATE版本,在本教程中v1.9.0,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 * **下文介绍的MiniKube机器IP地址是192.168.100.123。请修改为你准备的实验机器IP地址** @@ -49,14 +49,14 @@ curl -LO https://github.com/kubernetes/minikube/releases/download/v1.19.0/miniku 验证安装结果: ``` kubefate@machine:~/demo$ minikube version -minikube version: v1.21.0 -commit: 76d74191d82c47883dc7e1319ef7cebd3e00ee11 +minikube version: v1.19.0 +commit: 15cede53bdc5fe242228853e737333b09d4336b5 ``` ### 使用MiniKube安装Kubernetes MiniKube支持使用不同的虚拟机来部署Kubernetes,但是在Linux环境下,我们建议直接使用Docker方式。更多的说明参考:[Install MiniKube - Install a Hypervisor](https://kubernetes.io/docs/tasks/tools/install-minikube/#install-a-hypervisor). 
``` -sudo minikube start --vm-driver=none --kubernetes-version v1.19.0 +sudo minikube start --vm-driver=none --kubernetes-version v1.19.0 --cni=flannel ``` 待到命令执行完成,我们可以验证下, ``` @@ -71,7 +71,9 @@ kubeconfig: Configured ``` sudo minikube addons enable ingress ``` -到此,我们的Kubernetes也准备好了。 + +使用命令`kubectl get pods -A`来查看所有pod的状态,当所有的pod的状态都变成ready的时候,表示K8s集群已经准备就绪。 + ## 安装Kubefate ### 下载KubeFATE命令行工具 @@ -90,7 +92,7 @@ cluster-spark-pulsar.yaml cluster-spark-slim.yaml config.yaml kubefate.yaml ``` chmod +x ./kubefate && sudo mv ./kubefate /usr/bin ``` -然后我们测试下kubefate命令是否可用, +然后我们测试下KubeFATE CLI命令是否可用, ``` kubefate@machine:~/kubefate$ kubefate version * kubefate commandLine version=v1.4.5 @@ -208,15 +210,36 @@ kubectl create namespace fate-9999 kubectl create namespace fate-10000 ``` -在exmaple目录下,我们已经预先设置了两个例子:`/kubefate/examples/party-9999/` 和 `/kubefate/examples/party-10000` -对于`/kubefate/examples/party-9999/cluster.yaml`,我们可以将其修改如下: +如果你是用dockerhub上的镜像仓库,可以先在相应的域名下注册登录信息,因为后面有些镜像需要从dockerhub下载。 + +``` +DOCKER_REGISTRY_SERVER=docker.io +DOCKER_USER= +DOCKER_PASSWORD= + +kubectl -n fate-9999 create secret docker-registry myregistrykey \ + --docker-server=$DOCKER_REGISTRY_SERVER \ + --docker-username=$DOCKER_USER \ + --docker-password=$DOCKER_PASSWORD + +kubectl -n fate-10000 create secret docker-registry myregistrykey \ + --docker-server=$DOCKER_REGISTRY_SERVER \ + --docker-username=$DOCKER_USER \ + --docker-password=$DOCKER_PASSWORD +``` + +在example目录下我们有一些预设的例子:`/kubefate/examples/party-9999/` 和 `/kubefate/examples/party-10000` + +本教程中我们使用Spark+Pulsar的组合来做例子。 + +对于`/kubefate/examples/party-9999/cluster-spark-pulsar.yaml`,我们可以将其修改如下: ``` name: fate-9999 namespace: fate-9999 chartName: fate chartVersion: v1.9.0 partyId: 9999 -registry: "hub.c.163.com/federatedai" +registry: "" pullPolicy: imagePullSecrets: - name: myregistrykey @@ -227,31 +250,34 @@ podSecurityPolicy: enabled: false ingressClassName: nginx modules: - - rollsite - - clustermanager - - nodemanager - - mysql -
python + - mysql - fateboard - client + - spark + - hdfs + - nginx + - pulsar -backend: eggroll +computing: Spark +federation: Pulsar +storage: HDFS +algorithm: Basic +device: CPU ingress: fateboard: hosts: - name: party9999.fateboard.example.com - client: + client: hosts: - name: party9999.notebook.example.com - -rollsite: - type: NodePort - nodePort: 30091 - partyList: - - partyId: 10000 - partyIp: 192.168.100.123 - partyPort: 30101 + spark: + hosts: + - name: party9999.spark.example.com + pulsar: + hosts: + - name: party9999.pulsar.example.com python: type: NodePort @@ -259,53 +285,84 @@ python: grpcNodePort: 30092 logLevel: INFO -servingIp: 192.168.100.123 +servingIp: 10.182.137.144 servingPort: 30095 -``` -对于`/kubefate/examples/party-10000/cluster.yaml`,我们可以将其修改如下: +nginx: + type: NodePort + httpNodePort: 30093 + grpcNodePort: 30098 + route_table: + 10000: + fateflow: + - host: 10.182.137.144 + http_port: 30103 + grpc_port: 30108 + +pulsar: + type: NodePort + httpNodePort: 30094 + httpsNodePort: 30099 + publicLB: + enabled: false + route_table: + 9999: + host: pulsar + port: 6650 + sslPort: 6651 + 10000: + host: 10.182.137.144 + port: 30104 + sslPort: 30109 + proxy: "" +``` + +对于fate-10000,我们可以将其修改如下: ``` name: fate-10000 namespace: fate-10000 chartName: fate chartVersion: v1.9.0 partyId: 10000 -registry: "hub.c.163.com/federatedai" +registry: "" pullPolicy: imagePullSecrets: - name: myregistrykey persistence: false istio: enabled: false +ingressClassName: nginx podSecurityPolicy: enabled: false -ingressClassName: nginx modules: - - rollsite - - clustermanager - - nodemanager - - mysql - python + - mysql - fateboard - client + - spark + - hdfs + - nginx + - pulsar -backend: eggroll +computing: Spark +federation: Pulsar +storage: HDFS +algorithm: Basic +device: CPU ingress: fateboard: hosts: - name: party10000.fateboard.example.com - client: + client: hosts: - name: party10000.notebook.example.com - -rollsite: - type: NodePort - nodePort: 30101 - partyList: - - 
partyId: 9999 - partyIp: 192.168.100.123 - partyPort: 30091 + spark: + hosts: + - name: party10000.spark.example.com + pulsar: + hosts: + - name: party10000.pulsar.example.com python: type: NodePort @@ -313,8 +370,36 @@ python: grpcNodePort: 30102 logLevel: INFO -servingIp: 192.168.100.123 +servingIp: 10.182.137.144 servingPort: 30105 + +nginx: + type: NodePort + httpNodePort: 30103 + grpcNodePort: 30108 + route_table: + 9999: + fateflow: + - host: 10.182.137.144 + http_port: 30093 + grpc_port: 30098 + +pulsar: + type: NodePort + httpNodePort: 30104 + httpsNodePort: 30109 + publicLB: + enabled: false + route_table: + 9999: + host: 10.182.137.144 + port: 30094 + sslPort: 30099 + proxy: "" + 10000: + host: pulsar + port: 6650 + sslPort: 6651 ``` **注意: 我们强烈建议阅读以下文档** @@ -323,9 +408,9 @@ servingPort: 30105 ### 安装FATE集群 如果一切没有问题,那就可以使用`kubefate cluster install`来部署两个fate集群了, ``` -kubefate@machine:~/kubefate$ kubefate cluster install -f examples/party-9999/cluster.yaml +kubefate@machine:~/kubefate$ kubefate cluster install -f examples/party-9999/cluster-spark-pulsar.yaml create job success, job id=2c1d926c-bb57-43d3-9127-8cf3fc6deb4b -kubefate@machine:~/kubefate$ kubefate cluster install -f examples/party-10000/cluster.yaml +kubefate@machine:~/kubefate$ kubefate cluster install -f examples/party-10000/cluster-spark-pulsar.yaml create job success, job id=7752db70-e368-41fa-8827-d39411728d1b ``` @@ -333,7 +418,7 @@ create job success, job id=7752db70-e368-41fa-8827-d39411728d1b ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -51476469-b473-4d41-b2d5-ea7241d5eac7 fate-9999 fate-9999 1 Running fate v1.9.0 88s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.9.0 88s dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.9.0 69s ``` 因为这个步骤需要到网易云镜像仓库去下载约10G的镜像,所以第一次执行视乎你的网络情况需要一定时间。 @@ -354,97 +439,123 @@ rollsite-7846898d6d-j2gb9 1/1 Running 0 12m ``` ### 
验证FATE的部署 -通过以上的 `kubefate cluster ls` 命令, 我们得到 `fate-9999` 的集群ID是 `51476469-b473-4d41-b2d5-ea7241d5eac7`, 而 `fate-10000` 的集群ID是 `dacc0549-b9fc-463f-837a-4e7316db2537`. 我们可以通过`kubefate cluster describe`查询集群的具体访问信息, -``` -kubefate@machine:~/demo$ kubefate cluster describe 51476469-b473-4d41-b2d5-ea7241d5eac7 -UUID 51476469-b473-4d41-b2d5-ea7241d5eac7 -Name fate-9999 -NameSpace fate-9999 -ChartName fate -ChartVersion v1.9.0 -Revision 1 -Age 15h -Status Running -Spec backend: eggroll - chartName: fate - chartVersion: v1.9.0 - imagePullSecrets: - - name: myregistrykey - imageTag: 1.9.0-release - ingress: - client: - annotations: - kubernetes.io/ingress.class: nginx - hosts: - - name: party9999.notebook.example.com - fateboard: - annotations: - kubernetes.io/ingress.class: nginx - hosts: - - name: party9999.fateboard.example.com - istio: - enabled: false - modules: - - rollsite - - clustermanager - - nodemanager - - mysql - - python - - fateboard - - client - name: fate-9999 - namespace: fate-9999 - partyId: 9999 - persistence: false - podSecurityPolicy: - enabled: false - pullPolicy: null - python: - grpcNodePort: 30092 - httpNodePort: 30097 - type: NodePort - registry: "" - rollsite: - nodePort: 30091 - partyList: - - partyId: 10000 - partyIp: 10.192.173.64 - partyPort: 30101 - type: NodePort - servingIp: 10.192.173.64 - servingPort: 30095 - -Info dashboard: - - party9999.notebook.example.com - - party9999.fateboard.example.com - ip: 10.192.173.64 - port: 30091 - status: - containers: - client: Running - clustermanager: Running - fateboard: Running - mysql: Running - nodemanager-0: Running - nodemanager-0-eggrollpair: Running - nodemanager-1: Running - nodemanager-1-eggrollpair: Running - python: Running - rollsite: Running - deployments: - client: Available - clustermanager: Available - mysql: Available - nodemanager-0: Available - nodemanager-1: Available - python: Available - rollsite: Available +通过以上的 `kubefate cluster ls` 命令, 我们得到 `fate-9999` 的集群ID是 
`29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3`, 而 `fate-10000` 的集群ID是 `dacc0549-b9fc-463f-837a-4e7316db2537`. 我们可以通过`kubefate cluster describe`查询集群的具体访问信息, +``` +kubefate@machine:~/demo$ kubefate cluster describe 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 +UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 +Name fate-9999 +NameSpace fate-9999 +ChartName fate +ChartVersion v1.9.0 +Revision 1 +Age 54m +Status Running +Spec algorithm: Basic + chartName: fate + chartVersion: v1.9.0 + computing: Spark + device: CPU + federation: Pulsar + imagePullSecrets: + - name: myregistrykey + ingress: + client: + hosts: + - name: party9999.notebook.example.com + fateboard: + hosts: + - name: party9999.fateboard.example.com + pulsar: + hosts: + - name: party9999.pulsar.example.com + spark: + hosts: + - name: party9999.spark.example.com + ingressClassName: nginx + istio: + enabled: false + modules: + - python + - mysql + - fateboard + - client + - spark + - hdfs + - nginx + - pulsar + name: fate-9999 + namespace: fate-9999 + nginx: + grpcNodePort: 30098 + httpNodePort: 30093 + route_table: + "10000": + fateflow: + - grpc_port: 30108 + host: 10.182.137.144 + http_port: 30103 + type: NodePort + partyId: 9999 + persistence: false + podSecurityPolicy: + enabled: false + pullPolicy: null + pulsar: + httpNodePort: 30094 + httpsNodePort: 30099 + publicLB: + enabled: false + route_table: + "9999": + host: pulsar + port: 6650 + sslPort: 6651 + "10000": + host: 10.182.137.144 + port: 30104 + proxy: "" + sslPort: 30109 + type: NodePort + python: + grpcNodePort: 30092 + httpNodePort: 30097 + logLevel: INFO + type: NodePort + registry: "" + servingIp: 10.182.137.144 + servingPort: 30095 + storage: HDFS + +Info dashboard: + - party9999.notebook.example.com + - party9999.fateboard.example.com + - party9999.pulsar.example.com + - party9999.spark.example.com + ip: 10.182.134.142 + status: + containers: + client: Running + datanode: Running + fateboard: Running + fateflow: Running + mysql: Running + namenode: Running + 
nginx: Running + pulsar: Running + spark-master: Running + spark-worker: Running + deployments: + nginx: Available + spark-master: Available + spark-worker: Available ``` 从返回的内容中,我们看到`Info->dashboard`里包含了: * Jupyter Notebook的访问地址: `party9999.notebook.example.com`。这个是我们准备让数据科学家进行建模分析的平台。已经集成了FATE-Clients; * FATEBoard的访问地址: `party9999.fateboard.example.com`。我们可以通过FATEBoard来查询当前训练的状态。 +* Pulsar的UI地址,`party9999.pulsar.example.com`,可以使用此地址查看Pulsar的UI界面。 +* Spark的UI地址,`party9999.spark.example.com`,可以使用此地址查看Spark的UI界面。 -类似的命令我们得到,`fate-10000`的Jupyter Notebook和FATEBoard地址分别是:`party10000.notebook.example.com` 以及`party10000.fateboard.example.com`。 +以此类推,我们也可以查看fate-10000的各种地址。 ### 在浏览器访问FATE集群的机器上配置相关的Host信息 **注意: 如果DNS已经配置了相关的解析,这步可以跳过** @@ -504,6 +615,8 @@ notebook截图: ## 后续 1. 上面的联邦学习例子是最简单的一个例子,请自行探索其他的例子。 -2. 本文只介绍了基于eggroll的FATE集群架构,在`https://github.com/FederatedAI/KubeFATE/tree/master/k8s-deploy/examples`目录中,我们还准备了其他的YAML文件,用来: +2. 本文只介绍了基于Spark和Pulsar的FATE集群架构,在`https://github.com/FederatedAI/KubeFATE/tree/master/k8s-deploy/examples`目录中,我们还准备了其他的YAML文件,用来: * 部署FATE-Serving。 -* 部署基于Spark的FATE集群,以及基于两种消息队列的FATE集群:rabbitmq和pulsar。 \ No newline at end of file +* 部署基于Eggroll的FATE集群。 +* 部署Rabbitmq做消息队列的FATE集群。 +* 部署FATE-Exchange并尝试星型的联邦。 \ No newline at end of file From 08ccb4bf88d925d6a6a1d84820e6298bb80ff3a0 Mon Sep 17 00:00:00 2001 From: Chen Jing Date: Tue, 30 Aug 2022 10:49:52 +0800 Subject: [PATCH 2/3] address comments Signed-off-by: Chen Jing --- ...ster_in_One_Linux_Machine_with_MiniKube.md | 20 +++++++++---------- ...r_in_One_Linux_Machine_with_MiniKube_zh.md | 18 ++++++++--------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md index 920fddd16..3d31c1678 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md +++
b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md @@ -304,7 +304,7 @@ python: grpcNodePort: 30092 logLevel: INFO -servingIp: 10.182.137.144 +servingIp: 192.168.100.123 servingPort: 30095 nginx: @@ -314,7 +314,7 @@ nginx: route_table: 10000: fateflow: - - host: 10.182.137.144 + - host: 192.168.100.123 http_port: 30103 grpc_port: 30108 @@ -330,7 +330,7 @@ pulsar: port: 6650 sslPort: 6651 10000: - host: 10.182.137.144 + host: 192.168.100.123 port: 30104 sslPort: 30109 proxy: "" @@ -388,7 +388,7 @@ python: grpcNodePort: 30102 logLevel: INFO -servingIp: 10.182.137.144 +servingIp: 192.168.100.123 servingPort: 30105 nginx: @@ -398,7 +398,7 @@ nginx: route_table: 9999: fateflow: - - host: 10.182.137.144 + - host: 192.168.100.123 http_port: 30093 grpc_port: 30098 @@ -410,7 +410,7 @@ pulsar: enabled: false route_table: 9999: - host: 10.182.137.144 + host: 192.168.100.123 port: 30094 sslPort: 30099 proxy: "" @@ -525,7 +525,7 @@ Spec algorithm: Basic "10000": fateflow: - grpc_port: 30108 - host: 10.182.137.144 + host: 192.168.100.123 http_port: 30103 type: NodePort partyId: 9999 @@ -544,7 +544,7 @@ Spec algorithm: Basic port: 6650 sslPort: 6651 "10000": - host: 10.182.137.144 + host: 192.168.100.123 port: 30104 proxy: "" sslPort: 30109 @@ -555,7 +555,7 @@ Spec algorithm: Basic logLevel: INFO type: NodePort registry: "" - servingIp: 10.182.137.144 + servingIp: 192.168.100.123 servingPort: 30095 storage: HDFS @@ -585,7 +585,7 @@ Info dashboard: In `Info->dashboard` field, we can see there are 4 dashboards in the current deployment: * Notebook in `party9999.notebook.example.com`, which is the Jupyter Notebook integrated, where data scientists can write python or access shell. We have pre-installed FATE-clients to the Notebook. -* FATEBoard in `party9999.fateboard.example.com`, which we can use to check the status, job flows in FATE.\ +* FATEBoard in `party9999.fateboard.example.com`, which we can use to check the status, job flows in FATE. 
* Pulsar in `party9999.pulsar.example.com`, which is the UI console of Pulsar, the message queue for transferring the gradients during FML. * Spark in `party9999.spark.example.com`, which is the UI console of Spark, diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md index b0976894e..be4a6ee97 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md @@ -285,7 +285,7 @@ python: grpcNodePort: 30092 logLevel: INFO -servingIp: 10.182.137.144 +servingIp: 192.168.100.123 servingPort: 30095 nginx: @@ -295,7 +295,7 @@ nginx: route_table: 10000: fateflow: - - host: 10.182.137.144 + - host: 192.168.100.123 http_port: 30103 grpc_port: 30108 @@ -311,7 +311,7 @@ pulsar: port: 6650 sslPort: 6651 10000: - host: 10.182.137.144 + host: 192.168.100.123 port: 30104 sslPort: 30109 proxy: "" @@ -370,7 +370,7 @@ python: grpcNodePort: 30102 logLevel: INFO -servingIp: 10.182.137.144 +servingIp: 192.168.100.123 servingPort: 30105 nginx: @@ -380,7 +380,7 @@ nginx: route_table: 9999: fateflow: - - host: 10.182.137.144 + - host: 192.168.100.123 http_port: 30093 grpc_port: 30098 @@ -392,7 +392,7 @@ pulsar: enabled: false route_table: 9999: - host: 10.182.137.144 + host: 192.168.100.123 port: 30094 sslPort: 30099 proxy: "" @@ -492,7 +492,7 @@ Spec algorithm: Basic "10000": fateflow: - grpc_port: 30108 - host: 10.182.137.144 + host: 192.168.100.123 http_port: 30103 type: NodePort partyId: 9999 @@ -511,7 +511,7 @@ Spec algorithm: Basic port: 6650 sslPort: 6651 "10000": - host: 10.182.137.144 + host: 192.168.100.123 port: 30104 proxy: "" sslPort: 30109 @@ -522,7 +522,7 @@ Spec algorithm: Basic logLevel: INFO type: NodePort registry: "" - servingIp: 10.182.137.144 + servingIp: 192.168.100.123 servingPort: 30095 storage: HDFS From 
908a2547403e22b728bcb1153d6d494506acd59c Mon Sep 17 00:00:00 2001 From: Chen Jing Date: Tue, 30 Aug 2022 10:56:31 +0800 Subject: [PATCH 3/3] modify ip address Signed-off-by: Chen Jing --- ...o_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md | 2 +- ...arties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md index 3d31c1678..0b32f61a9 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md @@ -564,7 +564,7 @@ Info dashboard: - party9999.fateboard.example.com - party9999.pulsar.example.com - party9999.spark.example.com - ip: 10.182.134.142 + ip: 192.168.100.124 status: containers: client: Running diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md index be4a6ee97..fba441145 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md @@ -531,7 +531,7 @@ Info dashboard: - party9999.fateboard.example.com - party9999.pulsar.example.com - party9999.spark.example.com - ip: 10.182.134.142 + ip: 192.168.100.124 status: containers: client: Running